# data_increase.py — incremental-data training experiment (pasted copy; page-number artifacts removed)
  1. # 导入常用包
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. from PIL import Image
  6. from model_saver import save_model
  7. # 机器学习模型导入
  8. from sklearn.ensemble import RandomForestRegressor
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.metrics import mean_squared_error
  11. from sklearn.ensemble import GradientBoostingRegressor as GBSTR
  12. from sklearn.neighbors import KNeighborsRegressor
  13. from xgboost import XGBRegressor as XGBR
  14. # 导入数据处理函数
  15. from sklearn.preprocessing import StandardScaler
  16. from sklearn.preprocessing import MinMaxScaler
  17. # 导入评分函数
  18. from sklearn.metrics import r2_score
  19. from sklearn.metrics import mean_squared_error
  20. from sklearn.metrics import mean_absolute_error
  21. from sklearn.metrics import accuracy_score
  22. from sklearn.metrics import log_loss
  23. from sklearn.metrics import roc_auc_score
  24. import pickle
  25. import pandas as pd
  26. import numpy as np
  27. from sklearn.metrics import mean_squared_error
  28. from pathlib import Path
  29. # ## 土壤反酸筛选数据 64个样本 9个特征(包含delta_ph) 105_day_ph
  30. # data=pd.read_excel('model_optimize\data\data_filt.xlsx')
  31. # x = data.iloc[:,1:10]
  32. # print(x)
  33. # y = data.iloc[:,-1]
  34. # print(y)
  35. # # 为 x 赋予列名
  36. # x.columns = [
  37. # 'organic_matter', # OM g/kg
  38. # 'chloride', # CL g/kg
  39. # 'cec', # CEC cmol/kg
  40. # 'h_concentration', # H+ cmol/kg
  41. # 'hn', # HN mg/kg
  42. # 'al_concentration', # Al3+ cmol/kg
  43. # 'free_alumina', # Free alumina g/kg
  44. # 'free_iron', # Free iron oxides g/kg
  45. # 'delta_ph' # ΔpH
  46. # ]
  47. # y.name = 'target_ph'
  48. # ## 土壤反酸筛选数据 64个样本 8个特征 delta_ph
  49. # # data=pd.read_excel('model_optimize\data\data_filt.xlsx') # 64个样本
  50. # data=pd.read_excel('model_optimize\data\data_filt - 副本.xlsx') # 60个样本(去除异常点)
  51. # x = data.iloc[:,1:9]
  52. # print(x)
  53. # y = data.iloc[:,-2]
  54. # print(y)
  55. # # 为 x 赋予列名
  56. # x.columns = [
  57. # 'organic_matter', # OM g/kg
  58. # 'chloride', # CL g/kg
  59. # 'cec', # CEC cmol/kg
  60. # 'h_concentration', # H+ cmol/kg
  61. # 'hn', # HN mg/kg
  62. # 'al_concentration', # Al3+ cmol/kg
  63. # 'free_alumina', # Free alumina g/kg
  64. # 'free_iron', # Free iron oxides g/kg
  65. # ]
  66. # y.name = 'target_ph'
  67. ## 土壤反酸筛选数据 最新数据:34个样本 6个特征 delta_ph
  68. data=pd.read_excel('model_optimize\data\data_reflux2.xlsx')
  69. x = data.iloc[:,0:6]
  70. print(x)
  71. y = data.iloc[:,-1]
  72. print(y)
  73. # 为 x 赋予列名
  74. x.columns = [
  75. 'organic_matter', # OM g/kg
  76. 'chloride', # CL g/kg
  77. 'cec', # CEC cmol/kg
  78. 'h_concentration', # H+ cmol/kg
  79. 'hn', # HN mg/kg ....
  80. 'al_concentration', # Al3+ cmol/kg
  81. ]
  82. y.name = 'delta_ph'
  83. # ## 精准降酸数据 54个样本 5个特征 1/b
  84. # data=pd.read_excel('model_optimize\data\Acidity_reduce.xlsx')
  85. # x = data.iloc[:,1:]
  86. # y = data.iloc[:,0]
  87. # # 为 x 赋予列名
  88. # x.columns = [
  89. # 'pH',
  90. # 'OM',
  91. # 'CL',
  92. # 'H',
  93. # 'Al'
  94. # ]
  95. # y.name = 'target'
  96. ## 数据集划分
  97. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  98. ## 模型
  99. # 随机森林回归模型
  100. rfc = RandomForestRegressor(random_state=1)
  101. # XGBR
  102. XGB = XGBR(random_state=1)
  103. # GBSTR
  104. GBST = GBSTR(random_state=1)
  105. # KNN
  106. KNN = KNeighborsRegressor(n_neighbors=2)
  107. # 增量训练:每次增加10%的训练数据
  108. increment = 0.1 # 每次增加的比例
  109. train_sizes = np.arange(0.1, 1.1, increment) # 从0.1到1.0的训练集大小
  110. r2_scores_rfc = []
  111. r2_scores_xgb = []
  112. r2_scores_gbst = []
  113. r2_scores_knn = [] # 用来记录KNN的r2_score
  114. # 对于每种训练集大小,训练模型并记录r2_score
  115. for size in train_sizes:
  116. # 计算当前训练集的大小
  117. current_size = int(size * len(Xtrain))
  118. # RandomForestRegressor
  119. rfc.fit(Xtrain[:current_size], Ytrain[:current_size])
  120. # XGBRegressor
  121. XGB.fit(Xtrain[:current_size], Ytrain[:current_size])
  122. # GBST
  123. GBST.fit(Xtrain[:current_size], Ytrain[:current_size])
  124. # KNN
  125. KNN.fit(Xtrain[:current_size], Ytrain[:current_size])
  126. # r2_score
  127. y_pred_rfc = rfc.predict(Xtest)
  128. score_rfc = r2_score(Ytest, y_pred_rfc)
  129. r2_scores_rfc.append(score_rfc)
  130. # XGBRegressor的r2_score
  131. y_pred_xgb = XGB.predict(Xtest)
  132. score_xgb = r2_score(Ytest, y_pred_xgb)
  133. r2_scores_xgb.append(score_xgb)
  134. # GBST的r2_score
  135. y_pred_gbst = GBST.predict(Xtest)
  136. score_gbst = r2_score(Ytest, y_pred_gbst)
  137. r2_scores_gbst.append(score_gbst)
  138. # KNN的r2_score
  139. y_pred_knn = KNN.predict(Xtest)
  140. score_knn = r2_score(Ytest, y_pred_knn)
  141. r2_scores_knn.append(score_knn)
  142. # 输出当前的训练进度与r2评分
  143. print(f"Training with {size * 100:.2f}% of the data:")
  144. print(f" - Random Forest R2 score: {score_rfc}")
  145. print(f" - XGB R2 score: {score_xgb}")
  146. print(f" - GBST R2 score: {score_gbst}")
  147. print(f" - KNN R2 score: {score_knn}")
  148. # 绘制R2评分随训练数据大小变化的图形
  149. import matplotlib.pyplot as plt
  150. plt.plot(train_sizes * 100, r2_scores_rfc, marker='o', label='Random Forest')
  151. plt.plot(train_sizes * 100, r2_scores_xgb, marker='x', label='XGBoost')
  152. plt.plot(train_sizes * 100, r2_scores_gbst, marker='s', label='Gradient Boosting')
  153. # plt.plot(train_sizes * 100, r2_scores_knn, marker='^', label='KNN')
  154. plt.xlabel('Training data size (%)')
  155. plt.ylabel('R2 Score')
  156. plt.title('Model Performance with Incremental Data')
  157. plt.legend()
  158. plt.grid(True)
  159. plt.show()
  160. # # 打印JavaScript中需要的数据格式
  161. # print("X轴数据(训练数据大小):", [f"{int(size * 100)}%" for size in train_sizes])
  162. # print("Random Forest R2分数:", r2_scores_rfc)
  163. # print("XGBoost R2分数:", r2_scores_xgb)
  164. # print("Gradient Boosting R2分数:", r2_scores_gbst)
  165. y_pred = rfc.predict(Xtest) # 使用任意一个模型,这里以随机森林为例
  166. residuals = Ytest - y_pred
  167. plt.scatter(Ytest, y_pred, color='blue', alpha=0.5)
  168. plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], color='red', linestyle='--') # 对角线 y=x
  169. plt.xlabel('True Values')
  170. plt.ylabel('Predicted Values')
  171. plt.title('True vs Predicted Values')
  172. plt.grid(True)
  173. plt.show()
  174. # # 生成 scatterData
  175. # scatter_data = [[float(true), float(pred)] for true, pred in zip(Ytest, y_pred)]
  176. # # 打印 scatterData(可直接复制到 JavaScript 代码中)
  177. # print("scatterData = ", scatter_data)
  178. # 保存 X_test 和 Y_test 为 CSV 文件
  179. X_test_df = pd.DataFrame(Xtest)
  180. Y_test_df = pd.DataFrame(Ytest)
  181. # 将 X_test 和 Y_test 保存为 CSV 文件,方便之后加载
  182. X_test_df.to_csv('X_test_reflux.csv', index=False)
  183. Y_test_df.to_csv('Y_test_reflux.csv', index=False)
  184. # 输出提示信息
  185. print("X_test 和 Y_test 已保存为 'X_test_reduce.csv' 和 'Y_test_reduce.csv'")
  186. # 选择后半部分数据
  187. # import matplotlib.pyplot as plt
  188. # # 选择后半部分数据
  189. # half_index = len(train_sizes) // 2
  190. # # 绘制后半部分的数据
  191. # plt.plot(train_sizes[half_index:] * 100, r2_scores_rfc[half_index:], marker='o', label='Random Forest')
  192. # plt.plot(train_sizes[half_index:] * 100, r2_scores_xgb[half_index:], marker='x', label='XGBoost')
  193. # plt.plot(train_sizes[half_index:] * 100, r2_scores_gbst[half_index:], marker='s', label='Gradient Boosting')
  194. # plt.plot(train_sizes[half_index:] * 100, r2_scores_knn[half_index:], marker='^', label='KNN')
  195. # plt.xlabel('Training data size (%)')
  196. # plt.ylabel('R2 Score')
  197. # plt.title('Model Performance with Incremental Data (Second Half)')
  198. # plt.legend()
  199. # plt.grid(True)
  200. # plt.show()