# data_increase.py — incremental-training experiment: fit several regressors
# on growing fractions of the training data and compare R2 on a fixed test set.
  1. # 导入常用包
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. # 机器学习模型导入
  6. from sklearn.ensemble import RandomForestRegressor
  7. from sklearn.model_selection import train_test_split
  8. from sklearn.metrics import mean_squared_error
  9. from sklearn.ensemble import GradientBoostingRegressor as GBSTR
  10. from sklearn.neighbors import KNeighborsRegressor
  11. from xgboost import XGBRegressor as XGBR
  12. # 导入数据处理函数
  13. from sklearn.preprocessing import StandardScaler
  14. from sklearn.preprocessing import MinMaxScaler
  15. # 导入评分函数
  16. from sklearn.metrics import r2_score
  17. from sklearn.metrics import mean_squared_error
  18. from sklearn.metrics import mean_absolute_error
  19. from sklearn.metrics import accuracy_score
  20. from sklearn.metrics import log_loss
  21. from sklearn.metrics import roc_auc_score
  22. import pickle
  23. from pathlib import Path
  24. # 定义数据集配置
  25. DATASET_CONFIGS = {
  26. # 土壤反酸数据:64个样本,9个特征(包含delta_ph),目标 105_day_ph
  27. 'soil_acid_9features': {
  28. 'file_path': 'model_optimize/data/data_filt.xlsx',
  29. 'x_columns': range(1, 10), # 9个特征(包含delta_ph)
  30. 'y_column': -1, # 105_day_ph
  31. 'feature_names': [
  32. 'organic_matter', # OM g/kg
  33. 'chloride', # CL g/kg
  34. 'cec', # CEC cmol/kg
  35. 'h_concentration', # H+ cmol/kg
  36. 'hn', # HN mg/kg
  37. 'al_concentration', # Al3+ cmol/kg
  38. 'free_alumina', # Free alumina g/kg
  39. 'free_iron', # Free iron oxides g/kg
  40. 'delta_ph' # ΔpH
  41. ],
  42. 'target_name': 'target_ph'
  43. },
  44. # 土壤反酸数据:64个样本,8个特征,目标 delta_ph
  45. 'soil_acid_8features': {
  46. 'file_path': 'model_optimize/data/data_filt - 副本.xlsx',
  47. 'x_columns': range(1, 9), # 8个特征
  48. 'y_column': -2, # delta_ph
  49. 'feature_names': [
  50. 'organic_matter', # OM g/kg
  51. 'chloride', # CL g/kg
  52. 'cec', # CEC cmol/kg
  53. 'h_concentration', # H+ cmol/kg
  54. 'hn', # HN mg/kg
  55. 'al_concentration', # Al3+ cmol/kg
  56. 'free_alumina', # Free alumina g/kg
  57. 'free_iron', # Free iron oxides g/kg
  58. ],
  59. 'target_name': 'target_ph'
  60. },
  61. # 土壤反酸数据:64个样本,8个特征,目标 delta_ph
  62. 'soil_acid_8features_original': {
  63. 'file_path': 'model_optimize/data/data_filt.xlsx',
  64. 'x_columns': range(1, 9), # 8个特征
  65. 'y_column': -2, # delta_ph
  66. 'feature_names': [
  67. 'organic_matter', # OM g/kg
  68. 'chloride', # CL g/kg
  69. 'cec', # CEC cmol/kg
  70. 'h_concentration', # H+ cmol/kg
  71. 'hn', # HN mg/kg
  72. 'al_concentration', # Al3+ cmol/kg
  73. 'free_alumina', # Free alumina g/kg
  74. 'free_iron', # Free iron oxides g/kg
  75. ],
  76. 'target_name': 'target_ph'
  77. },
  78. # 土壤反酸数据:60个样本(去除异常点),8个特征,目标 delta_ph
  79. 'soil_acid_6features': {
  80. 'file_path': 'model_optimize/data/data_reflux2.xlsx',
  81. 'x_columns': range(0, 6), # 6个特征
  82. 'y_column': -1, # delta_ph
  83. 'feature_names': [
  84. 'organic_matter', # OM g/kg
  85. 'chloride', # CL g/kg
  86. 'cec', # CEC cmol/kg
  87. 'h_concentration', # H+ cmol/kg
  88. 'hn', # HN mg/kg
  89. 'al_concentration', # Al3+ cmol/kg
  90. ],
  91. 'target_name': 'delta_ph'
  92. },
  93. # 精准降酸数据:54个样本,5个特征,目标是1/b
  94. 'acidity_reduce': {
  95. 'file_path': 'model_optimize/data/Acidity_reduce.xlsx',
  96. 'x_columns': range(1, 6), # 5个特征
  97. 'y_column': 0, # 1/b
  98. 'feature_names': [
  99. 'pH',
  100. 'OM',
  101. 'CL',
  102. 'H',
  103. 'Al'
  104. ],
  105. 'target_name': 'target'
  106. },
  107. # 精准降酸数据(数据更新):54个样本,5个特征,目标是1/b
  108. 'acidity_reduce_new': {
  109. 'file_path': 'model_optimize/data/Acidity_reduce_new.xlsx',
  110. 'x_columns': range(1, 6), # 5个特征
  111. 'y_column': 0, # 1/b
  112. 'feature_names': [
  113. 'pH',
  114. 'OM',
  115. 'CL',
  116. 'H',
  117. 'Al'
  118. ],
  119. 'target_name': 'target'
  120. }
  121. }
  122. def load_dataset(dataset_name):
  123. """
  124. 加载指定的数据集
  125. Args:
  126. dataset_name: 数据集配置名称
  127. Returns:
  128. x: 特征数据
  129. y: 目标数据
  130. """
  131. if dataset_name not in DATASET_CONFIGS:
  132. raise ValueError(f"未知的数据集名称: {dataset_name}")
  133. config = DATASET_CONFIGS[dataset_name]
  134. data = pd.read_excel(config['file_path'])
  135. x = data.iloc[:, config['x_columns']]
  136. y = data.iloc[:, config['y_column']]
  137. # 设置列名
  138. x.columns = config['feature_names']
  139. y.name = config['target_name']
  140. return x, y
  141. # 选择要使用的数据集
  142. # dataset_name = 'soil_acid_9features' # 土壤反酸数据:64个样本,9个特征(包含delta_ph),目标 105_day_ph
  143. # dataset_name = 'soil_acid_8features_original' # 土壤反酸数据:64个样本,8个特征,目标 delta_ph
  144. dataset_name = 'soil_acid_8features' # 土壤反酸数据:60个样本(去除异常点),8个特征,目标 delta_ph
  145. # dataset_name = 'soil_acid_6features' # 土壤反酸数据:34个样本,6个特征,目标 delta_ph
  146. # dataset_name = 'acidity_reduce' # 精准降酸数据:54个样本,5个特征,目标是1/b
  147. # dataset_name = 'acidity_reduce_new' # 精准降酸数据(数据更新):54个样本,5个特征,目标是1/b
  148. x, y = load_dataset(dataset_name)
  149. print("特征数据:")
  150. print(x)
  151. print("\n目标数据:")
  152. print(y)
  153. ## 数据集划分
  154. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  155. ## 模型
  156. # 随机森林回归模型
  157. rfc = RandomForestRegressor(random_state=1)
  158. # XGBR
  159. XGB = XGBR(random_state=1)
  160. # GBSTR
  161. GBST = GBSTR(random_state=1)
  162. # KNN
  163. KNN = KNeighborsRegressor(n_neighbors=2)
  164. # 增量训练:每次增加10%的训练数据
  165. increment = 0.1 # 每次增加的比例
  166. train_sizes = np.arange(0.1, 1.1, increment) # 从0.1到1.0的训练集大小
  167. r2_scores_rfc = []
  168. r2_scores_xgb = []
  169. r2_scores_gbst = []
  170. r2_scores_knn = [] # 用来记录KNN的r2_score
  171. # 对于每种训练集大小,训练模型并记录r2_score
  172. for size in train_sizes[:10]:
  173. # 计算当前训练集的大小
  174. current_size = int(size * len(Xtrain))
  175. # RandomForestRegressor
  176. rfc.fit(Xtrain[:current_size], Ytrain[:current_size])
  177. # XGBRegressor
  178. XGB.fit(Xtrain[:current_size], Ytrain[:current_size])
  179. # GBST
  180. GBST.fit(Xtrain[:current_size], Ytrain[:current_size])
  181. # KNN
  182. KNN.fit(Xtrain[:current_size], Ytrain[:current_size])
  183. # r2_score
  184. y_pred_rfc = rfc.predict(Xtest)
  185. score_rfc = r2_score(Ytest, y_pred_rfc)
  186. r2_scores_rfc.append(score_rfc)
  187. # XGBRegressor的r2_score
  188. y_pred_xgb = XGB.predict(Xtest)
  189. score_xgb = r2_score(Ytest, y_pred_xgb)
  190. r2_scores_xgb.append(score_xgb)
  191. # GBST的r2_score
  192. y_pred_gbst = GBST.predict(Xtest)
  193. score_gbst = r2_score(Ytest, y_pred_gbst)
  194. r2_scores_gbst.append(score_gbst)
  195. # KNN的r2_score
  196. y_pred_knn = KNN.predict(Xtest)
  197. score_knn = r2_score(Ytest, y_pred_knn)
  198. r2_scores_knn.append(score_knn)
  199. # 输出当前的训练进度与r2评分
  200. print(f"Training with {size * 100:.2f}% of the data:")
  201. print(f" - Random Forest R2 score: {score_rfc}")
  202. print(f" - XGB R2 score: {score_xgb}")
  203. print(f" - GBST R2 score: {score_gbst}")
  204. print(f" - KNN R2 score: {score_knn}")
  205. # 绘制R2评分随训练数据大小变化的图形
  206. import matplotlib.pyplot as plt
  207. plt.plot(train_sizes * 100, r2_scores_rfc, marker='o', label='Random Forest')
  208. plt.plot(train_sizes * 100, r2_scores_xgb, marker='x', label='XGBoost')
  209. plt.plot(train_sizes * 100, r2_scores_gbst, marker='s', label='Gradient Boosting')
  210. # plt.plot(train_sizes * 100, r2_scores_knn, marker='^', label='KNN')
  211. plt.xlabel('Training data size (%)')
  212. plt.ylabel('R2 Score')
  213. plt.title('Model Performance with Incremental Data')
  214. plt.legend()
  215. plt.grid(True)
  216. plt.show()
  217. # 打印JavaScript中需要的数据格式
  218. print("X轴数据(训练数据大小):", [f"{int(size * 100)}%" for size in train_sizes])
  219. print("Random Forest R2分数:", r2_scores_rfc)
  220. print("XGBoost R2分数:", r2_scores_xgb)
  221. print("Gradient Boosting R2分数:", r2_scores_gbst)
  222. y_pred = rfc.predict(Xtest) # 使用任意一个模型,这里以随机森林为例
  223. residuals = Ytest - y_pred
  224. plt.scatter(Ytest, y_pred, color='blue', alpha=0.5)
  225. plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], color='red', linestyle='--') # 对角线 y=x
  226. plt.xlabel('True Values')
  227. plt.ylabel('Predicted Values')
  228. plt.title('True vs Predicted Values')
  229. plt.grid(True)
  230. plt.show()
  231. # 生成 scatterData
  232. scatter_data = [[float(true), float(pred)] for true, pred in zip(Ytest, y_pred)]
  233. # 打印 scatterData(可直接复制到 JavaScript 代码中)
  234. print("scatterData = ", scatter_data)
  235. # # 保存 X_test 和 Y_test 为 CSV 文件
  236. # X_test_df = pd.DataFrame(Xtest)
  237. # Y_test_df = pd.DataFrame(Ytest)
  238. # # 将 X_test 和 Y_test 保存为 CSV 文件,方便之后加载
  239. # X_test_df.to_csv('X_test_reflux.csv', index=False)
  240. # Y_test_df.to_csv('Y_test_reflux.csv', index=False)
  241. # # 输出提示信息
  242. # print("X_test 和 Y_test 已保存为 'X_test_reduce.csv' 和 'Y_test_reduce.csv'")
  243. # 选择后半部分数据
  244. # import matplotlib.pyplot as plt
  245. # # 选择后半部分数据
  246. # half_index = len(train_sizes) // 2
  247. # # 绘制后半部分的数据
  248. # plt.plot(train_sizes[half_index:] * 100, r2_scores_rfc[half_index:], marker='o', label='Random Forest')
  249. # plt.plot(train_sizes[half_index:] * 100, r2_scores_xgb[half_index:], marker='x', label='XGBoost')
  250. # plt.plot(train_sizes[half_index:] * 100, r2_scores_gbst[half_index:], marker='s', label='Gradient Boosting')
  251. # plt.plot(train_sizes[half_index:] * 100, r2_scores_knn[half_index:], marker='^', label='KNN')
  252. # plt.xlabel('Training data size (%)')
  253. # plt.ylabel('R2 Score')
  254. # plt.title('Model Performance with Incremental Data (Second Half)')
  255. # plt.legend()
  256. # plt.grid(True)
  257. # plt.show()