data_increase.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. # 导入常用包
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. from PIL import Image
  6. from model_saver import save_model
  7. # 机器学习模型导入
  8. from sklearn.ensemble import RandomForestRegressor
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.metrics import mean_squared_error
  11. from sklearn.ensemble import GradientBoostingRegressor as GBSTR
  12. from sklearn.neighbors import KNeighborsRegressor
  13. from xgboost import XGBRegressor as XGBR
  14. # 导入数据处理函数
  15. from sklearn.preprocessing import StandardScaler
  16. from sklearn.preprocessing import MinMaxScaler
  17. # 导入评分函数
  18. from sklearn.metrics import r2_score
  19. from sklearn.metrics import mean_squared_error
  20. from sklearn.metrics import mean_absolute_error
  21. from sklearn.metrics import accuracy_score
  22. from sklearn.metrics import log_loss
  23. from sklearn.metrics import roc_auc_score
  24. import pickle
  25. import pandas as pd
  26. import numpy as np
  27. from sklearn.metrics import mean_squared_error
  28. from pathlib import Path
  29. ## 土壤反酸筛选数据
  30. data=pd.read_excel('model_optimize\data\data_filt.xlsx')
  31. x = data.iloc[:,1:10]
  32. print(x)
  33. # x = data.iloc[:,1:9]
  34. y = data.iloc[:,-1]
  35. y1 = data.iloc[:,-2]
  36. y2 = data.iloc[:,-1]
  37. # 绝对值
  38. y = abs(y1 - y2)
  39. print(y)
  40. # 为 x 赋予列名
  41. x.columns = [
  42. 'organic_matter', # OM g/kg
  43. 'chloride', # CL g/kg
  44. 'cec', # CEC cmol/kg
  45. 'h_concentration', # H+ cmol/kg
  46. 'hn', # HN mg/kg
  47. 'al_concentration', # Al3+ cmol/kg
  48. 'free_alumina', # Free alumina g/kg
  49. 'free_iron', # Free iron oxides g/kg
  50. 'delta_ph' # ΔpH
  51. ]
  52. y.name = 'target_ph'
# ## Precision acidity-reduction dataset (alternative input, currently disabled)
# data=pd.read_excel('model_optimize\data\Acidity_reduce.xlsx')
# x = data.iloc[:,1:]
# y = data.iloc[:,0]
# # Assign column names to x
# x.columns = [
# 'pH',
# 'OM',
# 'CL',
# 'H',
# 'Al'
# ]
# y.name = 'target'
  66. ## 数据集划分
  67. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  68. ## 模型
  69. # 随机森林回归模型
  70. rfc = RandomForestRegressor(random_state=1)
  71. # XGBR
  72. XGB = XGBR(random_state=1)
  73. # GBSTR
  74. GBST = GBSTR(random_state=1)
  75. # KNN
  76. KNN = KNeighborsRegressor(n_neighbors=4)
  77. # 增量训练:每次增加10%的训练数据
  78. increment = 0.1 # 每次增加的比例
  79. train_sizes = np.arange(0.1, 1.1, increment) # 从0.1到1.0的训练集大小
  80. r2_scores_rfc = []
  81. r2_scores_xgb = []
  82. r2_scores_gbst = []
  83. r2_scores_knn = [] # 用来记录KNN的r2_score
  84. # 对于每种训练集大小,训练模型并记录r2_score
  85. for size in train_sizes:
  86. # 计算当前训练集的大小
  87. current_size = int(size * len(Xtrain))
  88. # RandomForestRegressor
  89. rfc.fit(Xtrain[:current_size], Ytrain[:current_size])
  90. # XGBRegressor
  91. XGB.fit(Xtrain[:current_size], Ytrain[:current_size])
  92. # GBST
  93. GBST.fit(Xtrain[:current_size], Ytrain[:current_size])
  94. # KNN
  95. KNN.fit(Xtrain[:current_size], Ytrain[:current_size])
  96. # r2_score
  97. y_pred_rfc = rfc.predict(Xtest)
  98. score_rfc = r2_score(Ytest, y_pred_rfc)
  99. r2_scores_rfc.append(score_rfc)
  100. # XGBRegressor的r2_score
  101. y_pred_xgb = XGB.predict(Xtest)
  102. score_xgb = r2_score(Ytest, y_pred_xgb)
  103. r2_scores_xgb.append(score_xgb)
  104. # GBST的r2_score
  105. y_pred_gbst = GBST.predict(Xtest)
  106. score_gbst = r2_score(Ytest, y_pred_gbst)
  107. r2_scores_gbst.append(score_gbst)
  108. # KNN的r2_score
  109. y_pred_knn = KNN.predict(Xtest)
  110. score_knn = r2_score(Ytest, y_pred_knn)
  111. r2_scores_knn.append(score_knn)
  112. # 输出当前的训练进度与r2评分
  113. print(f"Training with {size * 100:.2f}% of the data:")
  114. print(f" - Random Forest R2 score: {score_rfc}")
  115. print(f" - XGB R2 score: {score_xgb}")
  116. print(f" - GBST R2 score: {score_gbst}")
  117. print(f" - KNN R2 score: {score_knn}")
  118. # 绘制R2评分随训练数据大小变化的图形
  119. import matplotlib.pyplot as plt
  120. plt.plot(train_sizes * 100, r2_scores_rfc, marker='o', label='Random Forest')
  121. plt.plot(train_sizes * 100, r2_scores_xgb, marker='x', label='XGBoost')
  122. plt.plot(train_sizes * 100, r2_scores_gbst, marker='s', label='Gradient Boosting')
  123. # plt.plot(train_sizes * 100, r2_scores_knn, marker='^', label='KNN')
  124. plt.xlabel('Training data size (%)')
  125. plt.ylabel('R2 Score')
  126. plt.title('Model Performance with Incremental Data')
  127. plt.legend()
  128. plt.grid(True)
  129. plt.show()
# Select the second half of the data (alternative plot, currently disabled)
# import matplotlib.pyplot as plt
# # Select the second half of the data
# half_index = len(train_sizes) // 2
# # Plot the second half of the data
# plt.plot(train_sizes[half_index:] * 100, r2_scores_rfc[half_index:], marker='o', label='Random Forest')
# plt.plot(train_sizes[half_index:] * 100, r2_scores_xgb[half_index:], marker='x', label='XGBoost')
# plt.plot(train_sizes[half_index:] * 100, r2_scores_gbst[half_index:], marker='s', label='Gradient Boosting')
# plt.plot(train_sizes[half_index:] * 100, r2_scores_knn[half_index:], marker='^', label='KNN')
# plt.xlabel('Training data size (%)')
# plt.ylabel('R2 Score')
# plt.title('Model Performance with Incremental Data (Second Half)')
# plt.legend()
# plt.grid(True)
# plt.show()
  142. # plt.legend()
  143. # plt.grid(True)
  144. # plt.show()