# data_increase_5cv_score.py
  1. # 导入常用包
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. from PIL import Image
  6. from model_saver import save_model
  7. # 机器学习模型导入
  8. from sklearn.ensemble import RandomForestRegressor
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.metrics import mean_squared_error
  11. from sklearn.ensemble import GradientBoostingRegressor as GBSTR
  12. from sklearn.neighbors import KNeighborsRegressor
  13. from xgboost import XGBRegressor as XGBR
  14. # 导入数据处理函数
  15. from sklearn.preprocessing import StandardScaler
  16. from sklearn.preprocessing import MinMaxScaler
  17. # 导入评分函数
  18. from sklearn.metrics import r2_score
  19. from sklearn.metrics import mean_squared_error
  20. from sklearn.metrics import mean_absolute_error
  21. from sklearn.metrics import accuracy_score
  22. from sklearn.metrics import log_loss
  23. from sklearn.metrics import roc_auc_score
  24. from sklearn.model_selection import cross_val_score
  25. import pickle
  26. import pandas as pd
  27. import numpy as np
  28. from sklearn.metrics import mean_squared_error
  29. from pathlib import Path
  30. # 导入数据
  31. # data = pd.read_excel('model_optimize/data/data_filt.xlsx')
  32. data = pd.read_excel('data/data_filt.xlsx')
  33. x = data.iloc[:, 1:10]
  34. y = data.iloc[:, -1]
  35. # 为 x 赋予列名
  36. x.columns = [
  37. 'organic_matter', # OM g/kg
  38. 'chloride', # CL g/kg
  39. 'cec', # CEC cmol/kg
  40. 'h_concentration', # H+ cmol/kg
  41. 'hn', # HN mg/kg
  42. 'al_concentration', # Al3+ cmol/kg
  43. 'free_alumina', # Free alumina g/kg
  44. 'free_iron', # Free iron oxides g/kg
  45. 'delta_ph' # ΔpH
  46. ]
  47. y.name = 'target_ph'
  48. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  49. ## 模型
  50. # 随机森林回归模型
  51. rfc = RandomForestRegressor(random_state=1)
  52. # XGBR
  53. XGB = XGBR(random_state=1)
  54. # GBSTR
  55. GBST = GBSTR(random_state=1)
  56. # KNN
  57. KNN = KNeighborsRegressor(n_neighbors=5)
  58. # 增量训练:每次增加10%的训练数据
  59. increment = 0.1 # 每次增加的比例
  60. train_sizes = np.arange(0.1, 1.1, increment) # 从0.1到1.0的训练集大小
  61. # 记录各模型交叉验证评分
  62. cv5_scores_rfc = []
  63. cv5_scores_xgb = []
  64. cv5_scores_gbst = []
  65. cv5_scores_knn = []
  66. # 对于每种训练集大小,训练模型并记录r2_score
  67. for size in train_sizes:
  68. # 计算当前训练集的大小
  69. current_size = int(size * len(Xtrain))
  70. # 交叉验证评分
  71. score_rfc = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
  72. cv5_scores_rfc.append(score_rfc)
  73. # XGBRegressor的交叉验证评分
  74. score_xgb = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  75. cv5_scores_xgb.append(score_xgb)
  76. # GBST的交叉验证评分
  77. score_gbst = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  78. cv5_scores_gbst.append(score_gbst)
  79. # KNN的交叉验证评分
  80. score_knn = cross_val_score(KNN, Xtrain, Ytrain, cv=5).mean()
  81. cv5_scores_knn.append(score_knn)
  82. # 输出当前的训练进度与评分
  83. print(f"Training with {size * 100:.2f}% of the data:")
  84. print(f" - Random Forest CV5 score: {score_rfc}")
  85. print(f" - XGB CV5 score: {score_xgb}")
  86. print(f" - GBST CV5 score: {score_gbst}")
  87. print(f" - KNN CV5 score: {score_knn}")
  88. # 绘制R2评分随训练数据大小变化的图形
  89. import matplotlib.pyplot as plt
  90. plt.plot(train_sizes * 100, cv5_scores_rfc, marker='o', label='Random Forest')
  91. plt.plot(train_sizes * 100, cv5_scores_xgb, marker='x', label='XGBoost')
  92. plt.plot(train_sizes * 100, cv5_scores_gbst, marker='s', label='Gradient Boosting')
  93. plt.plot(train_sizes * 100, cv5_scores_knn, marker='^', label='KNN')
  94. plt.xlabel('Training data size (%)')
  95. plt.ylabel('CV5 Score')
  96. plt.title('Model Performance with Incremental Data')
  97. plt.legend()
  98. plt.grid(True)
  99. plt.show()