XGBR_filt.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # 导入常用基本包
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. from PIL import Image
  6. import matplotlib.pyplot as plt
  7. import matplotlib
  8. from mpl_toolkits.mplot3d import Axes3D
  9. import matplotlib.cm as cm
  10. # 机器学习相关库
  11. from sklearn.ensemble import RandomForestRegressor
  12. from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
  13. from sklearn.preprocessing import StandardScaler, MinMaxScaler
  14. from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, log_loss, roc_auc_score
  15. # 导入XGBoost
  16. from xgboost import XGBRegressor as XGBR
  17. # 其他工具
  18. import pickle
  19. from pathlib import Path
  20. # 导入数据
  21. data=pd.read_excel('model_optimize\data\data_filt.xlsx')
  22. x = data.iloc[:,1:10]
  23. y = data.iloc[:,-1]
  24. # 为 x 赋予列名
  25. x.columns = [
  26. 'organic_matter', # OM g/kg
  27. 'chloride', # CL g/kg
  28. 'cec', # CEC cmol/kg
  29. 'h_concentration', # H+ cmol/kg
  30. 'hn', # HN mg/kg
  31. 'al_concentration', # Al3+ cmol/kg
  32. 'free_alumina', # Free alumina g/kg
  33. 'free_iron', # Free iron oxides g/kg
  34. 'delta_ph' # ΔpH
  35. ]
  36. y.name = 'target_ph'
  37. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  38. #
  39. score_5cv_all = []
  40. for i in np.arange(0.01, 0.5, 0.01):
  41. XGB = XGBR(learning_rate=i)
  42. score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  43. score_5cv_all.append(score_5cv)
  44. pass
  45. score_max_5cv = max(score_5cv_all)
  46. n_lr_5cv = np.arange(0.01,0.5,0.01)[score_5cv_all.index(score_max_5cv)]
  47. print(
  48. "最大5cv得分:{}".format(score_max_5cv),
  49. "n_lr_5cv:{}".format(n_lr_5cv))
  50. # 随机种子
  51. score_5cv_all = []
  52. for i in range(0, 200, 1):
  53. XGB =XGBR(learning_rate=n_lr_5cv
  54. ,random_state=i)
  55. score_5cv =cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  56. score_5cv_all.append(score_5cv)
  57. pass
  58. score_max_5cv = max(score_5cv_all)
  59. random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]
  60. print(
  61. "最大5cv得分:{}".format(score_max_5cv),
  62. "random_5cv:{}".format(random_state_5cv))
  63. #随机树数目
  64. score_5cv_all = []
  65. for i in range(1, 400, 1):
  66. XGB = XGBR(n_estimators=i,
  67. learning_rate=n_lr_5cv,
  68. random_state=random_state_5cv)
  69. score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  70. score_5cv_all.append(score_5cv)
  71. pass
  72. score_max_5cv = max(score_5cv_all)
  73. n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)]
  74. print(
  75. "最大5cv得分:{}".format(score_max_5cv),
  76. "n_est_5cv:{}".format(n_est_5cv))
  77. score_5cv_all = []
  78. for i in range(1, 300, 1):
  79. XGB = XGBR(n_estimators=n_est_5cv,
  80. learning_rate=n_lr_5cv,
  81. random_state=random_state_5cv,
  82. max_depth=i)
  83. score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  84. score_5cv_all.append(score_5cv)
  85. pass
  86. score_max_5cv = max(score_5cv_all)
  87. max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]
  88. print(
  89. "最大5cv得分:{}".format(score_max_5cv),
  90. "max_depth_5cv:{}".format(max_depth_5cv))
  91. score_5cv_all = []
  92. for i in np.arange(0,5,0.05):
  93. XGB = XGBR(n_estimators=n_est_5cv,
  94. learning_rate=n_lr_5cv,
  95. random_state=random_state_5cv,
  96. max_depth=max_depth_5cv,
  97. gamma= i)
  98. score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  99. score_5cv_all.append(score_5cv)
  100. pass
  101. score_max_5cv = max(score_5cv_all)
  102. max_gamma_5cv = np.arange(0,5,0.05)[score_5cv_all.index(score_max_5cv)]
  103. print(
  104. "最大5cv得分:{}".format(score_max_5cv),
  105. "gamma_5cv:{}".format(max_gamma_5cv))
  106. score_5cv_all = []
  107. for i in np.arange(0,5,0.05):
  108. XGB = XGBR(n_estimators=n_est_5cv,
  109. learning_rate=n_lr_5cv,
  110. random_state=random_state_5cv,
  111. max_depth=max_depth_5cv,
  112. gamma=max_gamma_5cv,
  113. alpha=i)
  114. score_5cv = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  115. score_5cv_all.append(score_5cv)
  116. pass
  117. score_max_5cv = max(score_5cv_all)
  118. max_alpha_5cv = np.arange(0,5,0.05)[score_5cv_all.index(score_max_5cv)]
  119. print(
  120. "最大5cv得分:{}".format(score_max_5cv),
  121. "alpha_5cv:{}".format(max_alpha_5cv))
  122. XGB = XGBR(learning_rate=n_lr_5cv,n_estimators=n_est_5cv,random_state=random_state_5cv,max_depth=max_depth_5cv,gamma =max_gamma_5cv,alpha = max_alpha_5cv)
  123. CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
  124. CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
  125. rmse1 = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
  126. regressor = XGB.fit(Xtrain, Ytrain)
  127. test_predictions = regressor.predict(Xtest)
  128. score_test = regressor.score(Xtest,Ytest)
  129. rmse2 = np.sqrt(mean_squared_error(Ytest,test_predictions))
  130. print("5cv:",CV_score)
  131. print("rmse_5CV",rmse1)
  132. print("test:",score_test)
  133. print("rmse_test",rmse2)