GBSTR_filt.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. from sklearn.ensemble import GradientBoostingRegressor as GBSTR
  2. from sklearn.model_selection import cross_val_score,cross_val_predict
  3. from sklearn.model_selection import train_test_split
  4. from sklearn.preprocessing import StandardScaler
  5. from sklearn.metrics import mean_squared_error
  6. import matplotlib.pyplot as plt
  7. import numpy as np
  8. import pandas as pd
  9. # 导入数据
  10. data=pd.read_excel('model_optimize\data\data_filt.xlsx')
  11. x = data.iloc[:,1:10]
  12. y = data.iloc[:,-1]
  13. # 为 x 赋予列名
  14. x.columns = [
  15. 'organic_matter', # OM g/kg
  16. 'chloride', # CL g/kg
  17. 'cec', # CEC cmol/kg
  18. 'h_concentration', # H+ cmol/kg
  19. 'hn', # HN mg/kg
  20. 'al_concentration', # Al3+ cmol/kg
  21. 'free_alumina', # Free alumina g/kg
  22. 'free_iron', # Free iron oxides g/kg
  23. 'delta_ph' # ΔpH
  24. ]
  25. y.name = 'target_ph'
  26. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  27. score_5cv_all = []
  28. for i in np.arange(0.01, 0.5, 0.01):
  29. GBST = GBSTR(learning_rate=i)
  30. score_5cv = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  31. score_5cv_all.append(score_5cv)
  32. pass
  33. score_max_5cv = max(score_5cv_all)
  34. n_lr_5cv = np.arange(0.01,0.5,0.01)[score_5cv_all.index(score_max_5cv)]
  35. print(
  36. "最大5cv得分:{}".format(score_max_5cv),
  37. "lr_5cv:{}".format(n_lr_5cv))
  38. # 随机种子
  39. score_5cv_all = []
  40. for i in range(0, 200, 1):
  41. GBST =GBSTR(learning_rate=n_lr_5cv
  42. ,random_state=i)
  43. score_5cv =cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  44. score_5cv_all.append(score_5cv)
  45. pass
  46. score_max_5cv = max(score_5cv_all)
  47. random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]
  48. print(
  49. "最大5cv得分:{}".format(score_max_5cv),
  50. "random_state_5cv:{}".format(random_state_5cv))
  51. #随机树数目
  52. score_5cv_all = []
  53. for i in range(1, 400, 1):
  54. GBST = GBSTR(n_estimators=i,
  55. learning_rate=n_lr_5cv,
  56. random_state=random_state_5cv)
  57. score_5cv = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  58. score_5cv_all.append(score_5cv)
  59. pass
  60. score_max_5cv = max(score_5cv_all)
  61. n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)]
  62. print(
  63. "最大5cv得分:{}".format(score_max_5cv),
  64. "n_est_5cv:{}".format(n_est_5cv))
  65. score_5cv_all = []
  66. for i in range(1, 300, 1):
  67. GBST = GBSTR(n_estimators=n_est_5cv,
  68. learning_rate=n_lr_5cv,
  69. random_state=random_state_5cv,
  70. max_depth=i)
  71. score_5cv = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  72. CV_predictions = cross_val_predict(GBST, Xtrain, Ytrain, cv=5)
  73. score_5cv_all.append(score_5cv)
  74. pass
  75. score_max_5cv = max(score_5cv_all)
  76. max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]
  77. print(
  78. "最大5cv得分:{}".format(score_max_5cv),
  79. "max_depth_5cv:{}".format(max_depth_5cv))
  80. score_5cv_all = []
  81. for i in np.arange(0.01,1.0,0.05):
  82. GBST = GBSTR(n_estimators=n_est_5cv,
  83. learning_rate=n_lr_5cv,
  84. random_state=random_state_5cv,
  85. max_depth=max_depth_5cv,
  86. alpha=i)
  87. score_5cv = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  88. CV_predictions = cross_val_predict(GBST, Xtrain, Ytrain, cv=5)
  89. score_5cv_all.append(score_5cv)
  90. pass
  91. score_max_5cv = max(score_5cv_all)
  92. max_alpha_5cv = np.arange(0.01,1.0,0.05)[score_5cv_all.index(score_max_5cv)]
  93. print(
  94. "最大5cv得分:{}".format(score_max_5cv),
  95. "alpha_5cv:{}".format(max_alpha_5cv))
  96. # 固定参数
  97. GBST = GBSTR(learning_rate=n_lr_5cv,n_estimators=n_est_5cv,random_state=random_state_5cv,max_depth=max_depth_5cv,alpha = max_alpha_5cv)
  98. CV_score = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
  99. CV_predictions = cross_val_predict(GBST, Xtrain, Ytrain, cv=5)
  100. rmse1 = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
  101. regressor = GBST.fit(Xtrain, Ytrain)
  102. test_predictions = regressor.predict(Xtest)
  103. score_test = regressor.score(Xtest,Ytest)
  104. rmse2 = np.sqrt(mean_squared_error(Ytest,test_predictions))
  105. print("5cv:",CV_score)
  106. print("rmse_5CV",rmse1)
  107. print("test:",score_test)
  108. print("rmse_test",rmse2)