# RF_filt.py (4.3 KB)
# '''
# Model screening
# '''
  4. ## 导入常用基本包
  5. import os
  6. import pandas as pd
  7. import numpy as np
  8. from PIL import Image
  9. from model_saver import save_model
  10. # 机器学习模型导入
  11. from sklearn.ensemble import RandomForestRegressor
  12. from sklearn.model_selection import cross_val_score,cross_val_predict
  13. from sklearn.model_selection import train_test_split
  14. from sklearn.metrics import mean_squared_error
  15. ## 导入常用辅助函数
  16. from sklearn.model_selection import train_test_split
  17. from sklearn.model_selection import GridSearchCV
  18. from sklearn.model_selection import cross_val_score
  19. from sklearn.model_selection import cross_val_predict
  20. ## 导入数据处理函数
  21. from sklearn.preprocessing import StandardScaler
  22. from sklearn.preprocessing import MinMaxScaler
  23. ## 导入评分函数
  24. from sklearn.metrics import r2_score
  25. from sklearn.metrics import mean_squared_error
  26. from sklearn.metrics import mean_absolute_error
  27. from sklearn.metrics import accuracy_score
  28. from sklearn.metrics import log_loss
  29. from sklearn.metrics import roc_auc_score
# ## Soil re-acidification screening data (disabled)
  31. # data=pd.read_excel('model_optimize\data\data_filt.xlsx')
  32. # x = data.iloc[:,1:10]
  33. # y = data.iloc[:,-1]
  34. # # 为 x 赋予列名
  35. # x.columns = [
  36. # 'organic_matter', # OM g/kg
  37. # 'chloride', # CL g/kg
  38. # 'cec', # CEC cmol/kg
  39. # 'h_concentration', # H+ cmol/kg
  40. # 'hn', # HN mg/kg
  41. # 'al_concentration', # Al3+ cmol/kg
  42. # 'free_alumina', # Free alumina g/kg
  43. # 'free_iron', # Free iron oxides g/kg
  44. # 'delta_ph' # ΔpH
  45. # ]
  46. # y.name = 'target_ph'
  47. ## 精准降酸数据
  48. data=pd.read_excel('model_optimize\data\Acidity_reduce_new.xlsx')
  49. x = data.iloc[:,1:]
  50. y = data.iloc[:,0]
  51. # 为 x 赋予列名
  52. x.columns = [
  53. 'pH',
  54. 'OM',
  55. 'CL',
  56. 'H',
  57. 'Al'
  58. ]
  59. y.name = 'target'
  60. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
# Screen random seeds (disabled; a fixed seed is assigned after this block)
  62. # score_5cv_all = []
  63. # for i in range(0, 200, 1):
  64. # rfc =RandomForestRegressor(random_state=i)
  65. # score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
  66. # score_5cv_all.append(score_5cv)
  67. # pass
  68. # score_max_5cv = max(score_5cv_all)
  69. # random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))] # 5cv最大得分对应的随机种子
  70. # print("最大5cv得分:{}".format(score_max_5cv),
  71. # "random_5cv:{}".format(random_state_5cv))
  72. random_state_5cv = 40
  73. # 筛选随机树数目
  74. score_5cv_all = []
  75. for i in range(1, 200, 1):
  76. rfc = RandomForestRegressor(n_estimators=i, random_state=random_state_5cv)
  77. score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
  78. score_5cv_all.append(score_5cv)
  79. pass
  80. score_max_5cv = max(score_5cv_all)
  81. n_est_5cv = range(1,400)[score_5cv_all.index(score_max_5cv)] # 5cv最大得分对应的树数目
  82. print("最大5cv得分:{}".format(score_max_5cv),
  83. "n_est_5cv:{}".format(n_est_5cv)) # 5cv最大得分对应的树数目??
  84. score_test_all = []
  85. # 筛选最大深度
  86. score_5cv_all = []
  87. for i in range(1, 200, 1):
  88. rfc = RandomForestRegressor(n_estimators=n_est_5cv
  89. , random_state=random_state_5cv
  90. , max_depth=i)
  91. score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
  92. score_5cv_all.append(score_5cv)
  93. pass
  94. score_max_5cv = max(score_5cv_all)
  95. max_depth_5cv = range(1,300)[score_5cv_all.index(score_max_5cv)]
  96. print("最大5cv得分:{}".format(score_max_5cv),
  97. "max_depth_5cv:{}".format(max_depth_5cv))
  98. # 确定参数进行训练
  99. rfc = RandomForestRegressor(n_estimators=n_est_5cv,random_state=random_state_5cv,max_depth=max_depth_5cv)
  100. CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
  101. CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
  102. rmse1 = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
  103. regressor = rfc.fit(Xtrain, Ytrain)
  104. test_predictions = regressor.predict(Xtest)
  105. score_test = regressor.score(Xtest,Ytest)
  106. rmse2 = np.sqrt(mean_squared_error(Ytest,test_predictions))
  107. print("5cv:",CV_score)
  108. print("rmse_5CV",rmse1)
  109. print("test:",score_test)
  110. print("rmse_test",rmse2)
  111. # 保存训练好的模型
  112. custom_path='model_optimize\pkl' # 模型保存路径
  113. prefix='rf_model_raw_' # 模型文件名前缀
  114. save_model(rfc, custom_path, prefix)