"""Model selection.

Tunes a RandomForestRegressor on the acidity-reduction data set with 5-fold
cross-validation (first the number of trees, then the maximum depth), reports
cross-validation and hold-out metrics, and saves the fitted model.
"""

## Common basic packages
import os

import numpy as np
import pandas as pd
from PIL import Image

# Machine-learning model
from sklearn.ensemble import RandomForestRegressor

## Common helper functions
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Data-processing functions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

from model_saver import save_model

# Number of cross-validation folds used for every score in this script.
CV_FOLDS = 5

# Candidate values for each tuned hyperparameter.  The same sequence is used
# for iterating and for looking up the winner, so the two cannot drift apart.
N_ESTIMATORS_CANDIDATES = range(1, 200)
MAX_DEPTH_CANDIDATES = range(1, 200)


def _search_best_param(param_name, candidates, fixed_params, X, y, cv=CV_FOLDS):
    """Return the candidate value with the highest mean cross-validation score.

    A fresh RandomForestRegressor is built for every candidate from
    ``fixed_params`` plus ``{param_name: candidate}`` and scored with
    ``cross_val_score`` using the estimator's default scorer.

    Args:
        param_name: Name of the RandomForestRegressor keyword being tuned.
        candidates: Iterable of values to try for ``param_name``.
        fixed_params: Keyword arguments shared by every candidate model.
        X: Training features.
        y: Training target.
        cv: Number of cross-validation folds.

    Returns:
        A tuple ``(best_value, best_score, scores)`` where ``scores`` holds the
        mean CV score of every candidate, in candidate order.  On ties the
        first candidate reaching the best score wins.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    candidates = list(candidates)
    scores = [
        cross_val_score(
            RandomForestRegressor(**fixed_params, **{param_name: value}),
            X,
            y,
            cv=cv,
        ).mean()
        for value in candidates
    ]
    best_score = max(scores)
    return candidates[scores.index(best_score)], best_score, scores


# ## Soil re-acidification screening data
# data=pd.read_excel(r'model_optimize\data\data_filt.xlsx')
# x = data.iloc[:,1:10]
# y = data.iloc[:,-1]
# # Assign column names to x
# x.columns = [
#     'organic_matter',    # OM g/kg
#     'chloride',          # CL g/kg
#     'cec',               # CEC cmol/kg
#     'h_concentration',   # H+ cmol/kg
#     'hn',                # HN mg/kg
#     'al_concentration',  # Al3+ cmol/kg
#     'free_alumina',      # Free alumina g/kg
#     'free_iron',         # Free iron oxides g/kg
#     'delta_ph'           # ΔpH
# ]
# y.name = 'target_ph'

## Precise acid-reduction data
# Raw string: '\d' and '\A' are not valid escape sequences, so a plain literal
# triggers a SyntaxWarning on recent Python versions.  The value is unchanged.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce_new.xlsx')
x = data.iloc[:, 1:]  # every column after the first is a feature
y = data.iloc[:, 0]   # the first column is the target

# Assign column names to x (the sheet must have exactly five feature columns).
x.columns = [
    'pH',
    'OM',
    'CL',
    'H',
    'Al',
]
y.name = 'target'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Random-seed search (disabled; the seed is fixed below)
# score_5cv_all = []
# for i in range(0, 200, 1):
#     rfc =RandomForestRegressor(random_state=i)
#     score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
#     score_5cv_all.append(score_5cv)
#     pass
# score_max_5cv = max(score_5cv_all)
# random_state_5cv = range(0, 200)[score_5cv_all.index(max(score_5cv_all))]  # seed with the best 5-fold CV score
# print("最大5cv得分:{}".format(score_max_5cv),
#       "random_5cv:{}".format(random_state_5cv))
random_state_5cv = 40

# Select the number of trees
n_est_5cv, score_max_5cv, score_5cv_all = _search_best_param(
    'n_estimators',
    N_ESTIMATORS_CANDIDATES,
    {'random_state': random_state_5cv},
    Xtrain,
    Ytrain,
)
# n_est_5cv: number of trees with the best 5-fold CV score
print("最大5cv得分:{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))

# NOTE(review): never read anywhere in this script; kept so the module's
# global names stay unchanged.
score_test_all = []

# Select the maximum depth (with the number of trees chosen above)
max_depth_5cv, score_max_5cv, score_5cv_all = _search_best_param(
    'max_depth',
    MAX_DEPTH_CANDIDATES,
    {'n_estimators': n_est_5cv, 'random_state': random_state_5cv},
    Xtrain,
    Ytrain,
)
print("最大5cv得分:{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))

# Train with the selected parameters
rfc = RandomForestRegressor(
    n_estimators=n_est_5cv,
    random_state=random_state_5cv,
    max_depth=max_depth_5cv,
)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=CV_FOLDS).mean()
# RMSE over the out-of-fold predictions on the training split.
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=CV_FOLDS)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# Fit on the full training split and evaluate on the held-out 20 %.
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)

# Save the trained model
custom_path = r'model_optimize\pkl'  # model save path (raw string: '\p' is not a valid escape)
prefix = 'rf_model_raw_'             # model file-name prefix
save_model(rfc, custom_path, prefix)