"""Random-forest model selection script.

Reads ``data_filt.xlsx``, then tunes ``random_state``, ``n_estimators`` and
``max_depth`` of a ``RandomForestRegressor`` one after another, each time
keeping the value with the highest mean 5-fold cross-validation score on the
training split.  Finally it reports CV and hold-out metrics and hands the
fitted model to ``save_model``.
"""

# Common base packages
import os

import numpy as np
import pandas as pd
from PIL import Image

# Machine-learning model, helper, preprocessing and scoring imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from model_saver import save_model


def _best_by_cv(candidates, build_model, features, targets, folds=5):
    """Score one estimator per candidate and pick the best by mean CV score.

    Args:
        candidates: Iterable of hyper-parameter values to try.
        build_model: Callable mapping one candidate value to an unfitted
            estimator.
        features: Training features passed to ``cross_val_score``.
        targets: Training targets passed to ``cross_val_score``.
        folds: Value forwarded as ``cv`` to ``cross_val_score``.

    Returns:
        A ``(best_candidate, best_score, scores)`` tuple where ``scores``
        holds the mean CV score of every candidate, in candidate order.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    candidates = list(candidates)
    if not candidates:
        raise ValueError("candidates must contain at least one value")
    scores = [
        cross_val_score(build_model(value), features, targets, cv=folds).mean()
        for value in candidates
    ]
    best_score = max(scores)
    # list.index returns the first maximum, so ties resolve to the earliest
    # (smallest) candidate, exactly like the original inline lookups.
    return candidates[scores.index(best_score)], best_score, scores


# Load the data.  os.path.join avoids the invalid "\d" / "\p" escape sequences
# of the former backslash literals; on Windows the resulting string is the same.
data = pd.read_excel(os.path.join('model_optimize', 'data', 'data_filt.xlsx'))
x = data.iloc[:, 1:10]
y = data.iloc[:, -1]

# Assign column names to x
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]
y.name = 'target_ph'

# NOTE(review): no random_state is passed here, so the train/test split (and
# therefore every score below) differs from run to run -- confirm intended.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2)

# Select the random seed: the one with the highest mean 5-fold CV score
random_state_5cv, score_max_5cv, score_5cv_all = _best_by_cv(
    range(0, 200),
    lambda seed: RandomForestRegressor(random_state=seed),
    Xtrain,
    Ytrain,
)
print("最大5cv得分:{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))

# Select the number of trees with the chosen seed held fixed
n_est_5cv, score_max_5cv, score_5cv_all = _best_by_cv(
    range(1, 400),
    lambda n_trees: RandomForestRegressor(
        n_estimators=n_trees, random_state=random_state_5cv
    ),
    Xtrain,
    Ytrain,
)
print("最大5cv得分:{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))

# Select the maximum depth with seed and tree count held fixed
max_depth_5cv, score_max_5cv, score_5cv_all = _best_by_cv(
    range(1, 300),
    lambda depth: RandomForestRegressor(
        n_estimators=n_est_5cv,
        random_state=random_state_5cv,
        max_depth=depth,
    ),
    Xtrain,
    Ytrain,
)
print("最大5cv得分:{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))

# Train with the selected parameters
rfc = RandomForestRegressor(
    n_estimators=n_est_5cv,
    random_state=random_state_5cv,
    max_depth=max_depth_5cv,
)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# fit() is called on rfc itself, so the model saved below is the fitted one
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)

# Save the trained model
custom_path = os.path.join('model_optimize', 'pkl')  # model output directory
prefix = 'rf_model_'  # model file-name prefix
save_model(rfc, custom_path, prefix)