"""Sequential 5-fold CV tuning of a GradientBoostingRegressor.

The script loads the filtered data set, holds out 20% of the rows as a test
set, and then tunes one hyper-parameter at a time on the training split
(learning rate -> random seed -> number of trees -> max depth -> alpha).
Each stage keeps the value with the highest mean 5-fold CV score and feeds it
into the next stage.  Finally the tuned model is scored with 5-fold CV on the
training split and on the held-out test split.
"""
from sklearn.ensemble import GradientBoostingRegressor as GBSTR
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def _best_by_5cv(candidates, build_model, features, target):
    """Score every candidate with 5-fold CV and pick the best one.

    Args:
        candidates: Iterable of hyper-parameter values to try, in order.
        build_model: Callable mapping one candidate value to an unfitted
            estimator.
        features: Training feature matrix passed to ``cross_val_score``.
        target: Training target passed to ``cross_val_score``.

    Returns:
        Tuple ``(best_value, best_score, scores)`` where ``scores`` is the
        list of mean CV scores in candidate order.  On ties the first
        candidate reaching the maximum wins.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    candidates = list(candidates)
    scores = [
        cross_val_score(build_model(value), features, target, cv=5).mean()
        for value in candidates
    ]
    best_score = max(scores)
    return candidates[scores.index(best_score)], best_score, scores


# Load the data.  Raw string: the Windows-style path contains "\d", which is
# an invalid escape sequence in a normal string literal.
data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')
# Columns 1..9 are the predictors, the last column is the target.  Explicit
# copies so the renames below operate on independent objects.
x = data.iloc[:, 1:10].copy()
y = data.iloc[:, -1].copy()

# Assign column names to x
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]
y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Learning rate
n_lr_5cv, score_max_5cv, score_5cv_all = _best_by_5cv(
    np.arange(0.01, 0.5, 0.01),
    lambda lr: GBSTR(learning_rate=lr),
    Xtrain,
    Ytrain,
)
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "lr_5cv:{}".format(n_lr_5cv))

# Random seed
random_state_5cv, score_max_5cv, score_5cv_all = _best_by_5cv(
    range(0, 200),
    lambda seed: GBSTR(learning_rate=n_lr_5cv, random_state=seed),
    Xtrain,
    Ytrain,
)
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "random_state_5cv:{}".format(random_state_5cv))

# Number of trees
n_est_5cv, score_max_5cv, score_5cv_all = _best_by_5cv(
    range(1, 400),
    lambda n_trees: GBSTR(n_estimators=n_trees,
                          learning_rate=n_lr_5cv,
                          random_state=random_state_5cv),
    Xtrain,
    Ytrain,
)
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "n_est_5cv:{}".format(n_est_5cv))

# Maximum tree depth
max_depth_5cv, score_max_5cv, score_5cv_all = _best_by_5cv(
    range(1, 300),
    lambda depth: GBSTR(n_estimators=n_est_5cv,
                        learning_rate=n_lr_5cv,
                        random_state=random_state_5cv,
                        max_depth=depth),
    Xtrain,
    Ytrain,
)
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "max_depth_5cv:{}".format(max_depth_5cv))

# Alpha
# NOTE(review): per the scikit-learn docs `alpha` is only used by the 'huber'
# and 'quantile' losses; with the default loss this sweep is expected to give
# identical scores for every candidate -- confirm whether loss='huber' was
# intended.
max_alpha_5cv, score_max_5cv, score_5cv_all = _best_by_5cv(
    np.arange(0.01, 1.0, 0.05),
    lambda alpha: GBSTR(n_estimators=n_est_5cv,
                        learning_rate=n_lr_5cv,
                        random_state=random_state_5cv,
                        max_depth=max_depth_5cv,
                        alpha=alpha),
    Xtrain,
    Ytrain,
)
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "alpha_5cv:{}".format(max_alpha_5cv))

# Fix the tuned parameters and evaluate the final model
GBST = GBSTR(learning_rate=n_lr_5cv,
             n_estimators=n_est_5cv,
             random_state=random_state_5cv,
             max_depth=max_depth_5cv,
             alpha=max_alpha_5cv)
CV_score = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
# Out-of-fold predictions are only needed here, for the CV RMSE.
CV_predictions = cross_val_predict(GBST, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

regressor = GBST.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)