"""Greedy, one-parameter-at-a-time tuning of an XGBoost regressor.

The script loads the filtered data set, holds out 20% of the rows as a test
set, and then tunes the following parameters in sequence, each stage keeping
the values chosen by the previous stages fixed:

    learning_rate -> random_state -> n_estimators -> max_depth -> gamma -> alpha

Every candidate is scored with the mean 5-fold cross-validation score of the
estimator's default scorer on the training split.  Finally the tuned model is
evaluated with cross-validation on the training split and on the held-out
test split.
"""

# Common base packages
import os
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm

# Machine-learning libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, log_loss, roc_auc_score

# XGBoost
from xgboost import XGBRegressor as XGBR

# Other tools
import pickle
from pathlib import Path

# Built with pathlib instead of a backslash string literal: the original
# 'model_optimize\data\...' relied on the invalid escape sequence '\d' and
# only resolved on Windows.
DATA_PATH = Path("model_optimize") / "data" / "data_filt.xlsx"

# Number of cross-validation folds used for every score in this script.
CV_FOLDS = 5


def _tune_parameter(name, candidates, fixed_params, features, target, label):
    """Pick the candidate value of one XGBoost parameter with the best CV score.

    Args:
        name: Keyword argument of ``XGBR`` that is being tuned.
        candidates: Indexable sequence of values to try for ``name``.
        fixed_params: Keyword arguments held constant for every candidate.
        features: Training feature matrix passed to ``cross_val_score``.
        target: Training target passed to ``cross_val_score``.
        label: Text printed in front of the selected value.

    Returns:
        A ``(best_value, best_score, scores)`` tuple where ``scores`` holds the
        mean CV score of every candidate, in candidate order.  On ties the
        first candidate reaching the best score is selected.
    """
    scores = [
        cross_val_score(
            XGBR(**fixed_params, **{name: value}), features, target, cv=CV_FOLDS
        ).mean()
        for value in candidates
    ]
    best_score = max(scores)
    best_value = candidates[scores.index(best_score)]
    print("最大5cv得分:{}".format(best_score), "{}:{}".format(label, best_value))
    return best_value, best_score, scores


# Load the data: columns 1..9 are the features, the last column is the target.
data = pd.read_excel(DATA_PATH)
x = data.iloc[:, 1:10]
y = data.iloc[:, -1]

# Give the feature columns descriptive names.
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]
y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Parameters fixed so far; each stage below adds the value it selected.
tuned_params = {}

# Learning rate
n_lr_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "learning_rate", np.arange(0.01, 0.5, 0.01), tuned_params, Xtrain, Ytrain, "n_lr_5cv"
)
tuned_params["learning_rate"] = n_lr_5cv

# Random seed
random_state_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "random_state", range(0, 200), tuned_params, Xtrain, Ytrain, "random_5cv"
)
tuned_params["random_state"] = random_state_5cv

# Number of trees
n_est_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "n_estimators", range(1, 400), tuned_params, Xtrain, Ytrain, "n_est_5cv"
)
tuned_params["n_estimators"] = n_est_5cv

# Maximum tree depth
max_depth_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "max_depth", range(1, 300), tuned_params, Xtrain, Ytrain, "max_depth_5cv"
)
tuned_params["max_depth"] = max_depth_5cv

# gamma
max_gamma_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "gamma", np.arange(0, 5, 0.05), tuned_params, Xtrain, Ytrain, "gamma_5cv"
)
tuned_params["gamma"] = max_gamma_5cv

# alpha
max_alpha_5cv, score_max_5cv, score_5cv_all = _tune_parameter(
    "alpha", np.arange(0, 5, 0.05), tuned_params, Xtrain, Ytrain, "alpha_5cv"
)
tuned_params["alpha"] = max_alpha_5cv

# Final model with every tuned parameter.
XGB = XGBR(
    learning_rate=n_lr_5cv,
    n_estimators=n_est_5cv,
    random_state=random_state_5cv,
    max_depth=max_depth_5cv,
    gamma=max_gamma_5cv,
    alpha=max_alpha_5cv,
)

# Cross-validated performance on the training split.
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=CV_FOLDS).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=CV_FOLDS)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# Fit on the full training split and evaluate on the held-out test split.
regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)