|
@@ -0,0 +1,148 @@
|
|
|
+# 导入常用基本包
|
|
|
+import os
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from PIL import Image
|
|
|
+import matplotlib.pyplot as plt
|
|
|
+import matplotlib
|
|
|
+from mpl_toolkits.mplot3d import Axes3D
|
|
|
+import matplotlib.cm as cm
|
|
|
+
|
|
|
+# 机器学习相关库
|
|
|
+from sklearn.ensemble import RandomForestRegressor
|
|
|
+from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
|
|
|
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
|
|
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, log_loss, roc_auc_score
|
|
|
+
|
|
|
+# 导入XGBoost
|
|
|
+from xgboost import XGBRegressor as XGBR
|
|
|
+
|
|
|
+# 其他工具
|
|
|
+import pickle
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
# Load the dataset.
# The path is assembled with pathlib instead of a backslash-separated string
# literal: the old literal contained the invalid escape sequence "\d" and only
# resolved to nested directories on Windows.
data_path = Path('model_optimize') / 'data' / 'data_filt.xlsx'
data = pd.read_excel(data_path)

# Features are the columns at positions 1..9; the target is the last column.
# .copy() detaches both from `data` so the renames below act on independent
# objects rather than on slices of the original frame.
x = data.iloc[:, 1:10].copy()
y = data.iloc[:, -1].copy()

# Give the feature columns descriptive names.
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]

y.name = 'target_ph'

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
|
|
|
+
|
|
|
# Step 1: scan the learning rate and keep the value whose mean 5-fold CV score
# (estimator's default scorer) on the training split is highest.
lr_grid = np.arange(0.01, 0.5, 0.01)
score_5cv_all = [
    cross_val_score(XGBR(learning_rate=lr), Xtrain, Ytrain, cv=5).mean()
    for lr in lr_grid
]

score_max_5cv = max(score_5cv_all)
# .index() returns the first position of the maximum, so ties resolve to the
# smallest learning rate.
best_lr_pos = score_5cv_all.index(score_max_5cv)
n_lr_5cv = lr_grid[best_lr_pos]
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "n_lr_5cv:{}".format(n_lr_5cv))
|
|
|
+
|
|
|
# Step 2: with the learning rate fixed, scan random seeds 0..199 and keep the
# seed with the highest mean 5-fold CV score.
# NOTE(review): unless row/column subsampling is enabled the seed may not
# change the fitted model at all — confirm this scan is worthwhile.
seed_candidates = range(0, 200)
score_5cv_all = [
    cross_val_score(
        XGBR(learning_rate=n_lr_5cv, random_state=seed),
        Xtrain,
        Ytrain,
        cv=5,
    ).mean()
    for seed in seed_candidates
]

score_max_5cv = max(score_5cv_all)
# First occurrence of the maximum wins, i.e. the smallest seed among ties.
random_state_5cv = seed_candidates[score_5cv_all.index(score_max_5cv)]
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "random_5cv:{}".format(random_state_5cv))
|
|
|
+
|
|
|
# Step 3: scan the number of boosted trees (1..399) with the learning rate and
# seed chosen above, keeping the count with the best mean 5-fold CV score.
tree_counts = range(1, 400)
score_5cv_all = []
for n_trees in tree_counts:
    candidate = XGBR(
        n_estimators=n_trees,
        learning_rate=n_lr_5cv,
        random_state=random_state_5cv,
    )
    score_5cv_all.append(cross_val_score(candidate, Xtrain, Ytrain, cv=5).mean())

score_max_5cv = max(score_5cv_all)
# Ties resolve to the smallest tree count (first occurrence of the maximum).
n_est_5cv = tree_counts[score_5cv_all.index(score_max_5cv)]

print(
    "最大5cv得分:{}".format(score_max_5cv),
    "n_est_5cv:{}".format(n_est_5cv))
|
|
|
+
|
|
|
# Step 4: scan the maximum tree depth (1..299) with the previously selected
# hyper-parameters held fixed.
fixed_params = {
    'n_estimators': n_est_5cv,
    'learning_rate': n_lr_5cv,
    'random_state': random_state_5cv,
}
depth_candidates = range(1, 300)
score_5cv_all = [
    cross_val_score(XGBR(max_depth=depth, **fixed_params), Xtrain, Ytrain, cv=5).mean()
    for depth in depth_candidates
]

score_max_5cv = max(score_5cv_all)
# First occurrence of the maximum, so the shallowest depth wins a tie.
max_depth_5cv = depth_candidates[score_5cv_all.index(score_max_5cv)]
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "max_depth_5cv:{}".format(max_depth_5cv))
|
|
|
+
|
|
|
# Step 5: scan gamma over [0, 5) in steps of 0.05 with all previously selected
# hyper-parameters held fixed.
gamma_grid = np.arange(0, 5, 0.05)
score_5cv_all = []
for gamma_value in gamma_grid:
    candidate = XGBR(
        n_estimators=n_est_5cv,
        learning_rate=n_lr_5cv,
        random_state=random_state_5cv,
        max_depth=max_depth_5cv,
        gamma=gamma_value,
    )
    fold_scores = cross_val_score(candidate, Xtrain, Ytrain, cv=5)
    score_5cv_all.append(fold_scores.mean())

score_max_5cv = max(score_5cv_all)
# Ties resolve to the smallest gamma (first occurrence of the maximum).
best_gamma_pos = score_5cv_all.index(score_max_5cv)
max_gamma_5cv = gamma_grid[best_gamma_pos]
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "gamma_5cv:{}".format(max_gamma_5cv))
|
|
|
+
|
|
|
# Step 6: scan alpha over [0, 5) in steps of 0.05 with all previously selected
# hyper-parameters held fixed.
# NOTE(review): `alpha` is forwarded to XGBRegressor as an extra keyword;
# presumably it acts as the L1 regularisation term — confirm against the
# installed xgboost version.
tuned_so_far = {
    'n_estimators': n_est_5cv,
    'learning_rate': n_lr_5cv,
    'random_state': random_state_5cv,
    'max_depth': max_depth_5cv,
    'gamma': max_gamma_5cv,
}
alpha_grid = np.arange(0, 5, 0.05)
score_5cv_all = [
    cross_val_score(XGBR(alpha=alpha_value, **tuned_so_far), Xtrain, Ytrain, cv=5).mean()
    for alpha_value in alpha_grid
]

score_max_5cv = max(score_5cv_all)
# First occurrence of the maximum, so the smallest alpha wins a tie.
max_alpha_5cv = alpha_grid[score_5cv_all.index(score_max_5cv)]
print(
    "最大5cv得分:{}".format(score_max_5cv),
    "alpha_5cv:{}".format(max_alpha_5cv))
|
|
|
+
|
|
|
# Final model: combine every hyper-parameter selected by the scans above.
final_params = {
    'learning_rate': n_lr_5cv,
    'n_estimators': n_est_5cv,
    'random_state': random_state_5cv,
    'max_depth': max_depth_5cv,
    'gamma': max_gamma_5cv,
    'alpha': max_alpha_5cv,
}
XGB = XGBR(**final_params)

# Cross-validated performance on the training split: mean fold score plus the
# RMSE of the out-of-fold predictions.
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
cv_mse = mean_squared_error(Ytrain, CV_predictions)
rmse1 = np.sqrt(cv_mse)

# Fit on the full training split and evaluate on the held-out test split.
regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
test_mse = mean_squared_error(Ytest, test_predictions)
rmse2 = np.sqrt(test_mse)

# Report both sets of metrics.
print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)
|