# Learning-curve experiment: score four regressors with 5-fold cross-validation
# on progressively larger slices of the training split, then plot the mean CV
# score against the fraction of training data used.

# Common packages
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

# Machine-learning models
from sklearn.ensemble import GradientBoostingRegressor as GBSTR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor as XGBR

# Data-splitting and cross-validation helpers
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Pre-processing helpers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

# Project-local helper
from model_saver import save_model

# Load the data
# Alternative location: 'model_optimize/data/data_filt.xlsx'
data = pd.read_excel('data/data_filt.xlsx')
x = data.iloc[:, 1:10]   # nine feature columns (positions 1..9)
y = data.iloc[:, -1]     # last column is the regression target

# Give the feature columns descriptive names
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]
y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Models
# Random forest regressor
rfc = RandomForestRegressor(random_state=1)
# XGBoost regressor
XGB = XGBR(random_state=1)
# Gradient boosting regressor
GBST = GBSTR(random_state=1)
# k-nearest-neighbours regressor
KNN = KNeighborsRegressor(n_neighbors=5)

# Number of cross-validation folds used for every score below.
CV_FOLDS = 5
# Below this many rows a 5-fold split leaves folds with fewer than two test
# rows (R^2 is then undefined) and can leave KNN with fewer fitting rows than
# n_neighbors, so such subsets are recorded as NaN instead of being scored.
MIN_SUBSET_ROWS = 2 * CV_FOLDS

# Incremental training: grow the training subset by 10% each step.
increment = 0.1  # fraction added per step
# linspace gives exactly increment, 2*increment, ..., 1.0; arange with a float
# step and a non-integer endpoint can gain or lose the final element.
train_sizes = np.linspace(increment, 1.0, int(round(1.0 / increment)))

# Mean cross-validation score per model, one entry per training-set size
cv5_scores_rfc = []
cv5_scores_xgb = []
cv5_scores_gbst = []
cv5_scores_knn = []

# (label used in the progress output, estimator, list collecting its scores)
_model_table = (
    ("Random Forest", rfc, cv5_scores_rfc),
    ("XGB", XGB, cv5_scores_xgb),
    ("GBST", GBST, cv5_scores_gbst),
    ("KNN", KNN, cv5_scores_knn),
)

# For every training-set size, cross-validate each model on that subset only
for size in train_sizes:
    # Number of training rows used at this step (rounded, so that float noise
    # such as 0.30000000000000004 * n cannot truncate to one row too few).
    current_size = int(round(size * len(Xtrain)))

    # Xtrain comes out of train_test_split, which shuffles by default, so the
    # leading rows serve as a random subset of the training data.
    X_subset = Xtrain.iloc[:current_size]
    y_subset = Ytrain.iloc[:current_size]

    # Report progress and scores for this step
    print(f"Training with {size * 100:.2f}% of the data:")
    for label, model, history in _model_table:
        if current_size < MIN_SUBSET_ROWS:
            # Too few rows for a meaningful 5-fold score; NaN leaves a gap in
            # the plot instead of aborting the whole run.
            score = np.nan
        else:
            score = cross_val_score(
                model, X_subset, y_subset, cv=CV_FOLDS
            ).mean()
        history.append(score)
        print(f" - {label} CV5 score: {score}")

# Plot the mean CV score against the share of training data used
_plot_series = (
    (cv5_scores_rfc, 'o', 'Random Forest'),
    (cv5_scores_xgb, 'x', 'XGBoost'),
    (cv5_scores_gbst, 's', 'Gradient Boosting'),
    (cv5_scores_knn, '^', 'KNN'),
)
for scores, marker, label in _plot_series:
    plt.plot(train_sizes * 100, scores, marker=marker, label=label)
plt.xlabel('Training data size (%)')
plt.ylabel('CV5 Score')
plt.title('Model Performance with Incremental Data')
plt.legend()
plt.grid(True)
plt.show()