# Incremental-training experiment for soil acidification regression:
# train several regressors on growing fractions of the training set,
# compare R^2 on a fixed held-out test set, plot the learning curves,
# and save the held-out split to CSV for later reuse.

# Standard library
import os
import pickle
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.ensemble import GradientBoostingRegressor as GBSTR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor as XGBR

# Project-local
from model_saver import save_model

# ---------------------------------------------------------------------------
# Alternative datasets (kept for reference — enable one and disable the
# current one as needed):
#
# Soil acidification screening data, 64 samples, 9 features (incl. delta_ph),
# target = 105-day pH:
#   data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')
#   x = data.iloc[:, 1:10]
#   y = data.iloc[:, -1]
#   x.columns = ['organic_matter', 'chloride', 'cec', 'h_concentration',
#                'hn', 'al_concentration', 'free_alumina', 'free_iron',
#                'delta_ph']
#   y.name = 'target_ph'
#
# Soil acidification screening data, 8 features, target = delta_ph
# (64 samples, or 60 samples with outliers removed):
#   data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')
#   data = pd.read_excel(r'model_optimize\data\data_filt - 副本.xlsx')
#   x = data.iloc[:, 1:9]
#   y = data.iloc[:, -2]
#   x.columns = ['organic_matter', 'chloride', 'cec', 'h_concentration',
#                'hn', 'al_concentration', 'free_alumina', 'free_iron']
#   y.name = 'target_ph'
#
# Precise acid-reduction data, 54 samples, 5 features, target = 1/b:
#   data = pd.read_excel(r'model_optimize\data\Acidity_reduce.xlsx')
#   x = data.iloc[:, 1:]
#   y = data.iloc[:, 0]
#   x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
#   y.name = 'target'
# ---------------------------------------------------------------------------

# Current dataset: soil acidification screening, latest data —
# 34 samples, 6 features, target = delta_ph.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
data = pd.read_excel(r'model_optimize\data\data_reflux2.xlsx')
x = data.iloc[:, 0:6]
print(x)
y = data.iloc[:, -1]
print(y)

# Assign descriptive feature names.
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
]
y.name = 'delta_ph'

# Train/test split — fixed seed so the held-out set is reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Candidate models.
rfc = RandomForestRegressor(random_state=1)
XGB = XGBR(random_state=1)
GBST = GBSTR(random_state=1)
KNN = KNeighborsRegressor(n_neighbors=2)

# Incremental training: grow the training set by 10% each round
# (10%, 20%, ..., 100%) and record each model's R^2 on the fixed test set.
increment = 0.1
train_sizes = np.arange(0.1, 1.1, increment)

models = {
    'Random Forest': rfc,
    'XGB': XGB,
    'GBST': GBST,
    'KNN': KNN,
}
# One R^2 learning curve per model, in training order.
r2_history = {label: [] for label in models}

for size in train_sizes:
    # Number of training rows for this round (positional prefix slice).
    current_size = int(size * len(Xtrain))

    print(f"Training with {size * 100:.2f}% of the data:")
    for label, model in models.items():
        model.fit(Xtrain[:current_size], Ytrain[:current_size])
        score = r2_score(Ytest, model.predict(Xtest))
        r2_history[label].append(score)
        print(f" - {label} R2 score: {score}")

# Learning curves: R^2 versus training-set size.
plt.plot(train_sizes * 100, r2_history['Random Forest'], marker='o',
         label='Random Forest')
plt.plot(train_sizes * 100, r2_history['XGB'], marker='x', label='XGBoost')
plt.plot(train_sizes * 100, r2_history['GBST'], marker='s',
         label='Gradient Boosting')
# plt.plot(train_sizes * 100, r2_history['KNN'], marker='^', label='KNN')
plt.xlabel('Training data size (%)')
plt.ylabel('R2 Score')
plt.title('Model Performance with Incremental Data')
plt.legend()
plt.grid(True)
plt.show()

# # Data in the format needed by the JavaScript front end:
# print("X-axis (training data size):",
#       [f"{int(size * 100)}%" for size in train_sizes])
# print("Random Forest R2 scores:", r2_history['Random Forest'])
# print("XGBoost R2 scores:", r2_history['XGB'])
# print("Gradient Boosting R2 scores:", r2_history['GBST'])

# True-vs-predicted scatter for one model (random forest, trained on the
# full training set by the last loop iteration).
y_pred = rfc.predict(Xtest)
plt.scatter(Ytest, y_pred, color='blue', alpha=0.5)
# Diagonal y = x: points on this line are perfect predictions.
plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)],
         color='red', linestyle='--')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.grid(True)
plt.show()

# # scatterData in a form that can be pasted into JavaScript:
# scatter_data = [[float(t), float(p)] for t, p in zip(Ytest, y_pred)]
# print("scatterData = ", scatter_data)

# Persist the held-out split as CSV so later runs can evaluate on the
# exact same test set.
X_test_df = pd.DataFrame(Xtest)
Y_test_df = pd.DataFrame(Ytest)
X_test_df.to_csv('X_test_reflux.csv', index=False)
Y_test_df.to_csv('Y_test_reflux.csv', index=False)
# Message now matches the filenames actually written (was *_reduce.csv).
print("X_test and Y_test saved as 'X_test_reflux.csv' and 'Y_test_reflux.csv'")

# # Plot only the second half of the learning curves:
# half_index = len(train_sizes) // 2
# plt.plot(train_sizes[half_index:] * 100,
#          r2_history['Random Forest'][half_index:], marker='o',
#          label='Random Forest')
# plt.plot(train_sizes[half_index:] * 100,
#          r2_history['XGB'][half_index:], marker='x', label='XGBoost')
# plt.plot(train_sizes[half_index:] * 100,
#          r2_history['GBST'][half_index:], marker='s',
#          label='Gradient Boosting')
# plt.plot(train_sizes[half_index:] * 100,
#          r2_history['KNN'][half_index:], marker='^', label='KNN')
# plt.xlabel('Training data size (%)')
# plt.ylabel('R2 Score')
# plt.title('Model Performance with Incremental Data (Second Half)')
# plt.legend()
# plt.grid(True)
# plt.show()