"""Incremental-training experiment for soil-acidity regression models.

Loads one of several Excel datasets (configured in DATASET_CONFIGS),
splits it into train/test, trains four regressors (RandomForest, XGBoost,
GradientBoosting, KNN) on growing fractions (10%..100%) of the training
split, and plots the R2 score on the fixed test split versus training-set
size.  Also prints the series in a format meant to be pasted into a
JavaScript front-end.
"""

# Standard library
import os
import pickle
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt  # hoisted from mid-script to the top
import numpy as np
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingRegressor as GBSTR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor as XGBR

# Data handling / preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Metrics (mean_squared_error was imported twice in the original; once suffices)
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

# Dataset configurations.
# Each entry maps a name to: the Excel file, which columns are features
# (x_columns, positional), which column is the target (y_column, positional),
# the feature names to assign, and the target series name.
DATASET_CONFIGS = {
    # Soil re-acidification data: 64 samples, 9 features (incl. delta_ph),
    # target is 105_day_ph.
    'soil_acid_9features': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 10),  # 9 features (including delta_ph)
        'y_column': -1,             # 105_day_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
            'delta_ph',          # ΔpH
        ],
        'target_name': 'target_ph',
    },
    # Soil re-acidification data: 64 samples, 8 features, target delta_ph.
    'soil_acid_8features': {
        'file_path': 'model_optimize/data/data_filt - 副本.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,            # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
        ],
        'target_name': 'target_ph',
    },
    # Same as above but reading the original (un-copied) workbook.
    'soil_acid_8features_original': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,            # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
        ],
        'target_name': 'target_ph',
    },
    # Soil re-acidification data (outliers removed): 6 features, target delta_ph.
    'soil_acid_6features': {
        'file_path': 'model_optimize/data/data_reflux2.xlsx',
        'x_columns': range(0, 6),  # 6 features
        'y_column': -1,            # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
        ],
        'target_name': 'delta_ph',
    },
    # Precise acidity-reduction data: 54 samples, 5 features, target is 1/b.
    'acidity_reduce': {
        'file_path': 'model_optimize/data/Acidity_reduce.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,             # 1/b
        'feature_names': ['pH', 'OM', 'CL', 'H', 'Al'],
        'target_name': 'target',
    },
    # Precise acidity-reduction data (refreshed): 54 samples, 5 features, target 1/b.
    'acidity_reduce_new': {
        'file_path': 'model_optimize/data/Acidity_reduce_new.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,             # 1/b
        'feature_names': ['pH', 'OM', 'CL', 'H', 'Al'],
        'target_name': 'target',
    },
}


def load_dataset(dataset_name):
    """Load the dataset described by DATASET_CONFIGS[dataset_name].

    Args:
        dataset_name: key into DATASET_CONFIGS.

    Returns:
        (x, y): feature DataFrame with columns renamed per the config, and
        the target Series named per the config.

    Raises:
        ValueError: if dataset_name is not a known configuration.
    """
    if dataset_name not in DATASET_CONFIGS:
        raise ValueError(f"未知的数据集名称: {dataset_name}")

    config = DATASET_CONFIGS[dataset_name]
    data = pd.read_excel(config['file_path'])

    # .copy() so the column/name assignments below act on independent objects
    # rather than views of `data` (avoids SettingWithCopy pitfalls).
    x = data.iloc[:, config['x_columns']].copy()
    y = data.iloc[:, config['y_column']].copy()

    x.columns = config['feature_names']
    y.name = config['target_name']
    return x, y


# Select the dataset to use (uncomment exactly one).
# dataset_name = 'soil_acid_9features'          # 9 features (incl. delta_ph), target 105_day_ph
# dataset_name = 'soil_acid_8features_original' # 8 features, target delta_ph
dataset_name = 'soil_acid_8features'            # 8 features, target delta_ph
# dataset_name = 'soil_acid_6features'          # 6 features, target delta_ph
# dataset_name = 'acidity_reduce'               # 5 features, target 1/b
# dataset_name = 'acidity_reduce_new'           # 5 features, target 1/b (refreshed data)

x, y = load_dataset(dataset_name)
print("特征数据:")
print(x)
print("\n目标数据:")
print(y)

# Train/test split: the test fold stays fixed while the training fraction grows.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Models keyed by the display name used in the progress printout.
models = {
    'Random Forest': RandomForestRegressor(random_state=1),
    'XGB': XGBR(random_state=1),
    'GBST': GBSTR(random_state=1),
    'KNN': KNeighborsRegressor(n_neighbors=2),
}

# BUG FIX: np.arange(0.1, 1.1, 0.1) is floating-point fragile — it can emit
# 11 values, which then desynchronizes the plotted x-axis (train_sizes * 100)
# from the 10 recorded scores.  linspace yields exactly 10 exact fractions,
# so the defensive [:10] slice in the loop is no longer needed.
train_sizes = np.linspace(0.1, 1.0, 10)

# One score series per model (KNN is recorded but not plotted, as before).
r2_scores = {name: [] for name in models}

# Incremental training: refit every model on a growing prefix of the
# (already shuffled by train_test_split) training data, score on the
# fixed test fold.
for size in train_sizes:
    current_size = int(size * len(Xtrain))
    print(f"Training with {size * 100:.2f}% of the data:")
    for name, model in models.items():
        model.fit(Xtrain[:current_size], Ytrain[:current_size])
        score = r2_score(Ytest, model.predict(Xtest))
        r2_scores[name].append(score)
        print(f" - {name} R2 score: {score}")

# R2 vs. training-set size (KNN intentionally left off the plot).
plt.plot(train_sizes * 100, r2_scores['Random Forest'], marker='o', label='Random Forest')
plt.plot(train_sizes * 100, r2_scores['XGB'], marker='x', label='XGBoost')
plt.plot(train_sizes * 100, r2_scores['GBST'], marker='s', label='Gradient Boosting')
# plt.plot(train_sizes * 100, r2_scores['KNN'], marker='^', label='KNN')
plt.xlabel('Training data size (%)')
plt.ylabel('R2 Score')
plt.title('Model Performance with Incremental Data')
plt.legend()
plt.grid(True)
plt.show()

# Series in the format the JavaScript front-end expects.
print("X轴数据(训练数据大小):", [f"{int(size * 100)}%" for size in train_sizes])
print("Random Forest R2分数:", r2_scores['Random Forest'])
print("XGBoost R2分数:", r2_scores['XGB'])
print("Gradient Boosting R2分数:", r2_scores['GBST'])

# True-vs-predicted scatter for one model (Random Forest, as in the original);
# after the loop each model is fitted on 100% of the training data.
y_pred = models['Random Forest'].predict(Xtest)

plt.scatter(Ytest, y_pred, color='blue', alpha=0.5)
# Diagonal y = x: points on this line are perfect predictions.
plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], color='red', linestyle='--')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.grid(True)
plt.show()

# scatterData, ready to paste into JavaScript code.
scatter_data = [[float(true), float(pred)] for true, pred in zip(Ytest, y_pred)]
print("scatterData = ", scatter_data)