"""Plot a learning curve for a random-forest regressor on a configured dataset.

The script reads one of the Excel datasets described in ``DATASET_CONFIGS``,
holds out 20 % of the rows as a test split, and plots the mean training and
cross-validation scores returned by ``sklearn.model_selection.learning_curve``
for the remaining rows.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve, train_test_split
from xgboost import XGBRegressor as XGBR

# Dataset configurations.
#
# Each entry holds:
#   file_path     - path of the Excel workbook passed to pandas.read_excel
#   x_columns     - positional column indices used as features
#   y_column      - positional column index used as the target
#   feature_names - names assigned to the selected feature columns, in order
#   target_name   - name assigned to the target series
DATASET_CONFIGS = {
    'soil_acid_9features': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 10),  # 9 features (including delta_ph)
        'y_column': -1,  # 105_day_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
            'delta_ph',          # ΔpH
        ],
        'target_name': 'target_ph',
    },
    'soil_acid_8features': {
        'file_path': 'model_optimize/data/data_filt - 副本.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,  # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
        ],
        # NOTE(review): the y_column comment says the target is delta_ph, yet
        # the series is named 'target_ph' - confirm which one is intended.
        'target_name': 'target_ph',
    },
    'soil_acid_8features_original': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,  # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
            'free_alumina',      # Free alumina g/kg
            'free_iron',         # Free iron oxides g/kg
        ],
        # NOTE(review): same naming mismatch as 'soil_acid_8features' - the
        # y_column comment says delta_ph but the name is 'target_ph'.
        'target_name': 'target_ph',
    },
    'soil_acid_6features': {
        'file_path': 'model_optimize/data/data_reflux2.xlsx',
        'x_columns': range(0, 6),  # 6 features
        'y_column': -1,  # delta_ph
        'feature_names': [
            'organic_matter',    # OM g/kg
            'chloride',          # CL g/kg
            'cec',               # CEC cmol/kg
            'h_concentration',   # H+ cmol/kg
            'hn',                # HN mg/kg
            'al_concentration',  # Al3+ cmol/kg
        ],
        'target_name': 'delta_ph',
    },
    'acidity_reduce': {
        'file_path': 'model_optimize/data/Acidity_reduce.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,  # 1/b
        'feature_names': ['pH', 'OM', 'CL', 'H', 'Al'],
        'target_name': 'target',
    },
    'acidity_reduce_new': {
        'file_path': 'model_optimize/data/Acidity_reduce_new.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,  # 1/b
        'feature_names': ['pH', 'OM', 'CL', 'H', 'Al'],
        'target_name': 'target',
    },
}


def load_dataset(dataset_name: str) -> tuple:
    """Load the features and target of a configured dataset.

    Args:
        dataset_name: Key of the dataset in ``DATASET_CONFIGS``.

    Returns:
        A ``(x, y)`` pair: ``x`` is a DataFrame holding the configured feature
        columns renamed to ``feature_names``; ``y`` is the configured target
        column as a Series named ``target_name``.

    Raises:
        ValueError: If ``dataset_name`` is not a key of ``DATASET_CONFIGS``.
    """
    if dataset_name not in DATASET_CONFIGS:
        raise ValueError(f"未知的数据集名称: {dataset_name}")

    config = DATASET_CONFIGS[dataset_name]
    data = pd.read_excel(config['file_path'])

    # Take copies so the renames below act on independent objects rather than
    # on selections taken from `data`.
    x = data.iloc[:, config['x_columns']].copy()
    y = data.iloc[:, config['y_column']].copy()

    # Replace the spreadsheet headers with the configured names.
    x.columns = config['feature_names']
    y.name = config['target_name']

    return x, y


# Select the dataset to use (leave exactly one assignment active).
# Soil re-acidification data: 64 samples, 9 features (including delta_ph), target 105_day_ph
# dataset_name = 'soil_acid_9features'
# Soil re-acidification data: 64 samples, 8 features, target delta_ph
# dataset_name = 'soil_acid_8features_original'
# Soil re-acidification data: 60 samples (outliers removed), 8 features, target delta_ph
# dataset_name = 'soil_acid_8features'
# Soil re-acidification data: 34 samples, 6 features, target delta_ph
# dataset_name = 'soil_acid_6features'
# Precision acid-reduction data: 54 samples, 5 features, target is 1/b
dataset_name = 'acidity_reduce'
# Precision acid-reduction data (updated): 54 samples, 5 features, target is 1/b
# dataset_name = 'acidity_reduce_new'

x, y = load_dataset(dataset_name)

print("特征数据:")
print(x)
print("\n目标数据:")
print(y)

# Train/test split: 20 % of the rows are held out.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Models. Only the random forest is passed to learning_curve below; the
# XGBoost regressor is instantiated but not used in this part of the script.
rfc = RandomForestRegressor(random_state=1)
XGB = XGBR(random_state=1)

# Fractions of the per-fold training set to evaluate: 10 % to 100 %, 10 points.
train_fractions = np.linspace(0.1, 1.0, 10)

# Compute the learning curve.
train_sizes, train_scores, test_scores = learning_curve(
    rfc,                          # estimator
    Xtrain,                       # training features
    Ytrain,                       # training target
    cv=5,                         # number of cross-validation folds
    scoring="r2",                 # explicit, to match the "Score (R²)" axis label
    n_jobs=-1,                    # use all available CPU cores
    train_sizes=train_fractions,
)

# Scores have one row per training size and one column per CV fold.
print("test_scores: \n", test_scores)
print("train_scores: \n", train_scores)

# Plot the learning curve.
plt.figure(figsize=(8, 6))

# learning_curve returns absolute sample counts drawn from each CV training
# fold, which holds fewer rows than Xtrain. Dividing by len(Xtrain) would
# therefore stop the axis short of 100 %; normalise by the largest size used
# and rescale by the largest requested fraction instead.
train_sizes_pct = train_sizes / train_sizes.max() * train_fractions.max() * 100

# Mean score across folds for the training and validation sides.
plt.plot(train_sizes_pct, np.mean(train_scores, axis=1), label="Training score", color="r")
plt.plot(train_sizes_pct, np.mean(test_scores, axis=1), label="Cross-validation score", color="g")

# Figure details.
plt.title("Learning Curve (Random Forest Regressor)")
plt.xlabel("Training Size (%)")
plt.ylabel("Score (R²)")
plt.legend(loc="best")
plt.grid(True)

# Put x-axis ticks on multiples of 10.
plt.xticks(np.arange(0, 101, 10))

plt.show()