"""Plot a cross-validated learning curve for a regressor on one of the configured datasets."""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor as XGBR
# Dataset configurations.
#
# Each entry tells the loader how to slice one Excel workbook:
#   file_path      path of the workbook
#   x_columns      positional indices of the feature columns
#   y_column       positional index of the target column
#   feature_names  names assigned to the selected feature columns, in order
#   target_name    name assigned to the target series

# Feature names used by the soil-acid datasets, in the order they are assigned
# to the selected columns. The smaller datasets use a prefix of this list.
_SOIL_ACID_FEATURES = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]

# Feature names shared by both acidity-reduction datasets.
_ACIDITY_REDUCE_FEATURES = ['pH', 'OM', 'CL', 'H', 'Al']

# Slices / list() below give every configuration its own list object, so
# mutating one configuration's feature names never leaks into another.
DATASET_CONFIGS = {
    'soil_acid_9features': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 10),  # 9 features (including delta_ph)
        'y_column': -1,  # 105_day_ph
        'feature_names': _SOIL_ACID_FEATURES[:9],
        'target_name': 'target_ph',
    },
    'soil_acid_8features': {
        'file_path': 'model_optimize/data/data_filt - 副本.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,  # delta_ph
        'feature_names': _SOIL_ACID_FEATURES[:8],
        # NOTE(review): the target column is described as delta_ph but the
        # series is named 'target_ph' -- confirm which label is intended.
        'target_name': 'target_ph',
    },
    'soil_acid_8features_original': {
        'file_path': 'model_optimize/data/data_filt.xlsx',
        'x_columns': range(1, 9),  # 8 features
        'y_column': -2,  # delta_ph
        'feature_names': _SOIL_ACID_FEATURES[:8],
        # NOTE(review): same delta_ph / 'target_ph' naming mismatch as the
        # 'soil_acid_8features' entry -- confirm which label is intended.
        'target_name': 'target_ph',
    },
    'soil_acid_6features': {
        'file_path': 'model_optimize/data/data_reflux2.xlsx',
        'x_columns': range(0, 6),  # 6 features
        'y_column': -1,  # delta_ph
        'feature_names': _SOIL_ACID_FEATURES[:6],
        'target_name': 'delta_ph',
    },
    'acidity_reduce': {
        'file_path': 'model_optimize/data/Acidity_reduce.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,  # 1/b
        'feature_names': list(_ACIDITY_REDUCE_FEATURES),
        'target_name': 'target',
    },
    'acidity_reduce_new': {
        'file_path': 'model_optimize/data/Acidity_reduce_new.xlsx',
        'x_columns': range(1, 6),  # 5 features
        'y_column': 0,  # 1/b
        'feature_names': list(_ACIDITY_REDUCE_FEATURES),
        'target_name': 'target',
    },
}
def load_dataset(dataset_name, configs=None):
    """Load the features and target of a configured dataset.

    Args:
        dataset_name: Key of the dataset configuration to load.
        configs: Optional mapping from dataset name to a configuration dict
            with the keys ``file_path``, ``x_columns``, ``y_column``,
            ``feature_names`` and ``target_name``. When omitted, the
            module-level ``DATASET_CONFIGS`` is used.

    Returns:
        x: DataFrame with the selected feature columns, renamed to the
            configuration's ``feature_names``.
        y: Series with the selected target column, named after the
            configuration's ``target_name``.

    Raises:
        ValueError: If ``dataset_name`` is not a key of the configuration
            mapping.
    """
    if configs is None:
        configs = DATASET_CONFIGS
    if dataset_name not in configs:
        raise ValueError(f"未知的数据集名称: {dataset_name}")

    config = configs[dataset_name]
    data = pd.read_excel(config['file_path'])

    # Columns are picked by position. Copy the selections so that renaming
    # them below never acts on objects still tied to the loaded frame.
    x = data.iloc[:, list(config['x_columns'])].copy()
    y = data.iloc[:, config['y_column']].copy()

    # Replace the spreadsheet headers with the configured names.
    x.columns = config['feature_names']
    y.name = config['target_name']

    return x, y
# Select the dataset to use (leave exactly one assignment uncommented).
# dataset_name = 'soil_acid_9features'  # soil acid data: 64 samples, 9 features (incl. delta_ph), target 105_day_ph
# dataset_name = 'soil_acid_8features_original'  # soil acid data: 64 samples, 8 features, target delta_ph
# dataset_name = 'soil_acid_8features'  # soil acid data: 60 samples (outliers removed), 8 features, target delta_ph
# dataset_name = 'soil_acid_6features'  # soil acid data: 34 samples, 6 features, target delta_ph
dataset_name = 'acidity_reduce'  # acidity reduction data: 54 samples, 5 features, target 1/b
# dataset_name = 'acidity_reduce_new'  # acidity reduction data (updated): 54 samples, 5 features, target 1/b

x, y = load_dataset(dataset_name)
print("特征数据:")
print(x)
print("\n目标数据:")
print(y)

# Hold out 20% of the samples; only the training part feeds the learning curve.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Models. Only the random forest is used below; XGB is kept as an alternative
# estimator that can be passed to learning_curve instead.
rfc = RandomForestRegressor(random_state=1)
XGB = XGBR(random_state=1)

# Compute the learning curve.
train_sizes, train_scores, test_scores = learning_curve(
    rfc,  # estimator
    Xtrain,  # training features
    Ytrain,  # training target
    cv=5,  # number of cross-validation folds
    n_jobs=-1,  # use all available CPU cores
    train_sizes=np.linspace(0.1, 1.0, 10),  # 10 sizes, from 10% to 100% of the CV training fold
)

# Per-fold scores: one row per training size, one column per CV fold.
print("test_scores: \n", test_scores)
print("train_scores: \n", train_scores)

# Plot the learning curve.
plt.figure(figsize=(8, 6))

# learning_curve returns absolute sample counts measured against the CV
# training fold, which is smaller than Xtrain. Normalise by the largest count
# (the 100% tick) so the axis really spans 10%-100%; dividing by len(Xtrain)
# would stop the curve at roughly 80% with cv=5.
train_sizes_pct = train_sizes / train_sizes.max() * 100

# Mean score across folds for the training and validation parts.
plt.plot(train_sizes_pct, np.mean(train_scores, axis=1), label="Training score", color="r")
plt.plot(train_sizes_pct, np.mean(test_scores, axis=1), label="Cross-validation score", color="g")

# Figure details.
plt.title("Learning Curve (Random Forest Regressor)")
plt.xlabel("Training Size (%)")
plt.ylabel("Score (R²)")
plt.legend(loc="best")
plt.grid(True)

# Put the x-axis ticks on multiples of 10.
plt.xticks(np.arange(0, 101, 10))
plt.show()