"""Plot a learning curve for a RandomForestRegressor on soil-acidity data.

Loads one soil dataset (the active one is the acidity-reduction set;
alternatives are kept below, commented out), holds out a test split,
computes 5-fold cross-validated learning-curve scores on the training
split, prints the raw score matrices, and plots the mean curves.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve, train_test_split

## Acidity-reduction data: 54 samples, 5 features, target = 1/b
# NOTE: raw string avoids invalid escape sequences (\d, \A) in the Windows
# path; the runtime path bytes are unchanged.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce.xlsx')
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Assign feature column names
x.columns = [
    'pH',
    'OM',
    'CL',
    'H',
    'Al'
]
y.name = 'target'

# ## Soil acid-reversion screening data: 64 samples, 9 features (incl. delta_ph), target = 105_day_ph
# data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')
# x = data.iloc[:, 1:10]
# print(x)
# y = data.iloc[:, -1]
# print(y)
# # Assign feature column names
# x.columns = [
#     'organic_matter',    # OM g/kg
#     'chloride',          # CL g/kg
#     'cec',               # CEC cmol/kg
#     'h_concentration',   # H+ cmol/kg
#     'hn',                # HN mg/kg
#     'al_concentration',  # Al3+ cmol/kg
#     'free_alumina',      # Free alumina g/kg
#     'free_iron',         # Free iron oxides g/kg
#     'delta_ph'           # ΔpH
# ]
# y.name = 'target_ph'

# ## Soil acid-reversion screening data: 64 samples, 8 features, target = delta_ph
# # data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')  # 64 samples
# data = pd.read_excel(r'model_optimize\data\data_filt - 副本.xlsx')  # 60 samples (outliers removed)
# x = data.iloc[:, 1:9]
# print(x)
# y = data.iloc[:, -2]
# print(y)
# # Assign feature column names
# x.columns = [
#     'organic_matter',    # OM g/kg
#     'chloride',          # CL g/kg
#     'cec',               # CEC cmol/kg
#     'h_concentration',   # H+ cmol/kg
#     'hn',                # HN mg/kg
#     'al_concentration',  # Al3+ cmol/kg
#     'free_alumina',      # Free alumina g/kg
#     'free_iron',         # Free iron oxides g/kg
# ]
# y.name = 'target_ph'

## Train/test split; the 20% test split is held out and not used by the
## learning curve itself.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Model under evaluation: RandomForestRegressor as the example estimator
rfc = RandomForestRegressor(random_state=1)

# Compute the learning curve: CV scores at 10 training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    rfc,                                    # estimator
    Xtrain,                                 # training features
    Ytrain,                                 # training target
    cv=5,                                   # 5-fold cross-validation
    n_jobs=-1,                              # use all available CPU cores
    train_sizes=np.linspace(0.1, 1.0, 10)   # 10 points from 10% to 100% of the data
)

# Raw score matrices (rows = train sizes, cols = CV folds)
print("test_scores: \n", test_scores)
print("train_scores: \n", train_scores)

# Plot mean training and cross-validation score vs. training-set size
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training score", color="r")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Cross-validation score", color="g")
plt.title("Learning Curve (Random Forest Regressor)")
# learning_curve returns absolute sample counts, not percentages, so the
# axis is labelled in samples (original label said "%").
plt.xlabel("Training Size (samples)")
plt.ylabel("Score (R²)")
plt.legend(loc="best")
plt.grid(True)
plt.show()