"""Plot a learning curve for a RandomForestRegressor on soil-acidity data.

Loads one soil dataset (the active one is the acidity-reduction set;
alternatives are kept below, commented out), holds out a test split,
computes 5-fold cross-validated learning-curve scores on the training
split, prints the raw score matrices, and plots the mean curves.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve, train_test_split

## Acidity-reduction data: 54 samples, 5 features, target = 1/b
# NOTE: raw string avoids invalid escape sequences (\d, \A) in the Windows
# path; the runtime path bytes are unchanged.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce.xlsx')
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Assign feature column names
x.columns = [
    'pH',
    'OM',
    'CL',
    'H',
    'Al'
]
y.name = 'target'

# ## Soil acid-reversion screening data: 64 samples, 9 features (incl. delta_ph), target = 105_day_ph
# data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')
# x = data.iloc[:, 1:10]
# print(x)
# y = data.iloc[:, -1]
# print(y)
# # Assign feature column names
# x.columns = [
#     'organic_matter',    # OM g/kg
#     'chloride',          # CL g/kg
#     'cec',               # CEC cmol/kg
#     'h_concentration',   # H+ cmol/kg
#     'hn',                # HN mg/kg
#     'al_concentration',  # Al3+ cmol/kg
#     'free_alumina',      # Free alumina g/kg
#     'free_iron',         # Free iron oxides g/kg
#     'delta_ph'           # ΔpH
# ]
# y.name = 'target_ph'

# ## Soil acid-reversion screening data: 64 samples, 8 features, target = delta_ph
# # data = pd.read_excel(r'model_optimize\data\data_filt.xlsx')  # 64 samples
# data = pd.read_excel(r'model_optimize\data\data_filt - 副本.xlsx')  # 60 samples (outliers removed)
# x = data.iloc[:, 1:9]
# print(x)
# y = data.iloc[:, -2]
# print(y)
# # Assign feature column names
# x.columns = [
#     'organic_matter',    # OM g/kg
#     'chloride',          # CL g/kg
#     'cec',               # CEC cmol/kg
#     'h_concentration',   # H+ cmol/kg
#     'hn',                # HN mg/kg
#     'al_concentration',  # Al3+ cmol/kg
#     'free_alumina',      # Free alumina g/kg
#     'free_iron',         # Free iron oxides g/kg
# ]
# y.name = 'target_ph'

## Train/test split; the 20% test split is held out and not used by the
## learning curve itself.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Model under evaluation: RandomForestRegressor as the example estimator
rfc = RandomForestRegressor(random_state=1)

# Compute the learning curve: CV scores at 10 training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    rfc,                                    # estimator
    Xtrain,                                 # training features
    Ytrain,                                 # training target
    cv=5,                                   # 5-fold cross-validation
    n_jobs=-1,                              # use all available CPU cores
    train_sizes=np.linspace(0.1, 1.0, 10)   # 10 points from 10% to 100% of the data
)

# Raw score matrices (rows = train sizes, cols = CV folds)
print("test_scores: \n", test_scores)
print("train_scores: \n", train_scores)

# Plot mean training and cross-validation score vs. training-set size
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training score", color="r")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Cross-validation score", color="g")
plt.title("Learning Curve (Random Forest Regressor)")
# learning_curve returns absolute sample counts, not percentages, so the
# axis is labelled in samples (original label said "%").
plt.xlabel("Training Size (samples)")
plt.ylabel("Score (R²)")
plt.legend(loc="best")
plt.grid(True)
plt.show()