import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from model_saver import save_model
import time

# 早停法实现函数
def incremental_training(model, X_train, y_train, X_val, y_val, init_trees=10, step=10, max_trees=500, patience=5):
    """
    Pick a tree count for a random forest by incremental training with early stopping.

    The model is switched to ``warm_start=True`` and refit with a growing
    ``n_estimators``; after every fit it is scored on the validation set via
    ``model.score``. Training stops once ``patience`` consecutive rounds fail
    to strictly beat the best score seen so far, or once the next tree count
    would exceed ``max_trees``.

    Side effects: ``model`` is mutated in place. On return it has
    ``warm_start=True`` and holds the LAST tree count that was tried, which is
    not necessarily the returned best count. One progress line is printed per
    round.

    @param {RandomForestRegressor} model - estimator to grow (any object with set_params / fit / score works)
    @param {DataFrame} X_train - training features
    @param {Series} y_train - training targets
    @param {DataFrame} X_val - validation features
    @param {Series} y_val - validation targets
    @param {int} init_trees - tree count used for the first round
    @param {int} step - trees added per round (must be >= 1)
    @param {int} max_trees - upper bound on the tree count; never exceeded
    @param {int} patience - consecutive non-improving rounds tolerated before stopping
    @return {int} - tree count with the best validation score; ``init_trees``
                    if no round ran (i.e. ``init_trees > max_trees``)
    @raises {ValueError} - if ``step`` is smaller than 1
    """
    if step < 1:
        # A non-positive step would either crash inside range() or silently
        # skip training altogether; fail loudly instead.
        raise ValueError(f"step must be a positive integer, got {step!r}")

    best_score = -np.inf
    best_n = init_trees
    no_improve = 0

    # Enable warm start so successive fits extend the existing ensemble.
    model.set_params(n_estimators=init_trees, warm_start=True)

    # The bound is max_trees + 1 (not max_trees + step) so that a max_trees
    # which is not aligned with the step is never overshot.
    for n in range(init_trees, max_trees + 1, step):
        model.n_estimators = n
        model.fit(X_train, y_train)
        current_score = model.score(X_val, y_val)

        # Only a strict improvement resets the patience counter.
        if current_score > best_score:
            best_score = current_score
            best_n = n
            no_improve = 0
        else:
            no_improve += 1

        print(f"当前树数量: {n} | 验证集 R²: {current_score:.4f} | 最佳 R²: {best_score:.4f}")

        if no_improve >= patience:
            print(f"在 {n} 棵树时早停")
            break

    return best_n

# Load the dataset and separate the target from the features.
# The path is a raw string: '\d' and '\A' are not valid escape sequences, so a
# plain literal triggers an invalid-escape warning on recent Python versions.
# The raw string yields exactly the same path text.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce_new.xlsx')
x = data.iloc[:, 1:]  # every column after the first is a feature
y = data.iloc[:, 0]   # the first column is the prediction target
# Five names are assigned, so the sheet is expected to hold five feature columns.
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'

# Hold out 20% of the data as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# ... then carve 20% of the remaining rows off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

start = time.time()

# Stage 1: choose the number of trees by early stopping, with all other
# hyper-parameters left at their defaults.
print("阶段一：使用早停法确定树的数量")
# warm_start enables incremental training of the forest.
base_model = RandomForestRegressor(random_state=40, warm_start=True)

# Run the early-stopping search: start at 2 trees, add 2 per round, cap at
# 100 trees and tolerate up to 30 rounds without improvement.
optimal_trees = incremental_training(
    base_model,
    X_train,
    y_train,
    X_val,
    y_val,
    init_trees=2,
    step=2,
    max_trees=100,
    patience=30,
)

print(f"\n最佳树数量: {optimal_trees}")

# Stage 2: grid-search the structural hyper-parameters while keeping the tree
# count fixed at the value found in stage 1.
print("\n阶段二：网格搜索调优其他超参数")
param_grid = {
    'max_depth': [None, *range(2, 11), 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# 5-fold cross-validated search scored by R², using all available cores.
# fit() returns the search object itself, so the call can be chained.
grid_search = GridSearchCV(
    RandomForestRegressor(n_estimators=optimal_trees, random_state=40),
    param_grid,
    scoring='r2',
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(X_train, y_train)

best_params = grid_search.best_params_
print("\n阶段二最佳参数:", best_params)

# Train the final model with the tuned parameters on the training and
# validation rows combined.
final_model = RandomForestRegressor(
    n_estimators=optimal_trees,
    random_state=40,
    **best_params,
)
final_model.fit(
    pd.concat([X_train, X_val]),
    pd.concat([y_train, y_val]),
)

# Evaluate the final model on the held-out test set.
test_pred = final_model.predict(X_test)
test_r2 = r2_score(y_test, test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
print("\n最终模型性能:")
print(f"测试集 R²: {test_r2:.3f}")
print(f"测试集 RMSE: {test_rmse:.3f}")

# Persist the fitted model.
# The directory is a raw string: '\p' is not a valid escape sequence, so a
# plain literal triggers an invalid-escape warning on recent Python versions.
# The raw string passes exactly the same text to save_model.
save_model(final_model, r'model_optimize\pkl', 'rf_model_')

# Report the wall-clock time elapsed since `start` was recorded.
end = time.time()
print(f"\n总耗时: {end - start:.1f}秒")