import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from model_saver import save_model

def _select_by_cv(param_name, candidates, fixed_params, X_train, y_train):
    """Pick the candidate value of one hyperparameter with the highest mean 5-fold CV score."""
    best_value = None
    # Start from -inf, not 0: the default regressor score (R^2) can be negative,
    # and the best of the candidates must still be selected in that case.
    best_score = -float('inf')
    for value in candidates:
        model = RandomForestRegressor(**fixed_params, **{param_name: value})
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
        # Strict '>' keeps the earliest candidate on ties.
        if score > best_score:
            best_score = score
            best_value = value
    return best_value, best_score


def train_random_forest(X_train, y_train):
    """
    Train a random forest regressor, tuning its hyperparameters in two stages.

    Stage 1 tries n_estimators in 1..19 (max_depth left at its default).
    Stage 2 keeps the chosen n_estimators and tries max_depth in 1..4.
    Every candidate is scored by the mean of 5-fold cross_val_score, and the
    final model is refit on the full training data with the selected values.

    @param {DataFrame} X_train - training features (must expose .columns)
    @param {Series} y_train - training target
    @return {tuple} - (fitted model, best n_estimators, best max_depth)
    """
    random_state = 40

    # Stage 1: choose the number of trees.
    best_n_estimators, best_score = _select_by_cv(
        'n_estimators', range(1, 20), {'random_state': random_state},
        X_train, y_train)

    print(f"Best number of trees: {best_n_estimators}, Score: {best_score}")

    # Stage 2: with the tree count fixed, choose the maximum depth.
    best_max_depth, best_score = _select_by_cv(
        'max_depth', range(1, 5),
        {'n_estimators': best_n_estimators, 'random_state': random_state},
        X_train, y_train)

    print(f"Best max depth: {best_max_depth}, Score: {best_score}")

    # Refit on the whole training set using the selected hyperparameters.
    best_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth,
                                       random_state=random_state)
    best_model.fit(X_train, y_train)
    # Expose the training column labels on the model; this replaces whatever
    # value fit() may have stored under the same attribute.
    best_model.feature_names_in_ = X_train.columns
    return best_model, best_n_estimators, best_max_depth


# Time the whole tuning / evaluation run
import os
import time
# perf_counter is monotonic, so the elapsed time is unaffected by clock adjustments.
start = time.perf_counter()

# Load the data and split it into train / test sets.
# Paths are built with os.path.join: the former literals contained invalid
# escape sequences (\d, \A, \p) and were only usable with Windows separators.
data = pd.read_excel(os.path.join('model_optimize', 'data', 'Acidity_reduce_new.xlsx'))
# First column is the target; the remaining columns are the features.
x = data.iloc[:,1:]
y = data.iloc[:,0]
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Tune, fit and evaluate the best model
best_model, best_n_estimators, best_max_depth = train_random_forest(Xtrain, Ytrain)
train_pred = best_model.predict(Xtrain)
test_pred = best_model.predict(Xtest)

# min_samples_split / min_samples_leaf are not tuned; the printed values are literals.
print(f"Best Params: {{'n_estimators': {best_n_estimators}, 'max_depth': {best_max_depth}, 'min_samples_split': 2, 'min_samples_leaf': 1}}")
print(f"Train R²: {r2_score(Ytrain, train_pred):.3f}")
print(f"Test R²: {r2_score(Ytest, test_pred):.3f}")
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated/removed in
# newer scikit-learn releases, while this form works on every version.
test_rmse = mean_squared_error(Ytest, test_pred) ** 0.5
print(f"Test RMSE: {test_rmse:.3f}")

# Persist the fitted model
save_model(best_model, os.path.join('model_optimize', 'pkl'), 'rf_model_web_')

# Report the elapsed time
end = time.perf_counter()
print(f"Time: {end - start:.3f}s")