import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from model_saver import save_model

# Random-forest hyperparameter search script: load the dataset, run an
# exhaustive grid search with 5-fold cross-validation, report the fit of the
# best estimator on the train and held-out test splits, save the model, and
# print the total elapsed time.

# Start the timer for the whole search. perf_counter() is monotonic, so the
# measured duration cannot be skewed by system clock adjustments.
import time
start = time.perf_counter()

# Load the data and split it.
# Raw string: the backslashes are path separators, not escape sequences
# ('\d' and '\A' are invalid escapes and trigger a SyntaxWarning on 3.12+).
# The path is relative to the current working directory.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce_new.xlsx')
# First column is the regression target; all remaining columns are features.
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Assigning five names requires the sheet to have exactly five feature columns.
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'

# 80/20 train/test split with a fixed seed so the split is reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Hyperparameter grid.
# 99 x 11 x 2 x 2 = 4356 candidates; with cv=5 that is 21780 fits plus the
# final refit of the best candidate.
param_grid = {
    # Every value from 2 through 99, plus 200.
    'n_estimators': [*range(2, 100), 200],
    # None (no depth limit), every value from 2 through 10, plus 20.
    'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=40),
    param_grid=param_grid,
    cv=5,
    # Candidates are ranked by R²; 'neg_mean_squared_error' is the
    # alternative metric that was tried previously.
    scoring='r2',
    n_jobs=-1,  # use all available CPU cores
    verbose=2,
)
grid_search.fit(Xtrain, Ytrain)

# Evaluate the best model (already refit on the full training split).
best_model = grid_search.best_estimator_
train_pred = best_model.predict(Xtrain)
test_pred = best_model.predict(Xtest)

# RMSE is computed as sqrt(MSE) instead of passing squared=False, because that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
test_rmse = mean_squared_error(Ytest, test_pred) ** 0.5

print(f"Best Params: {grid_search.best_params_}")
print(f"Train R²: {r2_score(Ytrain, train_pred):.3f}")
print(f"Test R²: {r2_score(Ytest, test_pred):.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

# Save the model.
# Raw string for the same reason as the data path ('\p' is an invalid escape).
save_model(best_model, r'model_optimize\pkl', 'rf_model_')

# Report the total elapsed time.
end = time.perf_counter()
print(f"Time: {end - start:.3f}s")