import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

from model_saver import save_model


# Early-stopping helper
def incremental_training(model, X_train, y_train, X_val, y_val,
                         init_trees=10, step=10, max_trees=500, patience=5):
    """
    Grow a warm-started forest in increments and pick the tree count by early stopping.

    The model is refitted with an increasing ``n_estimators`` and scored on the
    validation set after every increment. Training stops once the validation
    score has failed to improve for ``patience`` consecutive increments, or
    once ``max_trees`` has been evaluated.

    The passed-in model is mutated: ``warm_start`` is switched on and the model
    is left fitted with the last evaluated tree count, which is not necessarily
    the returned one.

    @param {RandomForestRegressor} model - estimator to grow (needs set_params/fit/score)
    @param {DataFrame} X_train - training features
    @param {Series} y_train - training targets
    @param {DataFrame} X_val - validation features
    @param {Series} y_val - validation targets
    @param {int} init_trees - number of trees in the first fit
    @param {int} step - number of trees added per increment (must be >= 1)
    @param {int} max_trees - upper bound on the number of trees ever trained
    @param {int} patience - consecutive non-improving increments tolerated
    @return {int} - tree count with the highest validation score
                    (the smallest such count on ties)
    @raises {ValueError} - if step is not positive
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    best_score = -np.inf
    best_n = init_trees
    no_improve = 0

    # warm_start keeps the already-fitted trees so each fit only adds new ones.
    model.set_params(n_estimators=init_trees, warm_start=True)

    for candidate in range(init_trees, max_trees + step, step):
        # The range may overshoot when (max_trees - init_trees) is not a
        # multiple of step; clamp so max_trees is never exceeded.
        n = min(candidate, max_trees)
        model.n_estimators = n
        model.fit(X_train, y_train)
        current_score = model.score(X_val, y_val)

        # Strict comparison: an equal score counts as "no improvement".
        if current_score > best_score:
            best_score = current_score
            best_n = n
            no_improve = 0
        else:
            no_improve += 1

        print(f"当前树数量: {n} | 验证集 R²: {current_score:.4f} | 最佳 R²: {best_score:.4f}")

        if no_improve >= patience:
            print(f"在 {n} 棵树时早停")
            break

    return best_n


# Data loading and preprocessing.
# Raw string: the Windows-style backslashes must not be read as escape sequences.
data = pd.read_excel(r'model_optimize\data\Acidity_reduce_new.xlsx')
x = data.iloc[:, 1:]   # every column after the first is a feature
y = data.iloc[:, 0]    # the first column is the target
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'

# Data split: 20% held out for testing, then 20% of the remainder for validation
# (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

start = time.time()

# Phase one: find a suitable number of trees by early stopping (default hyper-parameters).
print("阶段一:使用早停法确定树的数量")
base_model = RandomForestRegressor(
    random_state=40,
    warm_start=True,  # enable incremental training
)

# Run early stopping.
optimal_trees = incremental_training(
    model=base_model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    init_trees=2,    # initial number of trees
    step=2,          # trees added per increment
    max_trees=100,   # maximum number of trees
    patience=30,     # non-improving increments tolerated
)
print(f"\n最佳树数量: {optimal_trees}")

# Phase two: grid search over the structural hyper-parameters, with the tree
# count fixed to the value selected in phase one.
print("\n阶段二:网格搜索调优其他超参数")
param_grid = {
    'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(n_estimators=optimal_trees, random_state=40),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("\n阶段二最佳参数:", best_params)

# Final model: refit on training + validation data with the selected settings.
final_model = RandomForestRegressor(
    **best_params,
    n_estimators=optimal_trees,
    random_state=40,
).fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

# Evaluation on the held-out test set.
test_pred = final_model.predict(X_test)
print("\n最终模型性能:")
print(f"测试集 R²: {r2_score(y_test, test_pred):.3f}")
print(f"测试集 RMSE: {np.sqrt(mean_squared_error(y_test, test_pred)):.3f}")

# Persist the model.
# Raw string: the backslash in the Windows-style path must not be read as an escape.
save_model(final_model, r'model_optimize\pkl', 'rf_model_')

# Timing (covers phase one through saving; data loading is excluded).
end = time.time()
print(f"\n总耗时: {end - start:.1f}秒")