123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
import os
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

from model_saver import save_model
- # 早停法实现函数
def incremental_training(model, X_train, y_train, X_val, y_val, init_trees=10, step=10, max_trees=500, patience=5):
    """Grow a warm-started ensemble step by step, early-stopping on validation score.

    The model is refitted with an increasing ``n_estimators`` and scored on
    the validation set with ``model.score`` after every fit. The search ends
    when the score has not improved for ``patience`` consecutive rounds or
    when the next tree count would exceed ``max_trees``.

    Note:
        ``model`` is modified in place: ``warm_start`` is switched on and the
        estimator is left fitted with the *last* tree count that was tried,
        which can be larger than the returned best count.

    Args:
        model: Estimator exposing ``set_params``, ``fit``, ``score`` and an
            ``n_estimators`` attribute (e.g. ``RandomForestRegressor``).
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.
        init_trees (int): Tree count used for the first fit.
        step (int): Number of trees added per round.
        max_trees (int): Upper bound on the tree count; never exceeded.
        patience (int): Consecutive non-improving rounds tolerated before
            stopping.

    Returns:
        int: The tree count that achieved the highest validation score
        (``init_trees`` if no round was run).
    """
    best_score = -np.inf
    best_n = init_trees
    no_improve = 0

    # warm_start makes each fit() keep the existing trees and only add new ones.
    model.set_params(n_estimators=init_trees, warm_start=True)

    # The stop bound is max_trees + 1 (not max_trees + step) so the tree count
    # can never overshoot max_trees when (max_trees - init_trees) is not a
    # multiple of step.
    for n in range(init_trees, max_trees + 1, step):
        model.n_estimators = n
        model.fit(X_train, y_train)
        current_score = model.score(X_val, y_val)

        if current_score > best_score:
            best_score = current_score
            best_n = n
            no_improve = 0
        else:
            no_improve += 1

        print(f"当前树数量: {n} | 验证集 R²: {current_score:.4f} | 最佳 R²: {best_score:.4f}")

        if no_improve >= patience:
            print(f"在 {n} 棵树时早停")
            break

    return best_n
# Data loading and preprocessing.
# The path is assembled with os.path.join rather than a backslash literal:
# '\d' and '\A' are invalid escape sequences in a normal string, and a
# hard-coded backslash separator only works on Windows.
data = pd.read_excel(os.path.join('model_optimize', 'data', 'Acidity_reduce_new.xlsx'))
x = data.iloc[:, 1:]  # every column after the first is used as a feature
y = data.iloc[:, 0]   # the first column is the regression target
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'

# Split into train / validation / test: 20% is held out for testing, then 20%
# of the remainder becomes the validation set used for early stopping.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Timing starts after the data has been loaded and split.
start = time.time()

# Stage 1: pick the number of trees by early stopping, with every other
# hyper-parameter left at its default.
print("阶段一:使用早停法确定树的数量")
base_model = RandomForestRegressor(
    random_state=40,
    warm_start=True,  # enable incremental fitting
)

# Run the early-stopping search.
optimal_trees = incremental_training(
    model=base_model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    init_trees=2,   # initial number of trees
    step=2,         # trees added per round
    max_trees=100,  # maximum number of trees
    patience=30,    # non-improving rounds tolerated
)
print(f"\n最佳树数量: {optimal_trees}")

# Stage 2: grid-search the structural hyper-parameters with the tree count
# fixed to the value found in stage 1.
print("\n阶段二:网格搜索调优其他超参数")
param_grid = {
    'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(n_estimators=optimal_trees, random_state=40),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("\n阶段二最佳参数:", best_params)

# Final model: refit on train + validation data with the tuned parameters.
final_model = RandomForestRegressor(
    **best_params,
    n_estimators=optimal_trees,
    random_state=40,
).fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

# Evaluate on the held-out test set.
test_pred = final_model.predict(X_test)
print("\n最终模型性能:")
print(f"测试集 R²: {r2_score(y_test, test_pred):.3f}")
print(f"测试集 RMSE: {np.sqrt(mean_squared_error(y_test, test_pred)):.3f}")

# Persist the model. os.path.join avoids the invalid '\p' escape and yields
# the same string as the original literal on Windows.
save_model(final_model, os.path.join('model_optimize', 'pkl'), 'rf_model_')

# Report elapsed time.
end = time.time()
print(f"\n总耗时: {end - start:.1f}秒")
|