# RF_early.py
# Random-forest regression: early stopping picks the number of trees, then a
# grid search tunes the tree-structure hyperparameters.
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.ensemble import RandomForestRegressor
  4. from sklearn.model_selection import GridSearchCV, train_test_split
  5. from sklearn.metrics import mean_squared_error, r2_score
  6. from model_saver import save_model
  7. import time
  8. # 早停法实现函数
  9. def incremental_training(model, X_train, y_train, X_val, y_val, init_trees=10, step=10, max_trees=500, patience=5):
  10. """
  11. 使用早停法增量训练随机森林模型
  12. @param {RandomForestRegressor} model - 随机森林模型
  13. @param {DataFrame} X_train - 训练特征
  14. @param {Series} y_train - 训练标签
  15. @param {DataFrame} X_val - 验证特征
  16. @param {Series} y_val - 验证标签
  17. @param {int} init_trees - 初始树数量
  18. @param {int} step - 每次增加的树数量
  19. @param {int} max_trees - 最大树数量
  20. @param {int} patience - 允许的停滞轮次
  21. @return {int} - 最佳树数量
  22. """
  23. best_score = -np.inf
  24. best_n = init_trees
  25. no_improve = 0
  26. # 初始化模型参数
  27. model.set_params(n_estimators=init_trees, warm_start=True)
  28. for n in range(init_trees, max_trees + step, step):
  29. model.n_estimators = n
  30. model.fit(X_train, y_train)
  31. current_score = model.score(X_val, y_val)
  32. if current_score > best_score:
  33. best_score = current_score
  34. best_n = n
  35. no_improve = 0
  36. else:
  37. no_improve += 1
  38. print(f"当前树数量: {n} | 验证集 R²: {current_score:.4f} | 最佳 R²: {best_score:.4f}")
  39. if no_improve >= patience:
  40. print(f"在 {n} 棵树时早停")
  41. break
  42. return best_n
  43. # 数据加载与预处理
  44. data = pd.read_excel('model_optimize\data\Acidity_reduce_new.xlsx')
  45. x = data.iloc[:,1:]
  46. y = data.iloc[:,0]
  47. x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
  48. y.name = 'target'
  49. # 数据拆分(新增验证集)
  50. X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
  51. X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
  52. start = time.time()
  53. # 阶段一:使用早停法找到合适的树数量(使用默认参数)
  54. print("阶段一:使用早停法确定树的数量")
  55. base_model = RandomForestRegressor(
  56. random_state=40,
  57. warm_start=True # 启用增量训练
  58. )
  59. # 执行早停法
  60. optimal_trees = incremental_training(
  61. model=base_model,
  62. X_train=X_train,
  63. y_train=y_train,
  64. X_val=X_val,
  65. y_val=y_val,
  66. init_trees=2, # 初始树的数量
  67. step=2, # 每次增加的树数量
  68. max_trees=100, # 最大树数量
  69. patience=30 # 允许的停滞轮次
  70. )
  71. print(f"\n最佳树数量: {optimal_trees}")
  72. # 阶段二:网格搜索调优结构参数
  73. print("\n阶段二:网格搜索调优其他超参数")
  74. param_grid = {
  75. 'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
  76. 'min_samples_split': [2, 5],
  77. 'min_samples_leaf': [1, 2],
  78. }
  79. grid_search = GridSearchCV(
  80. estimator=RandomForestRegressor(n_estimators=optimal_trees, random_state=40),
  81. param_grid=param_grid,
  82. cv=5,
  83. scoring='r2',
  84. n_jobs=-1,
  85. verbose=2
  86. )
  87. grid_search.fit(X_train, y_train)
  88. best_params = grid_search.best_params_
  89. print("\n阶段二最佳参数:", best_params)
  90. # 最终模型训练
  91. final_model = RandomForestRegressor(
  92. **best_params,
  93. n_estimators=optimal_trees,
  94. random_state=40
  95. ).fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
  96. # 模型评估
  97. test_pred = final_model.predict(X_test)
  98. print(f"\n最终模型性能:")
  99. print(f"测试集 R²: {r2_score(y_test, test_pred):.3f}")
  100. print(f"测试集 RMSE: {np.sqrt(mean_squared_error(y_test, test_pred)):.3f}")
  101. # 模型保存
  102. save_model(final_model, 'model_optimize\pkl', 'rf_model_')
  103. # 时间统计
  104. end = time.time()
  105. print(f"\n总耗时: {end - start:.1f}秒")