RF_web.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import pandas as pd
  2. from sklearn.ensemble import RandomForestRegressor
  3. from sklearn.model_selection import train_test_split, cross_val_score
  4. from sklearn.metrics import mean_squared_error, r2_score
  5. from model_saver import save_model
  6. def train_random_forest(X_train, y_train):
  7. """
  8. 训练随机森林回归模型并优化超参数
  9. @param {DataFrame} X_train - 训练特征数据
  10. @param {Series} y_train - 训练目标数据
  11. @return {tuple} - 返回最佳模型和最佳参数(模型, 最佳树数量, 最佳深度)
  12. """
  13. best_score = -float('inf')
  14. best_n_estimators = None
  15. best_max_depth = None
  16. random_state = 40
  17. # 筛选最佳的树的数量
  18. for n_estimators in range(1, 20, 1):
  19. model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
  20. score = cross_val_score(model, X_train, y_train, cv=5).mean()
  21. if score > best_score:
  22. best_score = score
  23. best_n_estimators = n_estimators
  24. print(f"Best number of trees: {best_n_estimators}, Score: {best_score}")
  25. # 在找到的最佳树的数量基础上,筛选最佳的最大深度
  26. best_score = 0 # 重置最佳得分,为最大深度优化做准备
  27. for max_depth in range(1, 5, 1):
  28. model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=max_depth, random_state=random_state)
  29. score = cross_val_score(model, X_train, y_train, cv=5).mean()
  30. if score > best_score:
  31. best_score = score
  32. best_max_depth = max_depth
  33. print(f"Best max depth: {best_max_depth}, Score: {best_score}")
  34. # 使用最佳的树的数量和最大深度训练最终模型
  35. best_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth,
  36. random_state=random_state)
  37. # 传入列名进行训练
  38. best_model.fit(X_train, y_train)
  39. # 指定传入的特征名
  40. best_model.feature_names_in_ = X_train.columns
  41. return best_model, best_n_estimators, best_max_depth
  42. # 统计筛选时间
  43. import time
  44. start = time.time()
  45. # 数据加载与划分
  46. data = pd.read_excel('model_optimize\data\Acidity_reduce_new.xlsx')
  47. x = data.iloc[:,1:]
  48. y = data.iloc[:,0]
  49. x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
  50. y.name = 'target'
  51. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  52. # 最优模型评估
  53. best_model, best_n_estimators, best_max_depth = train_random_forest(Xtrain, Ytrain)
  54. train_pred = best_model.predict(Xtrain)
  55. test_pred = best_model.predict(Xtest)
  56. print(f"Best Params: {{'n_estimators': {best_n_estimators}, 'max_depth': {best_max_depth}, 'min_samples_split': 2, 'min_samples_leaf': 1}}")
  57. print(f"Train R²: {r2_score(Ytrain, train_pred):.3f}")
  58. print(f"Test R²: {r2_score(Ytest, test_pred):.3f}")
  59. print(f"Test RMSE: {mean_squared_error(Ytest, test_pred, squared=False):.3f}")
  60. # 模型保存
  61. save_model(best_model, 'model_optimize\pkl', 'rf_model_web_')
  62. # 统计筛选时间
  63. end = time.time()
  64. print(f"Time: {end - start:.3f}s")