import os
import time

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from model_saver import save_model


def train_random_forest(X_train, y_train):
    """Train a random-forest regressor after a two-stage hyperparameter search.

    Stage 1 picks the number of trees (1..19) with the highest mean 5-fold
    cross-validation score. Stage 2 keeps that tree count fixed and picks the
    maximum depth (1..4) the same way. A final model with both values is then
    fitted on the full training set.

    Args:
        X_train: Training features (a DataFrame; its ``columns`` are stored
            on the fitted model as ``feature_names_in_``).
        y_train: Training target values.

    Returns:
        A tuple ``(best_model, best_n_estimators, best_max_depth)``.
    """
    random_state = 40

    # Stage 1: select the best number of trees.
    best_score = -float('inf')
    best_n_estimators = None
    for n_estimators in range(1, 20):
        model = RandomForestRegressor(
            n_estimators=n_estimators, random_state=random_state
        )
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
        if score > best_score:
            best_score = score
            best_n_estimators = n_estimators

    print(f"Best number of trees: {best_n_estimators}, Score: {best_score}")

    # Stage 2: with the tree count fixed, select the best maximum depth.
    # Reset to -inf (not 0): cross-validated scores can be negative, and a
    # zero baseline would leave best_max_depth as None in that case, silently
    # training the final model with unlimited depth.
    best_score = -float('inf')
    best_max_depth = None
    for max_depth in range(1, 5):
        model = RandomForestRegressor(
            n_estimators=best_n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
        if score > best_score:
            best_score = score
            best_max_depth = max_depth

    print(f"Best max depth: {best_max_depth}, Score: {best_score}")

    # Fit the final model with the selected tree count and depth.
    best_model = RandomForestRegressor(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        random_state=random_state,
    )
    best_model.fit(X_train, y_train)

    # Explicitly record the training column names on the model.
    best_model.feature_names_in_ = X_train.columns

    return best_model, best_n_estimators, best_max_depth


# Time the whole search / evaluation run.
start = time.time()

# Load the data: the first column is the target, the remaining ones are features.
# os.path.join avoids the invalid "\d" / "\A" escape sequences of the former
# backslash literal and yields the same path on Windows.
data = pd.read_excel(
    os.path.join('model_optimize', 'data', 'Acidity_reduce_new.xlsx')
)
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Train and evaluate the selected model.
best_model, best_n_estimators, best_max_depth = train_random_forest(Xtrain, Ytrain)
train_pred = best_model.predict(Xtrain)
test_pred = best_model.predict(Xtest)

print(f"Best Params: {{'n_estimators': {best_n_estimators}, 'max_depth': {best_max_depth}, 'min_samples_split': 2, 'min_samples_leaf': 1}}")
print(f"Train R²: {r2_score(Ytrain, train_pred):.3f}")
print(f"Test R²: {r2_score(Ytest, test_pred):.3f}")
# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated and later
# removed from scikit-learn, whereas this form works on every version.
print(f"Test RMSE: {mean_squared_error(Ytest, test_pred) ** 0.5:.3f}")

# Persist the model.
save_model(best_model, os.path.join('model_optimize', 'pkl'), 'rf_model_web_')

# Report elapsed time.
end = time.time()
print(f"Time: {end - start:.3f}s")