1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- import pandas as pd
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import train_test_split, cross_val_score
- from sklearn.metrics import mean_squared_error, r2_score
- from model_saver import save_model
def _select_by_cv(candidates, make_model, X, y, cv):
    """Return ``(best_candidate, best_score)`` by mean cross-validation score.

    Ties keep the earliest candidate because the comparison is strict.
    ``best_candidate`` is ``None`` when ``candidates`` is empty or no
    candidate yields a score greater than ``-inf`` (e.g. all scores are NaN).
    """
    best_candidate = None
    # Start from -inf so that negative scores can still be selected.
    best_score = -float('inf')
    for candidate in candidates:
        score = cross_val_score(make_model(candidate), X, y, cv=cv).mean()
        if score > best_score:
            best_candidate, best_score = candidate, score
    return best_candidate, best_score


def train_random_forest(X_train, y_train, *, n_estimators_grid=range(1, 20),
                        max_depth_grid=range(1, 5), cv=5, random_state=40):
    """
    Train a random forest regressor with a two-stage hyper-parameter search.

    Stage 1 picks the number of trees with the best mean cross-validation
    score; stage 2 keeps that number fixed and picks the best maximum depth.
    The final model is then fitted on the full training data.

    @param {DataFrame} X_train - training features
    @param {Series} y_train - training target
    @param {iterable} n_estimators_grid - tree counts to try (default 1..19)
    @param {iterable} max_depth_grid - maximum depths to try (default 1..4)
    @param {int} cv - ``cv`` argument forwarded to ``cross_val_score``
    @param {int} random_state - seed used for every forest that is built
    @return {tuple} - (fitted model, best number of trees, best max depth)
    @raises {ValueError} - if no tree count could be selected
    """
    # Stage 1: choose the number of trees.
    best_n_estimators, best_score = _select_by_cv(
        n_estimators_grid,
        lambda n: RandomForestRegressor(n_estimators=n, random_state=random_state),
        X_train, y_train, cv,
    )
    if best_n_estimators is None:
        raise ValueError(
            "no n_estimators candidate produced a valid cross-validation score"
        )
    print(f"Best number of trees: {best_n_estimators}, Score: {best_score}")

    # Stage 2: with the tree count fixed, choose the maximum depth.
    # The search restarts from -inf (not 0): the score may be negative, and
    # a 0 baseline would silently leave the depth unselected in that case.
    best_max_depth, best_score = _select_by_cv(
        max_depth_grid,
        lambda depth: RandomForestRegressor(
            n_estimators=best_n_estimators,
            max_depth=depth,
            random_state=random_state,
        ),
        X_train, y_train, cv,
    )
    print(f"Best max depth: {best_max_depth}, Score: {best_score}")

    # Fit the final model with the selected tree count and depth.
    # If no depth was selected, max_depth=None is passed through unchanged.
    best_model = RandomForestRegressor(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        random_state=random_state,
    )
    best_model.fit(X_train, y_train)

    # Record the feature names on the model when the input provides them.
    # NOTE(review): recent scikit-learn versions appear to set this attribute
    # themselves when fitted on a DataFrame - confirm whether this override
    # is still required.
    if hasattr(X_train, 'columns'):
        best_model.feature_names_in_ = X_train.columns
    return best_model, best_n_estimators, best_max_depth
# Time the whole search / training / evaluation / saving run.
import os
import time

# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()

# Load the data and split it into train and test sets.
# Paths are built with os.path.join rather than a literal containing
# backslashes: '\d', '\A' and '\p' are invalid escape sequences and the
# literal only resolved to a real path on Windows.
data_path = os.path.join('model_optimize', 'data', 'Acidity_reduce_new.xlsx')
model_dir = os.path.join('model_optimize', 'pkl')

data = pd.read_excel(data_path)
# First column is the target; the remaining columns are the features.
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
x.columns = ['pH', 'OM', 'CL', 'H', 'Al']
y.name = 'target'
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the tuned model and evaluate it on both splits.
best_model, best_n_estimators, best_max_depth = train_random_forest(Xtrain, Ytrain)
train_pred = best_model.predict(Xtrain)
test_pred = best_model.predict(Xtest)
# RMSE is computed as sqrt(MSE) so it does not depend on the `squared`
# keyword of mean_squared_error, which newer scikit-learn releases dropped.
test_rmse = mean_squared_error(Ytest, test_pred) ** 0.5
print(f"Best Params: {{'n_estimators': {best_n_estimators}, 'max_depth': {best_max_depth}, 'min_samples_split': 2, 'min_samples_leaf': 1}}")
print(f"Train R²: {r2_score(Ytrain, train_pred):.3f}")
print(f"Test R²: {r2_score(Ytest, test_pred):.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

# Save the model.
save_model(best_model, model_dir, 'rf_model_web_')

# Report the elapsed time.
end = time.perf_counter()
print(f"Time: {end - start:.3f}s")
|