123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- ## 导入常用基本包
- import os
- import pandas as pd
- import numpy as np
- from PIL import Image
- from model_saver import save_model
- # 机器学习模型导入
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import cross_val_score,cross_val_predict
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
- ## 导入常用辅助函数
- from sklearn.model_selection import train_test_split
- from sklearn.model_selection import GridSearchCV
- from sklearn.model_selection import cross_val_score
- from sklearn.model_selection import cross_val_predict
- ## 导入数据处理函数
- from sklearn.preprocessing import StandardScaler
- from sklearn.preprocessing import MinMaxScaler
- ## 导入评分函数
- from sklearn.metrics import r2_score
- from sklearn.metrics import mean_squared_error
- from sklearn.metrics import mean_absolute_error
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import log_loss
- from sklearn.metrics import roc_auc_score
- import pickle
- import pandas as pd
- import numpy as np
- from sklearn.metrics import mean_squared_error
- from pathlib import Path
# Load the dataset.
# The path is built with pathlib instead of a backslash literal: the original
# string relied on "\d" being an invalid (and therefore preserved) escape
# sequence and on Windows-only separators.
data_path = Path('model_optimize') / 'data' / 'data_filt.xlsx'
data = pd.read_excel(data_path)

# Positional columns 1..9 are the predictors, the last column is the target.
# .copy() detaches both selections from `data` so the renaming below only
# affects x and y.
x = data.iloc[:, 1:10].copy()
y = data.iloc[:, -1].copy()

# Give the predictor columns descriptive names.
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
    'free_alumina',      # Free alumina g/kg
    'free_iron',         # Free iron oxides g/kg
    'delta_ph',          # ΔpH
]
y.name = 'target_ph'

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)
## Baseline: evaluate the model previously saved to disk.
# Built with pathlib: the original backslash literal relied on invalid escape
# sequences ("\p", "\R") and only resolved correctly on Windows.
model_path = Path('model_optimize') / 'pkl' / 'RF_filt.pkl'

# Only evaluate when the pickle is actually present; otherwise `rfc` would be
# undefined for the statements that follow.
if model_path.exists():
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file - only load model files from a trusted source.
    with open(model_path, 'rb') as f:
        rfc = pickle.load(f)

    # 5-fold cross-validation on the training split: mean fold score and the
    # RMSE of the out-of-fold predictions.
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
    rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

    # The loaded estimator is fitted again on the training split before it is
    # scored on the held-out test split.
    # NOTE(review): presumably this discards the fitted state stored in the
    # pickle and keeps only its hyperparameters - confirm this is intended.
    regressor = rfc.fit(Xtrain, Ytrain)
    test_predictions = regressor.predict(Xtest)
    score_test = regressor.score(Xtest, Ytest)
    rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

    print("5cv_score:", CV_score)
    print("rmse_5CV", rmse1)
    print("R2_test:", score_test)
    print("rmse_test", rmse2)
else:
    print("baseline model file not found, skipping its evaluation:", model_path)
# Fresh random forest with a fixed seed (all other settings left at the
# library defaults), evaluated the same way: 5-fold CV on the training split,
# then a fit on the full training split scored against the test split.
rfc = RandomForestRegressor(random_state=1)

# Cross-validated score (mean over the 5 folds).
fold_scores = cross_val_score(rfc, Xtrain, Ytrain, cv=5)
CV_score = fold_scores.mean()

# RMSE of the out-of-fold predictions on the training split.
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
cv_mse = mean_squared_error(Ytrain, CV_predictions)
rmse1 = np.sqrt(cv_mse)

# Fit on the whole training split and measure on the held-out test split.
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
test_mse = mean_squared_error(Ytest, test_predictions)
rmse2 = np.sqrt(test_mse)

# Report the four metrics, one per line.
for label, value in (
    ("5cv:", CV_score),
    ("rmse_5CV", rmse1),
    ("R2_test:", score_test),
    ("rmse_test", rmse2),
):
    print(label, value)
|