## 导入常用基本包
import os
import pandas as pd
import numpy as np
from PIL import Image
from model_saver import save_model

# 机器学习模型导入
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## 导入常用辅助函数
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

## 导入数据处理函数
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## 导入评分函数
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from pathlib import Path

# Load the dataset.
# The path is assembled from components instead of a backslash literal: the
# original 'model_optimize\data\...' relied on the invalid escape sequence
# "\d" and only resolved on Windows.
data_path = Path('model_optimize') / 'data' / 'data_filt.xlsx'
data = pd.read_excel(data_path)

# Features are the columns at positions 1..9; the target is the last column.
# Explicit copies keep the relabelling below from operating on views of `data`.
x = data.iloc[:, 1:10].copy()
y = data.iloc[:, -1].copy()

# Give the feature columns descriptive names (one per selected column).
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

## Baseline model
# Path built from components: the original backslash literal used the invalid
# escape sequences "\p" and "\R" and only resolved on Windows.
model_path = Path('model_optimize') / 'pkl' / 'RF_filt.pkl'

# Open the file directly rather than guarding with exists(): a missing file
# now raises FileNotFoundError naming the path, instead of skipping the load
# and failing later with "NameError: name 'rfc' is not defined".
# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that come from a trusted source.
with open(model_path, 'rb') as f:
    rfc = pickle.load(f)

# 5-fold cross-validation on the training split: mean fold score plus the RMSE
# of the out-of-fold predictions.
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# Refit the loaded estimator on the full training split, then score it on the
# held-out test split.
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv_score:",CV_score)
print("rmse_5CV",rmse1)
print("R2_test:",score_test)
print("rmse_test",rmse2)

# Fixed-parameter model
# Build a random forest with only the seed specified (every other
# hyper-parameter is left at the library default) and evaluate it.
rfc = RandomForestRegressor(random_state=1)

# Cross-validated performance on the training split (5 folds).
fold_scores = cross_val_score(rfc, Xtrain, Ytrain, cv=5)
CV_score = fold_scores.mean()
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
cv_mse = mean_squared_error(Ytrain, CV_predictions)
rmse1 = np.sqrt(cv_mse)

# Fit on the whole training split and measure performance on the test split.
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
test_mse = mean_squared_error(Ytest, test_predictions)
rmse2 = np.sqrt(test_mse)

# Report the four metrics in the same order and with the same labels as before.
for label, value in (
    ("5cv:", CV_score),
    ("rmse_5CV", rmse1),
    ("R2_test:", score_test),
    ("rmse_test", rmse2),
):
    print(label, value)