123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253 |
- # 导入常用包
- import os
- import pandas as pd
- import numpy as np
- from PIL import Image
- from model_saver import save_model
- # 机器学习模型导入
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
- from sklearn.ensemble import GradientBoostingRegressor as GBSTR
- from sklearn.neighbors import KNeighborsRegressor
- from xgboost import XGBRegressor as XGBR
- # 导入数据处理函数
- from sklearn.preprocessing import StandardScaler
- from sklearn.preprocessing import MinMaxScaler
- # 导入评分函数
- from sklearn.metrics import r2_score
- from sklearn.metrics import mean_squared_error
- from sklearn.metrics import mean_absolute_error
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import log_loss
- from sklearn.metrics import roc_auc_score
- import pickle
- import pandas as pd
- import numpy as np
- from sklearn.metrics import mean_squared_error
- from pathlib import Path
- # ## 土壤反酸筛选数据 64个样本 9个特征(包含delta_ph) 105_day_ph
- # data=pd.read_excel('model_optimize\data\data_filt.xlsx')
- # x = data.iloc[:,1:10]
- # print(x)
- # y = data.iloc[:,-1]
- # print(y)
- # # 为 x 赋予列名
- # x.columns = [
- # 'organic_matter', # OM g/kg
- # 'chloride', # CL g/kg
- # 'cec', # CEC cmol/kg
- # 'h_concentration', # H+ cmol/kg
- # 'hn', # HN mg/kg
- # 'al_concentration', # Al3+ cmol/kg
- # 'free_alumina', # Free alumina g/kg
- # 'free_iron', # Free iron oxides g/kg
- # 'delta_ph' # ΔpH
- # ]
- # y.name = 'target_ph'
- # ## 土壤反酸筛选数据 64个样本 8个特征 delta_ph
- # # data=pd.read_excel('model_optimize\data\data_filt.xlsx') # 64个样本
- # data=pd.read_excel('model_optimize\data\data_filt - 副本.xlsx') # 60个样本(去除异常点)
- # x = data.iloc[:,1:9]
- # print(x)
- # y = data.iloc[:,-2]
- # print(y)
- # # 为 x 赋予列名
- # x.columns = [
- # 'organic_matter', # OM g/kg
- # 'chloride', # CL g/kg
- # 'cec', # CEC cmol/kg
- # 'h_concentration', # H+ cmol/kg
- # 'hn', # HN mg/kg
- # 'al_concentration', # Al3+ cmol/kg
- # 'free_alumina', # Free alumina g/kg
- # 'free_iron', # Free iron oxides g/kg
- # ]
- # y.name = 'target_ph'
## Soil re-acidification screening data -- latest set: 34 samples, 6 features, target delta_ph
# Fix: raw string avoids the invalid "\d" escape in the Windows path
# ('...\data\...' previously relied on Python passing unknown escapes
# through, which is deprecated and emits a SyntaxWarning on 3.12+).
data = pd.read_excel(r'model_optimize\data\data_reflux2.xlsx')
x = data.iloc[:, 0:6]   # feature matrix: first six columns
print(x)
y = data.iloc[:, -1]    # target: last column
print(y)
# Give the feature matrix descriptive column names.
x.columns = [
    'organic_matter',    # OM g/kg
    'chloride',          # CL g/kg
    'cec',               # CEC cmol/kg
    'h_concentration',   # H+ cmol/kg
    'hn',                # HN mg/kg
    'al_concentration',  # Al3+ cmol/kg
]
y.name = 'delta_ph'
- # ## 精准降酸数据 54个样本 5个特征 1/b
- # data=pd.read_excel('model_optimize\data\Acidity_reduce.xlsx')
- # x = data.iloc[:,1:]
- # y = data.iloc[:,0]
- # # 为 x 赋予列名
- # x.columns = [
- # 'pH',
- # 'OM',
- # 'CL',
- # 'H',
- # 'Al'
- # ]
- # y.name = 'target'
## Hold out 20% of the samples as a fixed test set.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

## Candidate regressors (seeded for reproducibility where supported).
rfc = RandomForestRegressor(random_state=1)    # random forest
XGB = XGBR(random_state=1)                     # XGBoost
GBST = GBSTR(random_state=1)                   # gradient boosting
KNN = KNeighborsRegressor(n_neighbors=2)       # k-nearest neighbours
# Incremental training: grow the training set by 10% per round and record
# each model's R2 on the fixed held-out test set.
increment = 0.1  # fraction of the training data added per round
# Fix: linspace instead of arange(0.1, 1.1, 0.1) -- arange's float
# accumulation can overshoot (values like 1.0000000000000002 or a spurious
# 1.1-epsilon step), whereas linspace lands exactly on 0.1 .. 1.0.
train_sizes = np.linspace(increment, 1.0, int(round(1.0 / increment)))

r2_scores_rfc = []
r2_scores_xgb = []
r2_scores_gbst = []
r2_scores_knn = []  # KNN R2 history

# One (model, scores-list, label) row per regressor; the four lists keep
# their original names because the plotting code below reads them directly.
_tracked_models = [
    (rfc, r2_scores_rfc, "Random Forest"),
    (XGB, r2_scores_xgb, "XGB"),
    (GBST, r2_scores_gbst, "GBST"),
    (KNN, r2_scores_knn, "KNN"),
]

for size in train_sizes:
    # Rows of the training set used this round (positional prefix slice).
    current_size = int(size * len(Xtrain))

    print(f"Training with {size * 100:.2f}% of the data:")
    for model, scores, label in _tracked_models:
        model.fit(Xtrain[:current_size], Ytrain[:current_size])
        score = r2_score(Ytest, model.predict(Xtest))
        scores.append(score)
        print(f"  - {label} R2 score: {score}")
# Plot R2 versus training-set size (in %) for each model.
import matplotlib.pyplot as plt

for history, mark, label in (
    (r2_scores_rfc, 'o', 'Random Forest'),
    (r2_scores_xgb, 'x', 'XGBoost'),
    (r2_scores_gbst, 's', 'Gradient Boosting'),
):
    plt.plot(train_sizes * 100, history, marker=mark, label=label)
# plt.plot(train_sizes * 100, r2_scores_knn, marker='^', label='KNN')
plt.xlabel('Training data size (%)')
plt.ylabel('R2 Score')
plt.title('Model Performance with Incremental Data')
plt.legend()
plt.grid(True)
plt.show()
# # Data in the format needed by the JavaScript charts
# print("X轴数据(训练数据大小):", [f"{int(size * 100)}%" for size in train_sizes])
# print("Random Forest R2分数:", r2_scores_rfc)
# print("XGBoost R2分数:", r2_scores_xgb)
# print("Gradient Boosting R2分数:", r2_scores_gbst)

# True-vs-predicted scatter for the random-forest model (as fitted in the
# final 100%-data round above).
y_pred = rfc.predict(Xtest)
plt.scatter(Ytest, y_pred, color='blue', alpha=0.5)
# y = x reference diagonal: perfect predictions fall on this line.
plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], color='red', linestyle='--')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.grid(True)
plt.show()

# # Build scatterData (can be pasted straight into the JavaScript code)
# scatter_data = [[float(true), float(pred)] for true, pred in zip(Ytest, y_pred)]
# print("scatterData = ", scatter_data)

# Persist the held-out split as CSV so later evaluation can reload it.
X_test_df = pd.DataFrame(Xtest)
Y_test_df = pd.DataFrame(Ytest)
X_test_df.to_csv('X_test_reflux.csv', index=False)
Y_test_df.to_csv('Y_test_reflux.csv', index=False)
# Fix: the message previously claimed 'X_test_reduce.csv' / 'Y_test_reduce.csv',
# which are not the files written above.
print("X_test 和 Y_test 已保存为 'X_test_reflux.csv' 和 'Y_test_reflux.csv'")
- # 选择后半部分数据
- # import matplotlib.pyplot as plt
- # # 选择后半部分数据
- # half_index = len(train_sizes) // 2
- # # 绘制后半部分的数据
- # plt.plot(train_sizes[half_index:] * 100, r2_scores_rfc[half_index:], marker='o', label='Random Forest')
- # plt.plot(train_sizes[half_index:] * 100, r2_scores_xgb[half_index:], marker='x', label='XGBoost')
- # plt.plot(train_sizes[half_index:] * 100, r2_scores_gbst[half_index:], marker='s', label='Gradient Boosting')
- # plt.plot(train_sizes[half_index:] * 100, r2_scores_knn[half_index:], marker='^', label='KNN')
- # plt.xlabel('Training data size (%)')
- # plt.ylabel('R2 Score')
- # plt.title('Model Performance with Incremental Data (Second Half)')
- # plt.legend()
- # plt.grid(True)
- # plt.show()
|