Bläddra i källkod

绘制增量学习性能图;
不同模型预测:GBSTR、KNN、RF、XGBR;
降酸模型数据导入

drggboy 8 månader sedan
förälder
incheckning
9716d51ef0

+ 120 - 0
api/model_optimize/GBSTR_filt.py

@@ -0,0 +1,120 @@
+from sklearn.ensemble import GradientBoostingRegressor as GBSTR
+from sklearn.model_selection import cross_val_score,cross_val_predict
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import mean_squared_error
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
# Load the filtered soil dataset.
# NOTE: forward slashes — the original 'model_optimize\data\data_filt.xlsx'
# contains the invalid escape sequence '\d' and only resolves on Windows.
data = pd.read_excel('model_optimize/data/data_filt.xlsx')
x = data.iloc[:, 1:10]   # feature columns (2nd..10th)
y = data.iloc[:, -1]     # target: last column

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'

# Fixed seed keeps the split reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
+
def _best_param(grid, make_model):
    """Return (best_score, best_value) over `grid`.

    For each candidate value, build a model with `make_model(value)` and score
    it with 5-fold CV on (Xtrain, Ytrain); pick the value with the highest
    mean score. Replaces five copy-pasted sweep loops; also drops the
    cross_val_predict calls whose results the original loops discarded
    (they doubled the runtime of the max_depth and alpha sweeps).
    """
    grid = list(grid)
    scores = [cross_val_score(make_model(v), Xtrain, Ytrain, cv=5).mean()
              for v in grid]
    best = int(np.argmax(scores))
    return scores[best], grid[best]

# 1) Learning rate.
score_max_5cv, n_lr_5cv = _best_param(
    np.arange(0.01, 0.5, 0.01),
    lambda v: GBSTR(learning_rate=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "lr_5cv:{}".format(n_lr_5cv))

# 2) Random seed (kept from the original; tuning the seed overfits CV noise —
#    NOTE(review): consider dropping this step).
score_max_5cv, random_state_5cv = _best_param(
    range(0, 200),
    lambda v: GBSTR(learning_rate=n_lr_5cv, random_state=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "random_state_5cv:{}".format(random_state_5cv))

# 3) Number of boosting stages.
score_max_5cv, n_est_5cv = _best_param(
    range(1, 400),
    lambda v: GBSTR(n_estimators=v, learning_rate=n_lr_5cv,
                    random_state=random_state_5cv))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))

# 4) Tree depth.
score_max_5cv, max_depth_5cv = _best_param(
    range(1, 300),
    lambda v: GBSTR(n_estimators=n_est_5cv, learning_rate=n_lr_5cv,
                    random_state=random_state_5cv, max_depth=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))

# 5) Alpha. NOTE(review): GradientBoostingRegressor only uses `alpha` with
#    loss='huber'/'quantile'; with the default squared-error loss it has no
#    effect — confirm the intended loss.
score_max_5cv, max_alpha_5cv = _best_param(
    np.arange(0.01, 1.0, 0.05),
    lambda v: GBSTR(n_estimators=n_est_5cv, learning_rate=n_lr_5cv,
                    random_state=random_state_5cv, max_depth=max_depth_5cv,
                    alpha=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "alpha_5cv:{}".format(max_alpha_5cv))
+
# Final model with the tuned hyper-parameters: report 5-fold CV and hold-out metrics.
GBST = GBSTR(learning_rate=n_lr_5cv, n_estimators=n_est_5cv,
             random_state=random_state_5cv, max_depth=max_depth_5cv,
             alpha=max_alpha_5cv)

# Cross-validated R^2 and RMSE on the training portion.
CV_score = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(GBST, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# Fit once on the full training set, then evaluate on the untouched test split.
regressor = GBST.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)

+ 50 - 0
api/model_optimize/KNN_filt.py

@@ -0,0 +1,50 @@
+# 导入常用基本包
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+import matplotlib
+from mpl_toolkits.mplot3d import Axes3D 
+import matplotlib.cm as cm
+
+# 机器学习相关库
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, log_loss, roc_auc_score
+
+# 导入XGBoost
+from xgboost import XGBRegressor as XGBR
+
+# 其他工具
+import pickle
+from pathlib import Path
+
# Load the filtered soil dataset.
# NOTE: forward slashes — the original backslash path contains the invalid
# escape '\d' and only resolves on Windows.
data = pd.read_excel('model_optimize/data/data_filt.xlsx')
x = data.iloc[:, 1:10]
y = data.iloc[:, -1]

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

from sklearn.neighbors import KNeighborsRegressor
KNN_model = KNeighborsRegressor(n_neighbors=10)
# Renamed from `r2_score`: the original rebinding shadowed the
# sklearn.metrics.r2_score function imported at the top of this file.
knn_cv_scores = cross_val_score(KNN_model, Xtrain, Ytrain, cv=5)
print(knn_cv_scores)
print(knn_cv_scores.mean())

+ 93 - 0
api/model_optimize/RF_default.py

@@ -0,0 +1,93 @@
+## 导入常用基本包
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+from model_saver import save_model
+
+# 机器学习模型导入
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import cross_val_score,cross_val_predict
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+## 导入常用辅助函数
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import cross_val_predict
+
+## 导入数据处理函数
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler
+
+## 导入评分函数
+from sklearn.metrics import r2_score
+from sklearn.metrics import mean_squared_error 
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import log_loss
+from sklearn.metrics import roc_auc_score
+
+import pickle
+import pandas as pd
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from pathlib import Path
+
# Load the filtered soil dataset.
# NOTE: forward slashes — the original backslash path contains the invalid
# escape '\d' and only resolves on Windows.
data = pd.read_excel('model_optimize/data/data_filt.xlsx')
x = data.iloc[:, 1:10]
y = data.iloc[:, -1]

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
+
## Evaluate the previously trained model.
model_path = Path('model_optimize/pkl/RF_filt.pkl')  # forward slashes: '\p' escapes are fragile
# Fail fast with a clear error. The original only loaded inside
# `if model_path.exists():`, so a missing file left `rfc` unbound and the
# cross_val_score call below crashed with a confusing NameError.
if not model_path.exists():
    raise FileNotFoundError(f"model file not found: {model_path}")
with open(model_path, 'rb') as f:
    rfc = pickle.load(f)

# Cross-validated R^2 and RMSE on the training portion.
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))
# Refit on the full training split, then score on the hold-out test split.
regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))
print("5cv_score:", CV_score)
print("rmse_5CV", rmse1)
print("R2_test:", score_test)
print("rmse_test", rmse2)
+
# Baseline: train a fresh RandomForestRegressor with default hyper-parameters
# (fixed seed) and report the same CV / hold-out metrics for comparison.
rfc = RandomForestRegressor(random_state=1)

CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(rfc, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

regressor = rfc.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("R2_test:", score_test)
print("rmse_test", rmse2)
+

+ 35 - 17
api/model_optimize/RF_filt.py

@@ -34,27 +34,45 @@ from sklearn.metrics import log_loss
 from sklearn.metrics import roc_auc_score
 
 
-# 导入数据
-data=pd.read_excel('model_optimize\data\data_filt.xlsx')
-x = data.iloc[:,1:10]
-y = data.iloc[:,-1]
 
+# ## 土壤反酸筛选数据
+# data=pd.read_excel('model_optimize\data\data_filt.xlsx')   
+
+# x = data.iloc[:,1:10]
+# y = data.iloc[:,-1]
+
+# # 为 x 赋予列名
+# x.columns = [
+#     'organic_matter',        # OM g/kg
+#     'chloride',              # CL g/kg
+#     'cec',                   # CEC cmol/kg
+#     'h_concentration',       # H+ cmol/kg
+#     'hn',                    # HN mg/kg
+#     'al_concentration',      # Al3+ cmol/kg
+#     'free_alumina',          # Free alumina g/kg
+#     'free_iron',             # Free iron oxides g/kg
+#     'delta_ph'               # ΔpH
+# ]
+
+# y.name = 'target_ph'
+
## Precise acid-reduction dataset.
# NOTE: forward slashes — a backslash path here would rely on Windows-only
# separators and invalid string escapes.
data = pd.read_excel('model_optimize/data/Acidity_reduce.xlsx')

x = data.iloc[:, 1:]   # features: all columns after the first
y = data.iloc[:, 0]    # target: first column
# Assign feature names.
x.columns = [
    'pH',
    'OM',
    'CL',
    'H',
    'Al'
]
y.name = 'target'

# Fixed seed keeps the split reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
 
 # 筛选随机种子
 score_5cv_all = []
@@ -117,7 +135,7 @@ print("test:",score_test)
 print("rmse_test",rmse2)
 
 # 保存训练好的模型
-custom_path='model_optimize\pkl'         # 模型保存路径
-prefix='rf_model_'          # 模型文件名前缀
+custom_path='model_optimize\pkl'            # 模型保存路径
+prefix='rf_model_'                          # 模型文件名前缀
 save_model(rfc, custom_path, prefix)
 

+ 148 - 0
api/model_optimize/XGBR_filt.py

@@ -0,0 +1,148 @@
+# 导入常用基本包
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+import matplotlib
+from mpl_toolkits.mplot3d import Axes3D 
+import matplotlib.cm as cm
+
+# 机器学习相关库
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, log_loss, roc_auc_score
+
+# 导入XGBoost
+from xgboost import XGBRegressor as XGBR
+
+# 其他工具
+import pickle
+from pathlib import Path
+
# Load the filtered soil dataset.
# NOTE: forward slashes — the original backslash path contains the invalid
# escape '\d' and only resolves on Windows.
data = pd.read_excel('model_optimize/data/data_filt.xlsx')
x = data.iloc[:, 1:10]
y = data.iloc[:, -1]

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
+
def _best_param(grid, make_model):
    """Return (best_score, best_value) over `grid`.

    For each candidate value, build a model with `make_model(value)` and score
    it with 5-fold CV on (Xtrain, Ytrain); keep the value with the highest
    mean score. Replaces six copy-pasted sweep loops.
    """
    grid = list(grid)
    scores = [cross_val_score(make_model(v), Xtrain, Ytrain, cv=5).mean()
              for v in grid]
    best = int(np.argmax(scores))
    return scores[best], grid[best]

# 1) Learning rate.
score_max_5cv, n_lr_5cv = _best_param(
    np.arange(0.01, 0.5, 0.01),
    lambda v: XGBR(learning_rate=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "n_lr_5cv:{}".format(n_lr_5cv))

# 2) Random seed (kept from the original; tuning the seed overfits CV noise —
#    NOTE(review): consider dropping this step).
score_max_5cv, random_state_5cv = _best_param(
    range(0, 200),
    lambda v: XGBR(learning_rate=n_lr_5cv, random_state=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "random_5cv:{}".format(random_state_5cv))

# 3) Number of boosting rounds.
score_max_5cv, n_est_5cv = _best_param(
    range(1, 400),
    lambda v: XGBR(n_estimators=v, learning_rate=n_lr_5cv,
                   random_state=random_state_5cv))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "n_est_5cv:{}".format(n_est_5cv))

# 4) Tree depth.
score_max_5cv, max_depth_5cv = _best_param(
    range(1, 300),
    lambda v: XGBR(n_estimators=n_est_5cv, learning_rate=n_lr_5cv,
                   random_state=random_state_5cv, max_depth=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "max_depth_5cv:{}".format(max_depth_5cv))

# 5) Gamma (minimum loss reduction to split).
score_max_5cv, max_gamma_5cv = _best_param(
    np.arange(0, 5, 0.05),
    lambda v: XGBR(n_estimators=n_est_5cv, learning_rate=n_lr_5cv,
                   random_state=random_state_5cv, max_depth=max_depth_5cv,
                   gamma=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "gamma_5cv:{}".format(max_gamma_5cv))

# 6) Alpha (L1 regularization on weights).
score_max_5cv, max_alpha_5cv = _best_param(
    np.arange(0, 5, 0.05),
    lambda v: XGBR(n_estimators=n_est_5cv, learning_rate=n_lr_5cv,
                   random_state=random_state_5cv, max_depth=max_depth_5cv,
                   gamma=max_gamma_5cv, alpha=v))
print(
      "最大5cv得分:{}".format(score_max_5cv),
      "alpha_5cv:{}".format(max_alpha_5cv))
+
# Final model with the tuned hyper-parameters: report 5-fold CV and hold-out metrics.
XGB = XGBR(learning_rate=n_lr_5cv, n_estimators=n_est_5cv,
           random_state=random_state_5cv, max_depth=max_depth_5cv,
           gamma=max_gamma_5cv, alpha=max_alpha_5cv)

# Cross-validated R^2 and RMSE on the training portion.
CV_score = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(XGB, Xtrain, Ytrain, cv=5)
rmse1 = np.sqrt(mean_squared_error(Ytrain, CV_predictions))

# Fit once on the full training set, then evaluate on the hold-out split.
regressor = XGB.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest, Ytest)
rmse2 = np.sqrt(mean_squared_error(Ytest, test_predictions))

print("5cv:", CV_score)
print("rmse_5CV", rmse1)
print("test:", score_test)
print("rmse_test", rmse2)

BIN
api/model_optimize/data/Acidity_reduce.xlsx


+ 177 - 0
api/model_optimize/data_increase.py

@@ -0,0 +1,177 @@
+# 导入常用包
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+from model_saver import save_model
+
+# 机器学习模型导入
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+from sklearn.ensemble import GradientBoostingRegressor as GBSTR
+from sklearn.neighbors import KNeighborsRegressor
+from xgboost import XGBRegressor as XGBR
+
+# 导入数据处理函数
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler
+
+# 导入评分函数
+from sklearn.metrics import r2_score
+from sklearn.metrics import mean_squared_error 
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import log_loss
+from sklearn.metrics import roc_auc_score
+
+import pickle
+import pandas as pd
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from pathlib import Path
+
## Filtered soil acidification dataset.
# NOTE: forward slashes — the original backslash path contains the invalid
# escape '\d' and only resolves on Windows.
data = pd.read_excel('model_optimize/data/data_filt.xlsx')

x = data.iloc[:, 1:10]
print(x)

# Target: absolute pH change between the last two columns.
# (The original first assigned y = data.iloc[:, -1] and immediately overwrote
# it with abs(y1 - y2) — the dead assignment has been removed.)
y1 = data.iloc[:, -2]
y2 = data.iloc[:, -1]
y = abs(y1 - y2)

print(y)

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'
+
+# ## 精准降酸数据
+# data=pd.read_excel('model_optimize\data\Acidity_reduce.xlsx')
+
+# x = data.iloc[:,1:]
+# y = data.iloc[:,0]
+# # 为 x 赋予列名
+# x.columns = [
+#     'pH',        
+#     'OM',              
+#     'CL', 
+#     'H',
+#     'Al'
+# ]
+# y.name = 'target'
+
+
## Train/test split (fixed seed for reproducibility).
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)

## Models under comparison.
rfc = RandomForestRegressor(random_state=1)      # random forest
XGB = XGBR(random_state=1)                       # XGBoost
GBST = GBSTR(random_state=1)                     # gradient boosting
KNN = KNeighborsRegressor(n_neighbors=4)         # k-nearest neighbours

# Incremental training: grow the training set by 10% per step and record each
# model's hold-out R2 score.
increment = 0.1
train_sizes = np.arange(0.1, 1.1, increment)
r2_scores_rfc = []
r2_scores_xgb = []
r2_scores_gbst = []
r2_scores_knn = []

for size in train_sizes:
    current_size = int(size * len(Xtrain))
    X_part = Xtrain[:current_size]
    Y_part = Ytrain[:current_size]

    # Fit every model on the current prefix of the training data.
    rfc.fit(X_part, Y_part)
    XGB.fit(X_part, Y_part)
    GBST.fit(X_part, Y_part)
    KNN.fit(X_part, Y_part)

    # Score each model on the fixed hold-out set.
    score_rfc = r2_score(Ytest, rfc.predict(Xtest))
    r2_scores_rfc.append(score_rfc)

    score_xgb = r2_score(Ytest, XGB.predict(Xtest))
    r2_scores_xgb.append(score_xgb)

    score_gbst = r2_score(Ytest, GBST.predict(Xtest))
    r2_scores_gbst.append(score_gbst)

    score_knn = r2_score(Ytest, KNN.predict(Xtest))
    r2_scores_knn.append(score_knn)

    # Progress report for this training-set size.
    print(f"Training with {size * 100:.2f}% of the data:")
    print(f"  - Random Forest R2 score: {score_rfc}")
    print(f"  - XGB R2 score: {score_xgb}")
    print(f"  - GBST R2 score: {score_gbst}")
    print(f"  - KNN R2 score: {score_knn}")

# Plot R2 versus training-set size.
import matplotlib.pyplot as plt

plt.plot(train_sizes * 100, r2_scores_rfc, marker='o', label='Random Forest')
plt.plot(train_sizes * 100, r2_scores_xgb, marker='x', label='XGBoost')
plt.plot(train_sizes * 100, r2_scores_gbst, marker='s', label='Gradient Boosting')
# plt.plot(train_sizes * 100, r2_scores_knn, marker='^', label='KNN')

plt.xlabel('Training data size (%)')
plt.ylabel('R2 Score')
plt.title('Model Performance with Incremental Data')
plt.legend()
plt.grid(True)
plt.show()
+
+
+# 选择后半部分数据
+# import matplotlib.pyplot as plt
+
+# # 选择后半部分数据
+# half_index = len(train_sizes) // 2
+
+# # 绘制后半部分的数据
+# plt.plot(train_sizes[half_index:] * 100, r2_scores_rfc[half_index:], marker='o', label='Random Forest')
+# plt.plot(train_sizes[half_index:] * 100, r2_scores_xgb[half_index:], marker='x', label='XGBoost')
+# plt.plot(train_sizes[half_index:] * 100, r2_scores_gbst[half_index:], marker='s', label='Gradient Boosting')
+# plt.plot(train_sizes[half_index:] * 100, r2_scores_knn[half_index:], marker='^', label='KNN')
+
+# plt.xlabel('Training data size (%)')
+# plt.ylabel('R2 Score')
+# plt.title('Model Performance with Incremental Data (Second Half)')
+# plt.legend()
+# plt.grid(True)
+# plt.show()

+ 123 - 0
api/model_optimize/data_increase_5cv_score.py

@@ -0,0 +1,123 @@
+# 导入常用包
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+from model_saver import save_model
+
+# 机器学习模型导入
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+from sklearn.ensemble import GradientBoostingRegressor as GBSTR
+from sklearn.neighbors import KNeighborsRegressor
+from xgboost import XGBRegressor as XGBR
+
+
+
+# 导入数据处理函数
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler
+
+# 导入评分函数
+from sklearn.metrics import r2_score
+from sklearn.metrics import mean_squared_error 
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import log_loss
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import cross_val_score
+
+import pickle
+import pandas as pd
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from pathlib import Path
+
+
+
# Load the filtered soil dataset (portable forward-slash path).
data = pd.read_excel('model_optimize/data/data_filt.xlsx')
x = data.iloc[:, 1:10]   # feature columns
y = data.iloc[:, -1]     # target column

# Assign descriptive feature names.
x.columns = [
    'organic_matter',        # OM g/kg
    'chloride',              # CL g/kg
    'cec',                   # CEC cmol/kg
    'h_concentration',       # H+ cmol/kg
    'hn',                    # HN mg/kg
    'al_concentration',      # Al3+ cmol/kg
    'free_alumina',          # Free alumina g/kg
    'free_iron',             # Free iron oxides g/kg
    'delta_ph'               # ΔpH
]

y.name = 'target_ph'
+
+## 模型
+# 随机森林回归模型
+rfc = RandomForestRegressor(random_state=1)
+# XGBR
+XGB = XGBR(random_state=1)
+# GBSTR
+GBST = GBSTR(random_state=1)
+# KNN
+KNN = KNeighborsRegressor(n_neighbors=5)
+
+# 增量训练:每次增加10%的训练数据
+increment = 0.1  # 每次增加的比例
+train_sizes = np.arange(0.1, 1.1, increment)  # 从0.1到1.0的训练集大小
+# 记录各模型交叉验证评分
+cv5_scores_rfc = []  
+cv5_scores_xgb = []  
+cv5_scores_gbst = []  
+cv5_scores_knn = []  
+
+
+# 对于每种训练集大小,训练模型并记录r2_score
+for size in train_sizes:
+    # 计算当前训练集的大小
+    current_size = int(size * len(Xtrain))
+    
+    # 交叉验证评分
+    score_rfc = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
+    cv5_scores_rfc.append(score_rfc)
+
+    # XGBRegressor的交叉验证评分
+    score_xgb = cross_val_score(XGB, Xtrain, Ytrain, cv=5).mean()
+    cv5_scores_xgb.append(score_xgb)
+
+    # GBST的交叉验证评分
+    score_gbst = cross_val_score(GBST, Xtrain, Ytrain, cv=5).mean()
+    cv5_scores_gbst.append(score_gbst)
+
+    # KNN的交叉验证评分
+    score_knn = cross_val_score(KNN, Xtrain, Ytrain, cv=5).mean()
+    cv5_scores_knn.append(score_knn)
+
+    # 输出当前的训练进度与评分
+    print(f"Training with {size * 100:.2f}% of the data:")
+    print(f"  - Random Forest CV5 score: {score_rfc}")
+    print(f"  - XGB CV5 score: {score_xgb}")
+    print(f"  - GBST CV5 score: {score_gbst}")
+    print(f"  - KNN CV5 score: {score_knn}")
+
+# 绘制R2评分随训练数据大小变化的图形
+import matplotlib.pyplot as plt
+
+plt.plot(train_sizes * 100, cv5_scores_rfc, marker='o', label='Random Forest')
+plt.plot(train_sizes * 100, cv5_scores_xgb, marker='x', label='XGBoost')
+plt.plot(train_sizes * 100, cv5_scores_gbst, marker='s', label='Gradient Boosting')
+plt.plot(train_sizes * 100, cv5_scores_knn, marker='^', label='KNN')
+
+
+plt.xlabel('Training data size (%)')
+plt.ylabel('CV5 Score')
+plt.title('Model Performance with Incremental Data')
+plt.legend()
+plt.grid(True)
+plt.show()