@@ -3,100 +3,136 @@ import os
import pickle
import pandas as pd
from flask_sqlalchemy.session import Session
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sqlalchemy import text
+from xgboost import XGBRegressor

from .database_models import Models, Datasets


-
# Load a model
-def load_model(model_name):
-    file_path = f'model_optimize/pkl/{model_name}.pkl'
-    with open(file_path, 'rb') as f:
+def load_model(session, model_id):
+    model = session.query(Models).filter(Models.ModelID == model_id).first()
+
+    if not model:
+        raise ValueError(f"Model with ID {model_id} not found.")
+    with open(model.ModelFilePath, 'rb') as f:
        return pickle.load(f)


# Model prediction
-def predict(input_data: pd.DataFrame, model_name):
+def predict(session, input_data: pd.DataFrame, model_id):
    # Initialize the model
-    model = load_model(model_name)  # load the model specified by model name
-    predictions = model.predict(input_data)
+    ML_model = load_model(session, model_id)  # load the model with the given model ID
+    predictions = ML_model.predict(input_data)
    return predictions.tolist()
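+
+# Usage sketch (the feature columns here are hypothetical; the real ones must
+# match the columns the model was trained on):
+#     df = pd.DataFrame([{'pH': 5.2, 'OM': 1.8}])
+#     predict(session, df, model_id=1)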

+# Compute a model's score
+def calculate_model_score(model_info):
+    # Load the model
+    with open(model_info.ModelFilePath, 'rb') as f:
+        ML_model = pickle.load(f)
+    # print("Model requires the following features:", ML_model.feature_names_in_)
+    # Prepare the data
+    if model_info.Data_type == 'reflux':  # re-acidification dataset
+        # Load the saved X_test and Y_test
+        X_test = pd.read_csv('uploads/data/X_test_reflux.csv')
+        Y_test = pd.read_csv('uploads/data/Y_test_reflux.csv')
+        print(X_test.columns)  # the column names used at test time
+        y_pred = ML_model.predict(X_test)
+    elif model_info.Data_type == 'reduce':  # acid-reduction dataset
+        # Load the saved X_test and Y_test
+        X_test = pd.read_csv('uploads/data/X_test_reduce.csv')
+        Y_test = pd.read_csv('uploads/data/Y_test_reduce.csv')
+        print(X_test.columns)  # the column names used at test time
+        y_pred = ML_model.predict(X_test)
+    else:
+        raise ValueError(f"Unknown data type: {model_info.Data_type}")
+
+    # Compute the R² score
+    r2 = r2_score(Y_test, y_pred)
+    return r2


def train_and_save_model(session, model_type, model_name, model_description, data_type, dataset_id=None):
-    if not dataset_id:
-        # Directly create a new dataset and copy the data
-        dataset_id = save_current_dataset(session, data_type)
+    try:
+        if not dataset_id:
+            # Create a new dataset and copy the data; nothing is committed yet
+            dataset_id = save_current_dataset(session, data_type, commit=False)

-    # Load the data from the newly copied dataset table
-    dataset_table_name = f"dataset_{dataset_id}"
-    dataset = pd.read_sql_table(dataset_table_name, session.bind)
+            if data_type == 'reflux':
+                current_table = 'current_reflux'
+            elif data_type == 'reduce':
+                current_table = 'current_reduce'

-    if dataset.empty:
-        raise ValueError(f"Dataset {dataset_id} is empty or not found.")
+            # Load the data from the current data table
+            dataset = pd.read_sql_table(current_table, session.bind)

-    # Prepare the data
-    X = dataset.iloc[:, :-1]
-    y = dataset.iloc[:, -1]
+        elif dataset_id:
+            # Load the data from the newly copied dataset table
+            dataset_table_name = f"dataset_{dataset_id}"
+            dataset = pd.read_sql_table(dataset_table_name, session.bind)

-    # Train the model
-    model = train_model_by_type(X, y, model_type)
+        if dataset.empty:
+            raise ValueError(f"Dataset {dataset_id} is empty or not found.")

-    # Save the model to the database
-    save_model(session, model, model_name, model_type, model_description, dataset_id, data_type)
+        if data_type == 'reflux':
+            X = dataset.iloc[:, 1:-1]
+            y = dataset.iloc[:, -1]
+        elif data_type == 'reduce':
+            X = dataset.iloc[:, 2:]
+            y = dataset.iloc[:, 1]
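+        # Slicing assumption: in the 'reflux' table, column 0 is an ID and the
+        # target is the last column; in the 'reduce' table, column 1 is the target
+        # and the feature columns start at column 2.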

+        # Train the model
+        model = train_model_by_type(X, y, model_type)

-# # Save the model parameters
-    # save_model_parameters(model, saved_model.ModelID)
+        # Save the model to the database
+        model_id = save_model(session, model, model_name, model_type, model_description, dataset_id, data_type)
+
+        # Commit the transaction manually once every step has succeeded
+        session.commit()
+        return model_name, model_id
+    except Exception as e:
+        # Roll back the transaction if any stage fails
+        session.rollback()
+        raise  # let the caller handle the exception
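+
+# Usage sketch (argument values are illustrative):
+#     train_and_save_model(session, 'RandomForest', 'rf_v1', 'baseline model', 'reflux')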

-    # # Compute evaluation metrics (e.g. MSE)
-    # y_pred = model.predict(X)
-    # mse = mean_squared_error(y, y_pred)
-    #
-    # return saved_model, mse

-def save_current_dataset(session, data_type):
+
+def save_current_dataset(session, data_type, commit=True):
    """
-    Create a new dataset entry and copy the data from the corresponding data-type table.
+    Create a new dataset entry and copy the data from the corresponding data-type table, without committing immediately.

    Args:
        session (Session): SQLAlchemy session object.
-        data_type (str): The dataset type, e.g. 'reduce' or 'reflux'.
+        data_type (str): The dataset type.
+        commit (bool): Whether to commit the transaction at the end of the function.

    Returns:
        int: The ID of the newly saved dataset.
    """
-    # Create a new dataset entry
    new_dataset = Datasets(
-        Dataset_name=f"{data_type}_dataset_{datetime.datetime.now():%Y%m%d_%H%M%S}",  # unique name from the current timestamp
+        Dataset_name=f"{data_type}_dataset_{datetime.datetime.now():%Y%m%d_%H%M%S}",
        Dataset_description=f"Automatically generated dataset for type {data_type}",
-        Row_count=0,  # starts at 0; updated after the data is copied
-        Status='pending',  # initial status is pending
+        Row_count=0,
+        Status='pending',
        Dataset_type=data_type
    )

-    # Add to the database and flush to obtain the ID
    session.add(new_dataset)
-    session.flush()  # flush executes the SQL and yields the ID without committing
+    session.flush()

-    # Get the new dataset's ID
    dataset_id = new_dataset.Dataset_ID
-
-    # Copy the data into a new table
-    source_table = data_type_table_mapping(data_type)  # assumes a helper that maps data types to table names
+    source_table = data_type_table_mapping(data_type)
    new_table_name = f"dataset_{dataset_id}"
-    copy_table_sql = f"CREATE TABLE {new_table_name} AS SELECT * FROM {source_table};"
-    session.execute(text(copy_table_sql))
+    session.execute(text(f"CREATE TABLE {new_table_name} AS SELECT * FROM {source_table};"))
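+    # Table names cannot be bound as SQL parameters, so the statement is built as an
+    # f-string; both identifiers are generated by the application, not by user input.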

-    # Update the new dataset's status and row count
-    update_sql = f"UPDATE datasets SET status='processed', row_count=(SELECT count(*) FROM {new_table_name}) WHERE dataset_id={dataset_id};"
-    session.execute(text(update_sql))
+    session.execute(text(f"UPDATE datasets SET status='processed', row_count=(SELECT count(*) FROM {new_table_name}) WHERE dataset_id={dataset_id};"))

-    session.commit()
+    if commit:
+        session.commit()

    return dataset_id
@@ -128,7 +164,7 @@ def train_model_by_type(X, y, model_type):


def train_random_forest(X_train, y_train):
-    best_score = 0
+    best_score = -float('inf')
    best_n_estimators = None
    best_max_depth = None
    random_state = 43
@@ -145,7 +181,7 @@ def train_random_forest(X_train, y_train):

    # With the best number of trees found, search for the best max depth
-    best_score = 0  # reset the best score before optimizing max depth
+    best_score = -float('inf')  # reset the best score before optimizing max depth
-    for max_depth in range(1, 30, 1):
+    for max_depth in range(1, 5, 1):
        model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=max_depth, random_state=random_state)
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
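+        # cross_val_score uses the estimator's default scorer (R² for regressors),
+        # which can be negative, hence the -inf initialization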
        if score > best_score:
@@ -157,25 +193,64 @@ def train_random_forest(X_train, y_train):
    # Train the final model with the best number of trees and best max depth
    best_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth,
                                       random_state=random_state)
+
+    # Fit on the DataFrame so the column names travel with the model
+    best_model.fit(X_train, y_train)
+    # Record the feature names (scikit-learn also sets this automatically when
+    # fitting on a DataFrame)
+    best_model.feature_names_in_ = X_train.columns
+    return best_model
+
+
+def train_xgboost(X_train, y_train):
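+    # Grid search over learning_rate and max_depth, scored by 5-fold cross-validation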
+    best_score = -float('inf')
+    best_params = {'learning_rate': None, 'max_depth': None}
+    random_state = 43
+
+    for learning_rate in [0.01, 0.05, 0.1, 0.2]:
+        for max_depth in range(3, 10):
+            model = XGBRegressor(learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
+            score = cross_val_score(model, X_train, y_train, cv=5).mean()
+            if score > best_score:
+                best_score = score
+                best_params['learning_rate'] = learning_rate
+                best_params['max_depth'] = max_depth
+
+    print(f"Best parameters: {best_params}, Score: {best_score}")
+
+    # Train the final model with the best parameters found
+    best_model = XGBRegressor(learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'],
+                              random_state=random_state)
    best_model.fit(X_train, y_train)

    return best_model


-def train_xgboost(X_train, y_train, X_test, y_test):
-    # XGBoost training procedure
-    # (add code similar to the above here)
-    pass
+def train_gradient_boosting(X_train, y_train):
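+    # Same search pattern as train_xgboost, using scikit-learn's GradientBoostingRegressor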
+    best_score = -float('inf')
+    best_params = {'learning_rate': None, 'max_depth': None}
+    random_state = 43

+    for learning_rate in [0.01, 0.05, 0.1, 0.2]:
+        for max_depth in range(3, 10):
+            model = GradientBoostingRegressor(learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
+            score = cross_val_score(model, X_train, y_train, cv=5).mean()
+            if score > best_score:
+                best_score = score
+                best_params['learning_rate'] = learning_rate
+                best_params['max_depth'] = max_depth

-def train_gradient_boosting(X_train, y_train, X_test, y_test):
-    # Gradient-boosting training procedure
-    # (add code similar to the above here)
-    pass
+    print(f"Best parameters: {best_params}, Score: {best_score}")

-def save_model(session, model, model_name, model_type, model_description, dataset_id, data_type, custom_path='pkl'):
+    # Train the final model with the best parameters found
+    best_model = GradientBoostingRegressor(learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'],
+                                           random_state=random_state)
+    best_model.fit(X_train, y_train)
+
+    return best_model
+
+def save_model(session, model, model_name, model_type, model_description, dataset_id, data_type, custom_path='pkl', commit=False):
    """
-    Save the model to the database and write the model file to disk.
+    Save the model to the database and write the model file to disk, without committing the transaction immediately.

    :param session: database session
    :param model: the model object to save
@@ -184,21 +259,21 @@ def save_model(session, model, model_name, model_type, model_description, datase
    :param model_description: description of the model
    :param dataset_id: dataset ID
    :param custom_path: path where the model file is saved
+    :param commit: whether to commit the transaction
-    :return: the saved model file path
+    :return: the ID of the saved model record
    """
-    # Set the file-name prefix according to the model type
    prefix_dict = {
        'RandomForest': 'rf_model_',
        'XGBRegressor': 'xgbr_model_',
        'GBSTRegressor': 'gbstr_model_'
    }
-    prefix = prefix_dict.get(model_type, 'default_model_')  # default prefix when model_type is not in the dict
+    prefix = prefix_dict.get(model_type, 'default_model_')
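+    # e.g. model_type='RandomForest' yields file names like 'rf_model_<MMDD_HHMM>.pkl'
+    # (the full path is assembled from custom_path and the timestamp below)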

    try:
        # Make sure the target path exists
        os.makedirs(custom_path, exist_ok=True)

-        # Get the current timestamp (format: month-day hour-minute)
+        # Get the current timestamp
        timestamp = datetime.datetime.now().strftime('%m%d_%H%M')

        # Assemble the full file name
@@ -222,15 +297,15 @@ def save_model(session, model, model_name, model_type, model_description, datase

        # Add the record to the database
        session.add(new_model)
-        session.commit()
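+        # flush() sends the INSERT so new_model.ModelID is populated, while leaving
+        # the transaction open for the caller to commit or roll back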
+        session.flush()

-        # Return the file path
-        return file_name
+        # Return the model ID
+        return new_model.ModelID

    except Exception as e:
-        session.rollback()
        print(f"Error saving model: {str(e)}")
-        raise e  # explicitly re-raise for the caller to handle
+        raise
+