model.py

import datetime
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sqlalchemy import text
from xgboost import XGBRegressor

from .database_models import Models, Datasets

# Load a pickled model from disk, looked up by its database ID
def load_model(session, model_id):
    model = session.query(Models).filter(Models.ModelID == model_id).first()
    if not model:
        raise ValueError(f"Model with ID {model_id} not found.")
    with open(model.ModelFilePath, 'rb') as f:
        return pickle.load(f)


# Predict with the model identified by model_id
def predict(session, input_data: pd.DataFrame, model_id):
    # Load the model referenced by the given model ID
    ML_model = load_model(session, model_id)
    predictions = ML_model.predict(input_data)
    return predictions.tolist()

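# Usage sketch for predict() (illustration only, not called by the application):
# it assumes a SQLAlchemy session bound to the application's database; the
# engine URL and model ID below are hypothetical placeholders.
#
#   from sqlalchemy import create_engine
#   from sqlalchemy.orm import sessionmaker
#
#   session = sessionmaker(bind=create_engine('sqlite:///app.db'))()
#   preds = predict(session, pd.DataFrame([{"pH": 5.2, "OM": 3.1, "CL": 25.6,
#                                           "H": 0.5, "Al": 12.4}]), model_id=1)
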
# Compute an R² score for a stored model against the saved held-out test set
def calculate_model_score(model_info):
    # Load the model from disk
    with open(model_info.ModelFilePath, 'rb') as f:
        ML_model = pickle.load(f)
    # print("Model requires the following features:", ML_model.feature_names_in_)

    # Load the saved X_test and Y_test for the matching dataset type
    if model_info.Data_type == 'reflux':    # acid-reflux dataset
        X_test = pd.read_csv('uploads/data/X_test_reflux.csv')
        Y_test = pd.read_csv('uploads/data/Y_test_reflux.csv')
    elif model_info.Data_type == 'reduce':  # acid-reduction dataset
        X_test = pd.read_csv('uploads/data/X_test_reduce.csv')
        Y_test = pd.read_csv('uploads/data/Y_test_reduce.csv')
    else:
        raise ValueError(f"Unsupported data type: {model_info.Data_type}")
    y_pred = ML_model.predict(X_test)

    # Compute the R² score
    r2 = r2_score(Y_test, y_pred)
    return r2

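# Usage sketch for calculate_model_score() (illustration only): it assumes the
# test-set CSVs above exist and that a session is available as in the predict()
# sketch; the model ID is a hypothetical placeholder.
#
#   model_info = session.query(Models).filter(Models.ModelID == 1).first()
#   print("R2:", calculate_model_score(model_info))
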
def train_and_save_model(session, model_type, model_name, model_description, data_type, dataset_id=None):
    try:
        if not dataset_id:
            # Snapshot the current data into a new dataset; the copy is not committed yet
            dataset_id = save_current_dataset(session, data_type, commit=False)
            if data_type == 'reflux':
                current_table = 'current_reflux'
            elif data_type == 'reduce':
                current_table = 'current_reduce'
            else:
                raise ValueError(f"Unsupported data type: {data_type}")
            # Load the data from the current-data table
            dataset = pd.read_sql_table(current_table, session.bind)
        else:
            # Load the data from the previously copied dataset table
            dataset_table_name = f"dataset_{dataset_id}"
            dataset = pd.read_sql_table(dataset_table_name, session.bind)

        if dataset.empty:
            raise ValueError(f"Dataset {dataset_id} is empty or not found.")

        # Split features and target according to the table layout
        if data_type == 'reflux':
            X = dataset.iloc[:, 1:-1]
            y = dataset.iloc[:, -1]
        elif data_type == 'reduce':
            X = dataset.iloc[:, 2:]
            y = dataset.iloc[:, 1]

        # Train the model
        model = train_model_by_type(X, y, model_type)

        # Save the model record to the database
        model_id = save_model(session, model, model_name, model_type, model_description, dataset_id, data_type)

        # Commit the transaction only after every step has succeeded
        session.commit()
        return model_name, model_id
    except Exception:
        # Roll back the transaction if any step fails, then re-raise
        session.rollback()
        raise

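# Usage sketch for train_and_save_model() (illustration only): the model name and
# description are hypothetical; model_type must be one of 'RandomForest', 'XGBR',
# or 'GBSTR' (see train_model_by_type below).
#
#   name, model_id = train_and_save_model(
#       session, 'RandomForest', 'rf_demo', 'demo acid-reflux model', 'reflux')
#   print(name, model_id)
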
def save_current_dataset(session, data_type, commit=True):
    """
    Create a new dataset record and copy the rows of the matching data-type
    table into a new snapshot table. Commits only if ``commit`` is True.

    Args:
        session (Session): SQLAlchemy session object.
        data_type (str): Type of the dataset.
        commit (bool): Whether to commit the transaction before returning.

    Returns:
        int: ID of the newly saved dataset.
    """
    new_dataset = Datasets(
        Dataset_name=f"{data_type}_dataset_{datetime.datetime.now():%Y%m%d_%H%M%S}",
        Dataset_description=f"Automatically generated dataset for type {data_type}",
        Row_count=0,
        Status='pending',
        Dataset_type=data_type
    )
    session.add(new_dataset)
    session.flush()

    dataset_id = new_dataset.Dataset_ID
    source_table = data_type_table_mapping(data_type)
    new_table_name = f"dataset_{dataset_id}"

    # Copy the source rows into a dedicated snapshot table, then mark the
    # dataset record as processed and record its row count
    session.execute(text(f"CREATE TABLE {new_table_name} AS SELECT * FROM {source_table};"))
    session.execute(text(
        f"UPDATE datasets SET status='processed', "
        f"row_count=(SELECT count(*) FROM {new_table_name}) "
        f"WHERE dataset_id={dataset_id};"
    ))

    if commit:
        session.commit()
    return dataset_id

def data_type_table_mapping(data_type):
    """Map a dataset type to the name of its backing database table."""
    if data_type == 'reduce':
        return 'current_reduce'
    elif data_type == 'reflux':
        return 'current_reflux'
    else:
        raise ValueError("Invalid data type provided.")

def train_model_by_type(X, y, model_type):
    # Hold out 20% of the data; only the training split is used for tuning
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if model_type == 'RandomForest':
        # Hyperparameter search for the random forest
        return train_random_forest(X_train, y_train)
    elif model_type == 'XGBR':
        # Hyperparameter search for XGBoost
        return train_xgboost(X_train, y_train)
    elif model_type == 'GBSTR':
        # Hyperparameter search for gradient boosting
        return train_gradient_boosting(X_train, y_train)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

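# Usage sketch for train_model_by_type() (illustration only, synthetic data):
#
#   import numpy as np
#   rng = np.random.default_rng(0)
#   X_demo = pd.DataFrame(rng.random((40, 3)), columns=['a', 'b', 'c'])
#   y_demo = X_demo.sum(axis=1)
#   demo_model = train_model_by_type(X_demo, y_demo, 'RandomForest')
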
def train_random_forest(X_train, y_train):
    best_score = -float('inf')
    best_n_estimators = None
    best_max_depth = None
    random_state = 43

    # Search for the best number of trees
    for n_estimators in range(1, 20, 1):
        model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
        if score > best_score:
            best_score = score
            best_n_estimators = n_estimators
    print(f"Best number of trees: {best_n_estimators}, Score: {best_score}")

    # With the best number of trees fixed, search for the best max depth.
    # Reset to -inf rather than 0 so negative CV scores cannot leave
    # best_max_depth unset.
    best_score = -float('inf')
    for max_depth in range(1, 5, 1):
        model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=max_depth,
                                      random_state=random_state)
        score = cross_val_score(model, X_train, y_train, cv=5).mean()
        if score > best_score:
            best_score = score
            best_max_depth = max_depth
    print(f"Best max depth: {best_max_depth}, Score: {best_score}")

    # Train the final model with the best number of trees and max depth;
    # fitting on a DataFrame records feature_names_in_ automatically
    best_model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth,
                                       random_state=random_state)
    best_model.fit(X_train, y_train)
    return best_model

def train_xgboost(X_train, y_train):
    best_score = -float('inf')
    best_params = {'learning_rate': None, 'max_depth': None}
    random_state = 43

    # Grid-search the learning rate and tree depth with 5-fold cross-validation
    for learning_rate in [0.01, 0.05, 0.1, 0.2]:
        for max_depth in range(3, 10):
            model = XGBRegressor(learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
            score = cross_val_score(model, X_train, y_train, cv=5).mean()
            if score > best_score:
                best_score = score
                best_params['learning_rate'] = learning_rate
                best_params['max_depth'] = max_depth
    print(f"Best parameters: {best_params}, Score: {best_score}")

    # Train the final model with the best parameters found
    best_model = XGBRegressor(learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'],
                              random_state=random_state)
    best_model.fit(X_train, y_train)
    return best_model

def train_gradient_boosting(X_train, y_train):
    best_score = -float('inf')
    best_params = {'learning_rate': None, 'max_depth': None}
    random_state = 43

    # Grid-search the learning rate and tree depth with 5-fold cross-validation
    for learning_rate in [0.01, 0.05, 0.1, 0.2]:
        for max_depth in range(3, 10):
            model = GradientBoostingRegressor(learning_rate=learning_rate, max_depth=max_depth,
                                              random_state=random_state)
            score = cross_val_score(model, X_train, y_train, cv=5).mean()
            if score > best_score:
                best_score = score
                best_params['learning_rate'] = learning_rate
                best_params['max_depth'] = max_depth
    print(f"Best parameters: {best_params}, Score: {best_score}")

    # Train the final model with the best parameters found
    best_model = GradientBoostingRegressor(learning_rate=best_params['learning_rate'],
                                           max_depth=best_params['max_depth'],
                                           random_state=random_state)
    best_model.fit(X_train, y_train)
    return best_model

def save_model(session, model, model_name, model_type, model_description, dataset_id, data_type,
               custom_path='pkl', commit=False):
    """
    Save the model record to the database and the model file to disk,
    committing the transaction only if requested.

    :param session: database session
    :param model: model object to save
    :param model_name: name of the model
    :param model_type: type of the model
    :param model_description: description of the model
    :param dataset_id: dataset ID
    :param data_type: dataset type the model was trained on
    :param custom_path: directory in which to save the model file
    :param commit: whether to commit the transaction
    :return: ID of the saved model record
    """
    # Keys must match the model_type values used by train_model_by_type
    prefix_dict = {
        'RandomForest': 'rf_model_',
        'XGBR': 'xgbr_model_',
        'GBSTR': 'gbstr_model_'
    }
    prefix = prefix_dict.get(model_type, 'default_model_')

    try:
        # Make sure the target directory exists
        os.makedirs(custom_path, exist_ok=True)

        # Build the file name from the current timestamp
        timestamp = datetime.datetime.now().strftime('%m%d_%H%M')
        file_name = os.path.join(custom_path, f'{prefix}{timestamp}.pkl')

        # Serialize the model to disk
        with open(file_name, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model saved as: {file_name}")

        # Create the model database record
        new_model = Models(
            Model_name=model_name,
            Model_type=model_type,
            Description=model_description,
            DatasetID=dataset_id,
            Created_at=datetime.datetime.now(),
            ModelFilePath=file_name,
            Data_type=data_type
        )

        # Add the record to the session
        session.add(new_model)
        session.flush()
        if commit:
            session.commit()

        # Return the new model's ID
        return new_model.ModelID
    except Exception as e:
        print(f"Error saving model: {str(e)}")
        raise

if __name__ == '__main__':
    # Demo of the predict function. It needs a SQLAlchemy session bound to the
    # application database; the engine URL and model IDs below are placeholders
    # that must point at real records.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///app.db')  # placeholder database URL
    session = sessionmaker(bind=engine)()

    # Acid-reflux model prediction: exercise the predict function
    input_data = pd.DataFrame([{
        "organic_matter": 5.2,
        "chloride": 3.1,
        "cec": 25.6,
        "h_concentration": 0.5,
        "hn": 12.4,
        "al_concentration": 0.8,
        "free_alumina": 1.2,
        "free_iron": 0.9,
        "delta_ph": -0.2
    }])
    reflux_model_id = 1  # placeholder: ID of a trained acid-reflux model (originally named 'RF_filt')
    Acid_reflux_result = predict(session, input_data, reflux_model_id)
    print("Acid_reflux_result:", Acid_reflux_result)  # prediction result

    # Acid-reduction model prediction: exercise the predict function
    input_data = pd.DataFrame([{
        "pH": 5.2,
        "OM": 3.1,
        "CL": 25.6,
        "H": 0.5,
        "Al": 12.4
    }])
    reduce_model_id = 2  # placeholder: ID of a trained acid-reduction model (originally named 'rf_model_1214_1008')
    Acid_reduce_result = predict(session, input_data, reduce_model_id)
    print("Acid_reduce_result:", Acid_reduce_result)  # prediction result