model_compare.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import joblib
  5. from sklearn.metrics import mean_squared_error, r2_score
  6. import matplotlib.pyplot as plt
  7. import seaborn as sns
  8. from datetime import datetime
  9. from sklearn.ensemble import RandomForestRegressor
  10. from sklearn.model_selection import train_test_split
  11. def load_models(model_dir):
  12. """
  13. 加载指定目录中的所有模型
  14. @param {string} model_dir - 模型存储目录路径
  15. @return {dict} - 模型名称和模型对象的字典
  16. """
  17. models = {}
  18. for filename in os.listdir(model_dir):
  19. if filename.endswith('.pkl'):
  20. model_path = os.path.join(model_dir, filename)
  21. model_name = os.path.splitext(filename)[0]
  22. try:
  23. model = joblib.load(model_path)
  24. models[model_name] = model
  25. print(f"成功加载模型: {model_name}")
  26. except Exception as e:
  27. print(f"加载模型 {model_name} 时出错: {str(e)}")
  28. return models
  29. def evaluate_models(models, X_test, y_test):
  30. """
  31. 评估所有模型的性能
  32. @param {dict} models - 模型名称和模型对象的字典
  33. @param {DataFrame} X_test - 测试特征数据
  34. @param {Series} y_test - 测试目标数据
  35. @return {DataFrame} - 包含各模型评估指标的数据框
  36. """
  37. results = []
  38. for name, model in models.items():
  39. try:
  40. # 预测
  41. y_pred = model.predict(X_test)
  42. # 计算评估指标
  43. rmse = mean_squared_error(y_test, y_pred, squared=False)
  44. r2 = r2_score(y_test, y_pred)
  45. # 存储结果
  46. results.append({
  47. 'model_name': name,
  48. 'rmse': rmse,
  49. 'r2': r2
  50. })
  51. print(f"模型 {name} - RMSE: {rmse:.4f}, R²: {r2:.4f}")
  52. except Exception as e:
  53. print(f"评估模型 {name} 时出错: {str(e)}")
  54. # 转换为DataFrame并排序
  55. results_df = pd.DataFrame(results)
  56. return results_df
  57. def select_best_model(results_df, metric='r2', higher_better=True):
  58. """
  59. 根据指定指标选择最佳模型
  60. @param {DataFrame} results_df - 包含各模型评估指标的数据框
  61. @param {string} metric - 用于选择的指标名称
  62. @param {boolean} higher_better - 指标值是否越高越好
  63. @return {string} - 最佳模型名称
  64. """
  65. if higher_better:
  66. best_idx = results_df[metric].idxmax()
  67. else:
  68. best_idx = results_df[metric].idxmin()
  69. best_model = results_df.loc[best_idx, 'model_name']
  70. print(f"根据 {metric} 指标,最佳模型是: {best_model}")
  71. return best_model
  72. def visualize_results(results_df):
  73. """
  74. 可视化各模型的性能比较
  75. @param {DataFrame} results_df - 包含各模型评估指标的数据框
  76. """
  77. # 设置字体,使用通用字体
  78. plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial']
  79. plt.rcParams['axes.unicode_minus'] = False
  80. # 创建图形
  81. fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
  82. # RMSE比较图
  83. sns.barplot(x='model_name', y='rmse', data=results_df, ax=ax1)
  84. ax1.set_title('RMSE Comparison of Models')
  85. ax1.set_xlabel('Model Name')
  86. ax1.set_ylabel('RMSE (Lower is better)')
  87. ax1.tick_params(axis='x', rotation=45)
  88. # R²比较图
  89. sns.barplot(x='model_name', y='r2', data=results_df, ax=ax2)
  90. ax2.set_title('R² Comparison of Models')
  91. ax2.set_xlabel('Model Name')
  92. ax2.set_ylabel('R² (Higher is better)')
  93. ax2.tick_params(axis='x', rotation=45)
  94. plt.tight_layout()
  95. # 保存图表
  96. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  97. plt.savefig(f'model_optimize/results/model_comparison_{timestamp}.png', dpi=300)
  98. plt.show()
  99. def save_best_model(models, best_model_name, output_dir):
  100. """
  101. 将最佳模型保存到指定目录
  102. @param {dict} models - 模型名称和模型对象的字典
  103. @param {string} best_model_name - 最佳模型名称
  104. @param {string} output_dir - 输出目录
  105. """
  106. if not os.path.exists(output_dir):
  107. os.makedirs(output_dir)
  108. best_model = models[best_model_name]
  109. output_path = os.path.join(output_dir, 'best_model.pkl')
  110. joblib.dump(best_model, output_path)
  111. print(f"最佳模型已保存至: {output_path}")
  112. def extract_and_retrain_model(model_path, X_train, y_train, X_test, y_test):
  113. """
  114. 从现有模型中提取参数,使用这些参数在训练集上重新训练模型,然后在测试集上评估
  115. @param {string} model_path - 模型文件路径
  116. @param {DataFrame} X_train - 训练特征数据
  117. @param {Series} y_train - 训练目标数据
  118. @param {DataFrame} X_test - 测试特征数据
  119. @param {Series} y_test - 测试目标数据
  120. @return {dict} - 包含原始模型和重训练模型评估结果的字典
  121. """
  122. try:
  123. # 加载原始模型
  124. original_model = joblib.load(model_path)
  125. model_name = os.path.basename(model_path)
  126. print(f"成功加载模型: {model_name}")
  127. # 提取模型参数
  128. params = original_model.get_params()
  129. print(f"提取的模型参数: {params}")
  130. # 使用原始模型直接在测试集上评估
  131. y_pred_original = original_model.predict(X_test)
  132. rmse_original = mean_squared_error(y_test, y_pred_original, squared=False)
  133. r2_original = r2_score(y_test, y_pred_original)
  134. # 使用提取的参数创建新模型并在训练集上训练
  135. new_model = RandomForestRegressor(**params)
  136. new_model.fit(X_train, y_train)
  137. # 在测试集上评估新训练的模型
  138. y_pred_new = new_model.predict(X_test)
  139. rmse_new = mean_squared_error(y_test, y_pred_new, squared=False)
  140. r2_new = r2_score(y_test, y_pred_new)
  141. # 返回结果
  142. results = {
  143. 'model_name': model_name,
  144. 'original': {
  145. 'rmse': rmse_original,
  146. 'r2': r2_original
  147. },
  148. 'retrained': {
  149. 'rmse': rmse_new,
  150. 'r2': r2_new,
  151. 'model': new_model
  152. },
  153. 'parameters': params
  154. }
  155. print(f"原始模型 {model_name} - RMSE: {rmse_original:.4f}, R²: {r2_original:.4f}")
  156. print(f"重训练模型 {model_name} - RMSE: {rmse_new:.4f}, R²: {r2_new:.4f}")
  157. return results
  158. except Exception as e:
  159. print(f"处理模型时出错: {str(e)}")
  160. return None
  161. def visualize_comparison(original_results, retrained_results):
  162. """
  163. 可视化原始模型和重训练模型的性能比较
  164. @param {dict} original_results - 原始模型的评估结果
  165. @param {dict} retrained_results - 重训练模型的评估结果
  166. """
  167. # 设置字体,使用通用字体
  168. plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial']
  169. plt.rcParams['axes.unicode_minus'] = False
  170. # 创建图形
  171. fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
  172. # 准备数据
  173. model_names = ['Original Model', 'Retrained Model']
  174. rmse_values = [original_results['rmse'], retrained_results['rmse']]
  175. r2_values = [original_results['r2'], retrained_results['r2']]
  176. # RMSE比较图
  177. ax1.bar(model_names, rmse_values, color=['blue', 'orange'])
  178. ax1.set_title('RMSE Comparison')
  179. ax1.set_ylabel('RMSE (Lower is better)')
  180. # 在柱状图上添加数值标签
  181. for i, v in enumerate(rmse_values):
  182. ax1.text(i, v + 0.01, f'{v:.4f}', ha='center')
  183. # R²比较图
  184. ax2.bar(model_names, r2_values, color=['blue', 'orange'])
  185. ax2.set_title('R² Comparison')
  186. ax2.set_ylabel('R² (Higher is better)')
  187. # 在柱状图上添加数值标签
  188. for i, v in enumerate(r2_values):
  189. ax2.text(i, v + 0.01, f'{v:.4f}', ha='center')
  190. plt.tight_layout()
  191. # 保存图表
  192. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  193. plt.savefig(f'model_optimize/results/model_retrain_comparison_{timestamp}.png', dpi=300)
  194. plt.show()
  195. if __name__ == "__main__":
  196. # 加载数据
  197. data = pd.read_excel('model_optimize/data/Acidity_reduce_new.xlsx')
  198. X = data.iloc[:, 1:]
  199. y = data.iloc[:, 0]
  200. X.columns = ['pH', 'OM', 'CL', 'H', 'Al']
  201. y.name = 'target'
  202. # 划分训练集和测试集
  203. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  204. # 确保结果目录存在
  205. if not os.path.exists('model_optimize/results'):
  206. os.makedirs('model_optimize/results')
  207. # # 第一部分:评估特定模型并重新训练
  208. # specific_model_path = r'pkl\rf_model_0308_1619.pkl'
  209. # if os.path.exists(specific_model_path):
  210. # print("\n===== 特定模型参数提取与重训练评估 =====")
  211. # # 提取参数并重新训练
  212. # results = extract_and_retrain_model(specific_model_path, X_train, y_train, X_test, y_test)
  213. # if results:
  214. # # 可视化比较结果
  215. # visualize_comparison(results['original'], results['retrained'])
  216. # # 保存重训练的模型
  217. # retrained_model = results['retrained']['model']
  218. # output_dir = 'model_optimize/retrained_models'
  219. # if not os.path.exists(output_dir):
  220. # os.makedirs(output_dir)
  221. # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  222. # output_path = os.path.join(output_dir, f'retrained_model_{timestamp}.pkl')
  223. # joblib.dump(retrained_model, output_path)
  224. # print(f"重训练模型已保存至: {output_path}")
  225. # # 保存模型参数到文本文件
  226. # params_output = os.path.join(output_dir, f'model_parameters_{timestamp}.txt')
  227. # with open(params_output, 'w') as f:
  228. # for param, value in results['parameters'].items():
  229. # f.write(f"{param}: {value}\n")
  230. # print(f"模型参数已保存至: {params_output}")
  231. # else:
  232. # print(f"指定的模型文件不存在: {specific_model_path}")
  233. # 第二部分:原有的模型比较代码
  234. print("\n===== 所有模型性能比较 =====")
  235. # 加载所有模型
  236. models = load_models('model_optimize/pkl')
  237. if models:
  238. # 评估模型
  239. results_df = evaluate_models(models, X, y)
  240. # 保存评估结果
  241. # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  242. # results_df.to_csv(f'model_optimize/results/model_comparison_{timestamp}.csv', index=False)
  243. # 选择最佳模型 (基于R²)
  244. best_model_r2 = select_best_model(results_df, metric='r2', higher_better=True)
  245. # 选择最佳模型 (基于RMSE)
  246. best_model_rmse = select_best_model(results_df, metric='rmse', higher_better=False)
  247. print(f"基于R²的最佳模型: {best_model_r2}")
  248. print(f"基于RMSE的最佳模型: {best_model_rmse}")
  249. # 可视化结果
  250. visualize_results(results_df)
  251. # 保存最佳模型 (这里使用R²作为选择标准)
  252. # save_best_model(models, best_model_r2, 'model_optimize/best_model')
  253. else:
  254. print("没有找到可用的模型")