learning_rate.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. from sklearn.model_selection import learning_curve
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. from sklearn.ensemble import RandomForestRegressor
  5. import pandas as pd
  6. from sklearn.model_selection import train_test_split
  7. ## 精准降酸数据 54个样本 5个特征 1/b
  8. data=pd.read_excel('model_optimize\data\Acidity_reduce.xlsx')
  9. x = data.iloc[:,1:]
  10. y = data.iloc[:,0]
  11. # 为 x 赋予列名
  12. x.columns = [
  13. 'pH',
  14. 'OM',
  15. 'CL',
  16. 'H',
  17. 'Al'
  18. ]
  19. y.name = 'target'
# ## Soil re-acidification screening data: 64 samples, 9 features (including delta_ph), 105_day_ph
# data=pd.read_excel('model_optimize\data\data_filt.xlsx')
# x = data.iloc[:,1:10]
# print(x)
# y = data.iloc[:,-1]
# print(y)
# # Assign column names to x
# x.columns = [
# 'organic_matter', # OM g/kg
# 'chloride', # CL g/kg
# 'cec', # CEC cmol/kg
# 'h_concentration', # H+ cmol/kg
# 'hn', # HN mg/kg
# 'al_concentration', # Al3+ cmol/kg
# 'free_alumina', # Free alumina g/kg
# 'free_iron', # Free iron oxides g/kg
# 'delta_ph' # ΔpH
# ]
# y.name = 'target_ph'
# ## Soil re-acidification screening data: 64 samples, 8 features, delta_ph
# # data=pd.read_excel('model_optimize\data\data_filt.xlsx') # 64 samples
# data=pd.read_excel('model_optimize\data\data_filt - 副本.xlsx') # 60 samples (outliers removed)
# x = data.iloc[:,1:9]
# print(x)
# y = data.iloc[:,-2]
# print(y)
# # Assign column names to x
# x.columns = [
# 'organic_matter', # OM g/kg
# 'chloride', # CL g/kg
# 'cec', # CEC cmol/kg
# 'h_concentration', # H+ cmol/kg
# 'hn', # HN mg/kg
# 'al_concentration', # Al3+ cmol/kg
# 'free_alumina', # Free alumina g/kg
# 'free_iron', # Free iron oxides g/kg
# ]
# y.name = 'target_ph'
  58. ## 数据集划分
  59. Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=42)
  60. # 模型:使用 RandomForestRegressor 举例
  61. rfc = RandomForestRegressor(random_state=1)
  62. # 计算学习曲线
  63. train_sizes, train_scores, test_scores = learning_curve(
  64. rfc, # 使用的模型
  65. Xtrain, # 训练特征
  66. Ytrain, # 训练目标
  67. cv=5, # 交叉验证折数
  68. n_jobs=-1, # 使用所有可用的CPU核心进行并行计算
  69. train_sizes=np.linspace(0.1, 1.0, 10) # 训练集大小,从10%到100%,共10个点
  70. )
  71. # 获取 test_scores(交叉验证测试集的得分)
  72. print("test_scores: \n", test_scores)
  73. print("train_scores: \n", train_scores)
  74. import matplotlib.pyplot as plt
  75. import numpy as np
  76. # 绘制学习曲线
  77. plt.figure(figsize=(8, 6))
  78. # 绘制训练误差和测试误差
  79. plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training score", color="r")
  80. plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Cross-validation score", color="g")
  81. # 绘制图形的细节
  82. plt.title("Learning Curve (Random Forest Regressor)")
  83. plt.xlabel("Training Size (%)")
  84. plt.ylabel("Score (R²)")
  85. plt.legend(loc="best")
  86. plt.grid(True)
  87. plt.show()