
Merge branch 'master' of http://139.9.51.218:3000/Ding/AcidificationModel

# Conflicts:
#	api/SoilAcidification.db
DIng 2 months ago
parent commit
3bafd3e3ec

+ 2 - 1
api/.gitignore

@@ -1,4 +1,5 @@
 app/__pycache__
 .idea
 model_optimize/__pycache__
-__pycache__
+__pycache__
+.vscode

BIN
api/SoilAcidification.db


BIN
api/__pycache__/run.cpython-38.pyc


+ 116 - 0
api/app/data_cleaner.py

@@ -0,0 +1,116 @@
+"""
+Data cleaning module providing various data cleaning and preprocessing utilities
+"""
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+import logging
+
+logger = logging.getLogger(__name__)
+
+def remove_duplicates(df):
+    """
+    Remove duplicate rows from a data frame
+    
+    Args:
+        df: input data frame
+        
+    Returns:
+        tuple: (cleaned data frame, number of duplicates removed)
+    """
+    original_count = len(df)
+    df_clean = df.drop_duplicates()
+    duplicates_removed = original_count - len(df_clean)
+    logger.info(f"移除了 {duplicates_removed} 个重复样本")
+    return df_clean, duplicates_removed
+
+def remove_outliers(df, method='iqr', threshold=1.5):
+    """
+    Detect and remove outliers using the specified method
+    
+    Args:
+        df: input data frame
+        method: outlier detection method ('iqr', 'zscore')
+        threshold: threshold for flagging outliers
+        
+    Returns:
+        tuple: (cleaned data frame, number of outliers removed)
+    """
+    original_count = len(df)
+    
+    if method == 'iqr':
+        Q1 = df.quantile(0.25)
+        Q3 = df.quantile(0.75)
+        IQR = Q3 - Q1
+        outlier_mask = ~((df < (Q1 - threshold * IQR)) | (df > (Q3 + threshold * IQR))).any(axis=1)
+        df_clean = df[outlier_mask]
+    
+    elif method == 'zscore':
+        from scipy import stats
+        z_scores = stats.zscore(df)
+        outlier_mask = ~(np.abs(z_scores) > threshold).any(axis=1)
+        df_clean = df[outlier_mask]
+    
+    outliers_removed = original_count - len(df_clean)
+    logger.info(f"使用 {method} 方法移除了 {outliers_removed} 个异常值")
+    return df_clean, outliers_removed
+
+def clean_dataset(df, target_column=None, remove_dups=False, handle_outliers=False, 
+                 outlier_method='iqr', outlier_threshold=1.5, normalize=False):
+    """
+    Comprehensive data cleaning function
+    
+    Args:
+        df: input data frame
+        target_column: column name or index of the target variable
+        remove_dups: whether to remove duplicates
+        handle_outliers: whether to handle outliers
+        outlier_method: outlier detection method
+        outlier_threshold: threshold for flagging outliers
+        normalize: whether to standardize the features
+        
+    Returns:
+        tuple: (feature data frame, target variable, cleaning statistics)
+    """
+    stats = {'original_count': len(df)}
+    
+    # Separate features and target variable
+    if target_column is not None:
+        if isinstance(target_column, str):
+            X = df.drop(columns=[target_column])
+            y = df[target_column]
+        else:
+            X = df.drop(df.columns[target_column], axis=1)
+            y = df.iloc[:, target_column]
+    else:
+        X = df
+        y = None
+    
+    # Remove duplicates
+    if remove_dups:
+        if y is not None:
+            combined = pd.concat([X, y], axis=1)
+            combined, stats['duplicates_removed'] = remove_duplicates(combined)
+            X = combined.iloc[:, :-1] if isinstance(target_column, int) else combined.drop(columns=[target_column])
+            y = combined.iloc[:, -1] if isinstance(target_column, int) else combined[target_column]
+        else:
+            X, stats['duplicates_removed'] = remove_duplicates(X)
+    
+    # Handle outliers
+    if handle_outliers:
+        if y is not None:
+            combined = pd.concat([X, y], axis=1)
+            combined, stats['outliers_removed'] = remove_outliers(combined, method=outlier_method, threshold=outlier_threshold)
+            X = combined.iloc[:, :-1] if isinstance(target_column, int) else combined.drop(columns=[target_column])
+            y = combined.iloc[:, -1] if isinstance(target_column, int) else combined[target_column]
+        else:
+            X, stats['outliers_removed'] = remove_outliers(X, method=outlier_method, threshold=outlier_threshold)
+    
+    # Standardize features
+    if normalize:
+        scaler = StandardScaler()
+        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
+        stats['normalized'] = True
+    
+    stats['final_count'] = len(X)
+    return X, y, stats 
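
The cleaning pipeline above is not yet wired into training (the clean_dataset calls in model.py further down remain commented out). A minimal usage sketch, assuming an all-numeric DataFrame; the file path and the target column name 'Delta_pH' are purely illustrative:

import pandas as pd
from app.data_cleaner import clean_dataset

df = pd.read_excel('uploads/datasets/example.xlsx')  # hypothetical input file
X, y, stats = clean_dataset(
    df,
    target_column='Delta_pH',   # illustrative column name, not from the repository
    remove_dups=True,
    handle_outliers=True,
    outlier_method='iqr',
    outlier_threshold=1.5,
    normalize=False,
)
print(stats)  # {'original_count': ..., 'duplicates_removed': ..., 'outliers_removed': ..., 'final_count': ...}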

+ 6 - 1
api/app/database_models.py

@@ -31,7 +31,12 @@ class Models(Base):
     DatasetID: Mapped[Optional[int]] = mapped_column(Integer, ForeignKey('Datasets.Dataset_ID'))
     ModelFilePath: Mapped[Optional[str]] = mapped_column(Text)
     Data_type: Mapped[Optional[str]] = mapped_column(Text)
-    Performance_score: Mapped[Optional[float]] = mapped_column(Text)
+    Performance_score: Mapped[Optional[float]] = mapped_column(Float)
+
+    # Newly added scoring metric fields
+    MAE: Mapped[Optional[float]] = mapped_column(Float)
+    RMSE: Mapped[Optional[float]] = mapped_column(Float)
+    CV_score: Mapped[Optional[float]] = mapped_column(Float)
 
     ModelParameters: Mapped[List['ModelParameters']] = relationship('ModelParameters', back_populates='Models_')
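
With the new Float columns in place, the stored metrics can be read back per model. A short sketch (session construction elided; assumes the same SQLAlchemy session setup used elsewhere in the API):

from app.database_models import Models

def get_model_metrics(session, model_id):
    # Return the evaluation metrics stored on a Models row, or None if the model is missing
    model = session.query(Models).filter(Models.ModelID == model_id).first()
    if model is None:
        return None
    return {
        'r2': model.Performance_score,
        'mae': model.MAE,
        'rmse': model.RMSE,
        'cv_score': model.CV_score,
    }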
 

+ 138 - 16
api/app/model.py

@@ -4,13 +4,16 @@ import pickle
 import pandas as pd
 from flask_sqlalchemy.session import Session
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
-from sklearn.metrics import r2_score
+from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
 from sklearn.model_selection import train_test_split, cross_val_score
 from sqlalchemy import text
 from xgboost import XGBRegressor
+import logging
+import numpy as np
 
 from .database_models import Models, Datasets
 from .config import Config
+from .data_cleaner import clean_dataset
 
 
 # Load the model
@@ -31,8 +34,72 @@ def predict(session, input_data: pd.DataFrame, model_id):
     predictions = ML_model.predict(input_data)
     return predictions.tolist()
 
+def check_dataset_overlap_with_test(dataset_df, data_type):
+    """
+    Check whether a dataset overlaps with the test set
+    
+    Args:
+        dataset_df (DataFrame): dataset to check
+        data_type (str): dataset type ('reflux' or 'reduce')
+        
+    Returns:
+        tuple: (number of overlapping rows, indices of the overlapping rows)
+    """
+    # Load the test set
+    if data_type == 'reflux':
+        X_test = pd.read_csv('uploads/data/X_test_reflux.csv')
+        Y_test = pd.read_csv('uploads/data/Y_test_reflux.csv')
+    elif data_type == 'reduce':
+        X_test = pd.read_csv('uploads/data/X_test_reduce.csv')
+        Y_test = pd.read_csv('uploads/data/Y_test_reduce.csv')
+    else:
+        raise ValueError(f"不支持的数据类型: {data_type}")
+    
+    # Merge X_test and Y_test
+    if data_type == 'reflux':
+        test_df = pd.concat([X_test, Y_test], axis=1)
+    else:
+        test_df = pd.concat([X_test, Y_test], axis=1)
+    
+    # Determine the columns used for comparison
+    compare_columns = [col for col in dataset_df.columns if col in test_df.columns]
+    
+    if not compare_columns:
+        return 0, []
+    
+    # Find overlapping rows
+    merged = dataset_df[compare_columns].merge(test_df[compare_columns], how='inner', indicator=True)
+    overlapping_rows = merged[merged['_merge'] == 'both']
+    
+    # Get the indices of the overlapping rows in the original dataset
+    if not overlapping_rows.empty:
+        # Use the merged data to recover the original indices
+        overlap_indices = []
+        for _, row in overlapping_rows.iterrows():
+            # Build a boolean mask to locate matching rows in the original dataset
+            mask = True
+            for col in compare_columns:
+                mask = mask & (dataset_df[col] == row[col])
+            
+            # Get the indices of the matching rows
+            matching_indices = dataset_df[mask].index.tolist()
+            overlap_indices.extend(matching_indices)
+        
+        return len(set(overlap_indices)), list(set(overlap_indices))
+    
+    return 0, []
+
 # Compute the model score
 def calculate_model_score(model_info):
+    """
+    Compute model scores
+    
+    Args:
+        model_info: model information object
+        
+    Returns:
+        dict: dictionary containing several scoring metrics
+    """
     # Load the model
     with open(model_info.ModelFilePath, 'rb') as f:
         ML_model = pickle.load(f)
@@ -42,22 +109,55 @@ def calculate_model_score(model_info):
         # Load the saved X_test and Y_test
         X_test = pd.read_csv('uploads/data/X_test_reflux.csv')
         Y_test = pd.read_csv('uploads/data/Y_test_reflux.csv')
-        print(X_test.columns)  # column names of the data used at test time
+        
+        # Predict on the test set
         y_pred = ML_model.predict(X_test)
+        
+        # Compute the scoring metrics
+        r2 = r2_score(Y_test, y_pred)
+        mae = mean_absolute_error(Y_test, y_pred)
+        rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
+        
     elif model_info.Data_type == 'reduce':  # acidity reduction dataset
         # Load the saved X_test and Y_test
         X_test = pd.read_csv('uploads/data/X_test_reduce.csv')
         Y_test = pd.read_csv('uploads/data/Y_test_reduce.csv')
-        print(X_test.columns)  # column names of the data used at test time
+        
+        # Predict on the test set
         y_pred = ML_model.predict(X_test)
-
-
-    # Compute the R² score
-    r2 = r2_score(Y_test, y_pred)
-    return r2
+        
+        # Compute the scoring metrics
+        r2 = r2_score(Y_test, y_pred)
+        mae = mean_absolute_error(Y_test, y_pred)
+        rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
+        
+    else:
+        # Unsupported data type
+        return {'r2': 0, 'mae': 0, 'rmse': 0}
+    
+    # Return all scoring metrics (excluding the cross-validation score)
+    return {
+        'r2': float(r2),
+        'mae': float(mae),
+        'rmse': float(rmse)
+    }
 
 
 def train_and_save_model(session, model_type, model_name, model_description, data_type, dataset_id=None):
+    """
+    Train and save a model
+    
+    Args:
+        session: database session
+        model_type: model type
+        model_name: model name
+        model_description: model description
+        data_type: data type ('reflux' or 'reduce')
+        dataset_id: dataset ID
+        
+    Returns:
+        tuple: (model name, model ID, dataset ID, cross-validation score)
+    """
     try:
         if not dataset_id:
             # Create a new dataset and copy its data; this step is not committed immediately
@@ -79,26 +179,45 @@ def train_and_save_model(session, model_type, model_name, model_description, dat
             if dataset.empty:
                 raise ValueError(f"Dataset {dataset_id} is empty or not found.")
 
+        # Use the data cleaning module
         if data_type == 'reflux':
             X = dataset.iloc[:, 1:-1]
             y = dataset.iloc[:, -1]
+
+            # target_column = -1  # assume the target variable is in the last column
+            # X, y, clean_stats = clean_dataset(dataset, target_column=target_column)
         elif data_type == 'reduce':
             X = dataset.iloc[:, 2:]
             y = dataset.iloc[:, 1]
-
+            # target_column = 1  # assume the target variable is in the second column
+            # X, y, clean_stats = clean_dataset(dataset, target_column=target_column)
+        
+        # Log the cleaning statistics
+        # logging.info(f"Data cleaning statistics: {clean_stats}")
+        
         # Train the model
         model = train_model_by_type(X, y, model_type)
-
+        
+        # Compute the cross-validation score
+        cv_score = cross_val_score(model, X, y, cv=5).mean()
+        
         # Save the model to the database
         model_id = save_model(session, model, model_name, model_type, model_description, dataset_id, data_type)
-
+        
+        # Update the model's cross-validation score
+        model_info = session.query(Models).filter(Models.ModelID == model_id).first()
+        if model_info:
+            model_info.CV_score = float(cv_score)
+            session.commit()
+        
         # After all operations succeed, commit the transaction manually
         session.commit()
-        return model_name, model_id, dataset_id
+        return model_name, model_id, dataset_id, cv_score
+        
     except Exception as e:
-        # If an exception occurs at any stage, roll back the transaction
         session.rollback()
-        raise e  # optionally re-raise or handle the exception
+        logging.error(f"训练和保存模型时发生错误: {str(e)}", exc_info=True)
+        raise
 
 
 
@@ -152,8 +271,11 @@ def data_type_table_mapping(data_type):
 
 def train_model_by_type(X, y, model_type):
     # Split the dataset
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
+    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    
+    # Use all of the data as the training set
+    X_train, y_train = X, y
+    
     if model_type == 'RandomForest':
         # Hyperparameter optimization for the random forest
         return train_random_forest(X_train, y_train)
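
For reference, a condensed sketch of the evaluation now performed for either data type (the hold-out metrics mirror the dict returned by calculate_model_score; the commented line shows the 5-fold cross-validation score computed in train_and_save_model):

import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score

def evaluate_on_test(model, X_test, Y_test):
    # Hold-out metrics matching what calculate_model_score stores per model
    y_pred = model.predict(X_test)
    return {
        'r2': float(r2_score(Y_test, y_pred)),
        'mae': float(mean_absolute_error(Y_test, y_pred)),
        'rmse': float(np.sqrt(mean_squared_error(Y_test, y_pred))),
    }

# cv_score = cross_val_score(model, X, y, cv=5).mean()  # as in train_and_save_model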

+ 90 - 13
api/app/routes.py

@@ -6,7 +6,7 @@ from flask import Blueprint, request, jsonify, current_app, send_file, session
 from werkzeug.security import check_password_hash, generate_password_hash
 from werkzeug.utils import secure_filename, send_from_directory
 
-from .model import predict, train_and_save_model, calculate_model_score
+from .model import predict, train_and_save_model, calculate_model_score, check_dataset_overlap_with_test
 import pandas as pd
 from . import db  # import the db instance from the app package
 from sqlalchemy.engine.reflection import Inspector
@@ -138,6 +138,49 @@ def upload_dataset():
         dynamic_table_class = create_dynamic_table(new_dataset.Dataset_ID, column_types)
         insert_data_into_dynamic_table(session, dataset_df, dynamic_table_class)
 
+        # Remove duplicates within the uploaded dataset
+        original_count = len(dataset_df)
+        dataset_df = dataset_df.drop_duplicates()
+        duplicates_in_file = original_count - len(dataset_df)
+
+        # Check for duplicates against existing data
+        duplicates_with_existing = 0
+        if dataset_type in ['reduce', 'reflux']:
+            # Determine the table name
+            table_name = 'current_reduce' if dataset_type == 'reduce' else 'current_reflux'
+            
+            # Load existing data from the table
+            existing_data = pd.read_sql_table(table_name, session.bind)
+            if 'id' in existing_data.columns:
+                existing_data = existing_data.drop('id', axis=1)
+            
+            # Determine the columns used for comparison
+            compare_columns = [col for col in dataset_df.columns if col in existing_data.columns]
+            
+            # Count the duplicate rows
+            original_df_len = len(dataset_df)
+            
+            # Use concat and duplicated() to flag rows already present in the existing data
+            all_data = pd.concat([existing_data[compare_columns], dataset_df[compare_columns]])
+            duplicates_mask = all_data.duplicated(keep='first')
+            duplicates_with_existing = sum(duplicates_mask[len(existing_data):])
+            
+            # Keep the non-duplicate rows
+            dataset_df = dataset_df[~duplicates_mask[len(existing_data):].values]
+            
+            logger.info(f"原始数据: {original_df_len}, 与现有数据重复: {duplicates_with_existing}, 保留: {len(dataset_df)}")
+
+        # Check for overlap with the test set
+        test_overlap_count, test_overlap_indices = check_dataset_overlap_with_test(dataset_df, dataset_type)
+        
+        # If any rows overlap with the test set, remove them from the dataset
+        if test_overlap_count > 0:
+            # Build a boolean mask marking rows that are not among the overlapping indices
+            mask = ~dataset_df.index.isin(test_overlap_indices)
+            # Apply the mask so that only non-overlapping rows are kept
+            dataset_df = dataset_df[mask]
+            logger.warning(f"移除了 {test_overlap_count} 行与测试集重叠的数据")
+
         # Decide which existing table to insert into based on dataset_type
         if dataset_type == 'reduce':
             insert_data_into_existing_table(session, dataset_df, CurrentReduce)
@@ -150,15 +193,30 @@ def upload_dataset():
         training_triggered, task_id = check_and_trigger_training(session, dataset_type, dataset_df)
 
         response_data = {
-            'message': f'Dataset {dataset_name} uploaded successfully!',
+            'message': f'数据集 {dataset_name} 上传成功!',
             'dataset_id': new_dataset.Dataset_ID,
             'filename': unique_filename,
-            'training_triggered': training_triggered
+            'training_triggered': training_triggered,
+            'data_stats': {
+                'original_count': original_count,
+                'duplicates_in_file': duplicates_in_file,
+                'duplicates_with_existing': duplicates_with_existing,
+                'test_overlap_count': test_overlap_count,
+                'final_count': len(dataset_df)
+            }
         }
         
         if training_triggered:
             response_data['task_id'] = task_id
-            response_data['message'] += ' Auto-training has been triggered.'
+            response_data['message'] += ' 自动训练已触发。'
+
+        # Add deduplication info to the message
+        if duplicates_with_existing > 0:
+            response_data['message'] += f' 已移除 {duplicates_with_existing} 个与现有数据重复的项。'
+            
+        # Add test-set overlap info to the message
+        if test_overlap_count > 0:
+            response_data['message'] += f' 已移除 {test_overlap_count} 个与测试集重叠的项。'
 
         return jsonify(response_data), 201
 
@@ -197,11 +255,22 @@ def train_and_save_model_endpoint():
         if model_id:
             model_info = session.query(Models).filter(Models.ModelID == model_id).first()
             if model_info:
-                score = calculate_model_score(model_info)
+                # Compute multiple scoring metrics
+                score_metrics = calculate_model_score(model_info)
                 # Update the model score
-                model_info.Performance_score = score
+                model_info.Performance_score = score_metrics['r2']
+                # Store the new scoring metrics in the database
+                model_info.MAE = score_metrics['mae']
+                model_info.RMSE = score_metrics['rmse']
+                # CV_score is already set in train_and_save_model and is not updated here
                 session.commit()
-                result = {'model_id': model_id, 'model_score': score}
+                result = {
+                    'model_id': model_id, 
+                    'model_score': score_metrics['r2'],
+                    'mae': score_metrics['mae'],
+                    'rmse': score_metrics['rmse'],
+                    'cv_score': result[3]
+                }
 
         # Return a success response
         return jsonify({
@@ -263,7 +332,7 @@ def predict_route():
         return jsonify({'error': str(e)}), 400
 
 
-# Compute the Performance_score for a specified model; a model_id must be provided
+# Compute metric scores for a specified model; a model_id must be provided
 @bp.route('/score-model/<int:model_id>', methods=['POST'])
 def score_model(model_id):
     # Create a sessionmaker instance
@@ -275,15 +344,23 @@ def score_model(model_id):
             return jsonify({'error': 'Model not found'}), 404
 
         # Compute the model scores
-        score = calculate_model_score(model_info)
+        score_metrics = calculate_model_score(model_info)
+
+        # Update the scores on the model record (excluding the cross-validation score)
+        model_info.Performance_score = score_metrics['r2']
+        model_info.MAE = score_metrics['mae']
+        model_info.RMSE = score_metrics['rmse']
 
-        # Update the score on the model record
-        model_info.Performance_score = score
         session.commit()
 
-        return jsonify({'message': 'Model scored successfully', 'score': score}), 200
+        return jsonify({
+            'message': 'Model scored successfully', 
+            'r2_score': score_metrics['r2'],
+            'mae': score_metrics['mae'],
+            'rmse': score_metrics['rmse'],
+        }), 200
     except Exception as e:
-        logging.error('Failed to process the dataset upload:', exc_info=True)
+        logging.error('Failed to process model scoring:', exc_info=True)
         return jsonify({'error': str(e)}), 400
     finally:
         session.close()
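
A hedged example of exercising the updated scoring endpoint with the requests library (host, port, and any blueprint URL prefix are assumptions about the deployment; the model ID is illustrative):

import requests

# POST /score-model/<model_id> recomputes R², MAE and RMSE and stores them on the model record
resp = requests.post('http://localhost:5000/score-model/1')
print(resp.status_code)
print(resp.json())  # on success: 'message', 'r2_score', 'mae', 'rmse'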

BIN
api/model_optimize/data/Acidity_reduce_new - 1.xlsx


BIN
api/model_optimize/data/Acidity_reduce_new - 2.xlsx


BIN
api/model_optimize/data/Acidity_reduce_new - 3.xlsx


BIN
api/pkl/rf_model_0104_0932.pkl


BIN
api/pkl/rf_model_0107_0123.pkl


BIN
api/pkl/rf_model_0111_1755.pkl


BIN
api/pkl/rf_model_0308_1550.pkl


BIN
api/pkl/rf_model_0308_1619.pkl


BIN
api/pkl/rf_model_0308_1632.pkl


BIN
api/uploads/datasets/dataset_8.xlsx