Spaces:
Sleeping
Sleeping
import logging
from pathlib import Path
from typing import List, Optional

import pandas as pd

from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

# Module-level logger per stdlib convention; configured by the application.
logger = logging.getLogger(__name__)
class DatasetLoader:
    """Handle loading and processing of evaluation datasets.

    Wraps a deepeval ``EvaluationDataset`` and populates it from CSV files,
    cleaning and validating rows before they become test cases.
    """

    def __init__(self):
        # Test cases accumulate here across successive load_from_csv calls.
        self.dataset = EvaluationDataset()

    def load_from_csv(self,
                      file_path: str,
                      input_col: str = "input",
                      output_col: str = "expected_output",
                      context_col: Optional[str] = None) -> EvaluationDataset:
        """
        Load dataset from CSV file with comprehensive logging.

        Args:
            file_path: Path to the CSV file
            input_col: Column name for input questions
            output_col: Column name for expected outputs
            context_col: Optional column name for context

        Returns:
            EvaluationDataset: Loaded dataset

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If a required column is missing from the CSV.
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                logger.error(f"Dataset file not found: {file_path}")
                raise FileNotFoundError(f"Dataset file not found: {file_path}")

            logger.info(f"Loading dataset from: {file_path}")

            # Read CSV file
            df = pd.read_csv(file_path)
            logger.info(f"CSV file loaded successfully. Shape: {df.shape}")

            # Validate required columns
            required_cols = [input_col, output_col]
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                logger.error(f"Missing required columns: {missing_cols}")
                logger.error(f"Available columns: {list(df.columns)}")
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Log column information
            logger.info(f"Dataset columns: {list(df.columns)}")
            logger.info(f"Input column: {input_col}")
            logger.info(f"Output column: {output_col}")
            if context_col:
                logger.info(f"Context column: {context_col}")

            # Clean and validate data
            df = self._clean_data(df, input_col, output_col)

            # BUG FIX: the original re-read the raw CSV via
            # add_test_cases_from_csv_file, silently discarding the cleaned
            # DataFrame (invalid rows re-entered the dataset) and ignoring
            # context_col. Build test cases from the cleaned rows instead.
            for _, row in df.iterrows():
                context = None
                if context_col and context_col in df.columns and pd.notna(row[context_col]):
                    # deepeval expects context as a list of strings.
                    context = [str(row[context_col])]
                self.dataset.add_test_case(
                    LLMTestCase(
                        input=str(row[input_col]),
                        actual_output=str(row[output_col]),
                        context=context,
                    )
                )

            logger.info(f"Successfully loaded {len(self.dataset.test_cases)} test cases")

            # Log sample data
            self._log_sample_data(df, input_col, output_col)
            return self.dataset

        except Exception as e:
            logger.error(f"Error loading dataset: {e}")
            raise

    def _clean_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> pd.DataFrame:
        """Drop rows whose input/output is missing or whitespace-only.

        Args:
            df: Raw DataFrame read from the CSV.
            input_col: Column holding inputs.
            output_col: Column holding expected outputs.

        Returns:
            pd.DataFrame: The cleaned DataFrame.
        """
        logger.info("Cleaning dataset...")
        initial_count = len(df)

        # Remove rows with missing values in required columns
        df = df.dropna(subset=[input_col, output_col])

        # Remove empty strings. astype(str) guards against non-string dtypes
        # (e.g. numeric columns), where .str accessors raise AttributeError.
        df = df[df[input_col].astype(str).str.strip() != '']
        df = df[df[output_col].astype(str).str.strip() != '']

        final_count = len(df)
        removed_count = initial_count - final_count
        if removed_count > 0:
            logger.warning(f"Removed {removed_count} invalid rows during cleaning")
        logger.info(f"Dataset cleaned. Final count: {final_count} rows")
        return df

    def _log_sample_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> None:
        """Log up to three sample rows for verification."""
        logger.info("Sample data from dataset:")
        # enumerate gives a positional 1-based counter; the original used the
        # DataFrame index label, which is wrong once cleaning has dropped rows.
        for pos, (_, row) in enumerate(df.head(3).iterrows(), start=1):
            logger.info(f"Sample {pos}:")
            # str() guards against non-string cell values before slicing.
            logger.info(f"  Input: {str(row[input_col])[:100]}...")
            logger.info(f"  Output: {str(row[output_col])[:100]}...")

    def get_dataset_stats(self) -> dict:
        """Return summary statistics for the loaded dataset.

        Returns:
            dict: total_cases, and (when non-empty) average input/output
            lengths in characters.
        """
        if not self.dataset.test_cases:
            return {"total_cases": 0}

        n = len(self.dataset.test_cases)
        stats = {
            "total_cases": n,
            "avg_input_length": sum(len(case.input) for case in self.dataset.test_cases) / n,
            # actual_output may be None on a test case; treat it as empty.
            "avg_output_length": sum(len(case.actual_output or "") for case in self.dataset.test_cases) / n
        }
        logger.info(f"Dataset statistics: {stats}")
        return stats