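"""Dataset loading utilities for deepeval-based evaluation.

Loads question/answer pairs from a CSV file into a deepeval
EvaluationDataset, with column validation, cleaning, and logging.
"""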
import logging
from pathlib import Path
from typing import Optional

import pandas as pd
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

logger = logging.getLogger(__name__)

class DatasetLoader:
    """Handle loading and processing of evaluation datasets."""

    def __init__(self):
        self.dataset = EvaluationDataset()

    def load_from_csv(self,
                      file_path: str,
                      input_col: str = "input",
                      output_col: str = "expected_output",
                      context_col: Optional[str] = None) -> EvaluationDataset:
        """
        Load a dataset from a CSV file with comprehensive logging.

        Args:
            file_path: Path to the CSV file
            input_col: Column name for input questions
            output_col: Column name for expected outputs
            context_col: Optional column name for context

        Returns:
            EvaluationDataset: The loaded dataset
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                logger.error(f"Dataset file not found: {file_path}")
                raise FileNotFoundError(f"Dataset file not found: {file_path}")

            logger.info(f"Loading dataset from: {file_path}")

            # Read CSV file
            df = pd.read_csv(file_path)
            logger.info(f"CSV file loaded successfully. Shape: {df.shape}")

            # Validate required columns
            required_cols = [input_col, output_col]
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                logger.error(f"Missing required columns: {missing_cols}")
                logger.error(f"Available columns: {list(df.columns)}")
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Log column information
            logger.info(f"Dataset columns: {list(df.columns)}")
            logger.info(f"Input column: {input_col}")
            logger.info(f"Output column: {output_col}")
            if context_col:
                logger.info(f"Context column: {context_col}")

            # Clean and validate data
            df = self._clean_data(df, input_col, output_col)

            # Build test cases from the cleaned frame rather than re-reading
            # the raw file, so rows dropped during cleaning are excluded and
            # the optional context column is actually used
            for _, row in df.iterrows():
                context = None
                if context_col and pd.notna(row.get(context_col)):
                    context = [str(row[context_col])]
                self.dataset.add_test_case(
                    LLMTestCase(
                        input=str(row[input_col]),
                        actual_output=str(row[output_col]),
                        context=context,
                    )
                )
            logger.info(f"Successfully loaded {len(self.dataset.test_cases)} test cases")

            # Log sample data
            self._log_sample_data(df, input_col, output_col)

            return self.dataset
        except Exception as e:
            logger.error(f"Error loading dataset: {e}")
            raise

    def _clean_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> pd.DataFrame:
        """Clean and validate the dataset."""
        logger.info("Cleaning dataset...")
        initial_count = len(df)

        # Remove rows with missing values in required columns
        df = df.dropna(subset=[input_col, output_col])

        # Remove rows that are empty after stripping whitespace; cast to str
        # first so non-string columns don't break the .str accessor
        df = df[df[input_col].astype(str).str.strip() != '']
        df = df[df[output_col].astype(str).str.strip() != '']

        final_count = len(df)
        removed_count = initial_count - final_count
        if removed_count > 0:
            logger.warning(f"Removed {removed_count} invalid rows during cleaning")

        logger.info(f"Dataset cleaned. Final count: {final_count} rows")
        return df

    def _log_sample_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> None:
        """Log sample data for verification."""
        logger.info("Sample data from dataset:")
        # Enumerate explicitly: the DataFrame index may be non-contiguous
        # after cleaning, so it cannot serve as a sample counter
        for i, (_, row) in enumerate(df.head(3).iterrows(), start=1):
            logger.info(f"Sample {i}:")
            logger.info(f"  Input: {str(row[input_col])[:100]}...")
            logger.info(f"  Output: {str(row[output_col])[:100]}...")

    def get_dataset_stats(self) -> dict:
        """Get dataset statistics."""
        if not self.dataset.test_cases:
            return {"total_cases": 0}

        total = len(self.dataset.test_cases)
        stats = {
            "total_cases": total,
            "avg_input_length": sum(len(case.input) for case in self.dataset.test_cases) / total,
            "avg_output_length": sum(len(case.actual_output or "") for case in self.dataset.test_cases) / total,
        }
        logger.info(f"Dataset statistics: {stats}")
        return stats
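

# A minimal usage sketch, assuming a CSV at "data/eval_dataset.csv" with
# "input" and "expected_output" columns; the path and schema here are
# illustrative, not part of the module above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    loader = DatasetLoader()
    dataset = loader.load_from_csv(
        "data/eval_dataset.csv",
        input_col="input",
        output_col="expected_output",
    )
    print(f"Loaded {len(dataset.test_cases)} test cases")
    print(loader.get_dataset_stats())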