| | """Script to generate reference statistics for drift detection.""" |
| |
|
| | import logging |
| | import argparse |
| | from pathlib import Path |
| | import pandas as pd |
| |
|
| | from data.data_loader import load_data |
| | from utils.text_processing import normalise_text |
| | from monitoring.data_drift import DataDriftDetector |
| |
|
# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO so the progress messages below are shown.
# basicConfig leaves an already-configured root logger untouched.
logging.basicConfig(level=logging.INFO)
| |
|
| |
|
def generate_reference_stats(
    data_path: str,
    output_path: str = "monitoring/reference_stats.json",
) -> None:
    """
    Generate reference statistics from training data.

    Loads the dataset, normalises the ``title`` column and, when present,
    the ``snippet`` column, builds a ``DataDriftDetector`` on the cleaned
    columns and asks it to save its reference statistics to ``output_path``.

    Args:
        data_path: Path to training data TSV file
        output_path: Path to save reference statistics. Missing parent
            directories are created.

    Raises:
        KeyError: If the loaded data has no ``title`` column.
    """
    logger.info("Loading data from %s", data_path)

    # load_data yields three values; only the first (the data frame) is
    # needed here.
    df, _, _ = load_data(data_path)

    # Missing text is replaced by "" before normalisation so that
    # normalise_text is never handed a NaN; title and snippet are treated
    # the same way.
    df['title_clean'] = df['title'].fillna("").apply(normalise_text)
    if 'snippet' in df.columns:
        df['snippet_clean'] = df['snippet'].fillna("").apply(normalise_text)
    else:
        # No snippet column: use an empty cleaned snippet for every row so
        # the detector always receives both columns.
        df['snippet_clean'] = ""

    detector = DataDriftDetector(reference_data=df[['title_clean', 'snippet_clean']])

    # Keep the Path in its own name instead of rebinding the str parameter.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    detector.save_reference_stats(str(destination))

    logger.info("Reference statistics saved to %s", destination)
    logger.info("Statistics computed for %d samples", len(df))
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: read the input and output locations from the
    # command line and hand them to generate_reference_stats.
    cli = argparse.ArgumentParser(description="Generate reference statistics")

    # The training data location is mandatory.
    cli.add_argument(
        "--data-path", type=str, required=True,
        help="Path to training data TSV file",
    )
    # The output location is optional and has a default.
    cli.add_argument(
        "--output", type=str, default="monitoring/reference_stats.json",
        help="Output path for reference statistics",
    )

    options = cli.parse_args()
    generate_reference_stats(data_path=options.data_path, output_path=options.output)
| |
|
| |
|
| |
|
| |
|
| |
|