Spaces:
Running
Running
| from typing import Any, Dict, List, Optional | |
| from pydantic import BaseModel | |
| class DataCleaningAction(BaseModel): | |
| """ | |
| Action to apply to the current dirty DataFrame. | |
| operation choices: | |
| fill_missing β fill NaN values in a column | |
| drop_duplicates β remove duplicate rows | |
| fix_format β standardise string formats (phone, date, country) | |
| replace_value β replace a specific value with another | |
| drop_outliers β remove rows where column value is a statistical outlier | |
| fix_dtype β cast a column to the correct dtype | |
| align_schema β rename / reorder columns to match target schema (Task 4) | |
| merge_sources β merge the two aligned source DataFrames (Task 4) | |
| """ | |
| operation: str | |
| column: Optional[str] = None | |
| params: Dict[str, Any] = {} | |
| class DataQualityMetrics(BaseModel): | |
| """Standard DQ dimensions β populated by /profile and embedded in every observation.""" | |
| completeness_pct: float # % non-null cells across whole DataFrame | |
| uniqueness_pct: float # % rows that are not duplicates | |
| validity_pct: float # % cells passing format / dtype / range constraints | |
| total_cells: int | |
| null_cells: int | |
| duplicate_rows: int | |
| invalid_cells: int # format violations + dtype issues + out-of-range values | |
| class DataCleaningObservation(BaseModel): | |
| done: bool | |
| reward: float | |
| data_preview: str # First 10 rows as CSV string | |
| data_shape: List[int] # [rows, cols] | |
| missing_counts: Dict[str, int] | |
| duplicate_count: int | |
| dtype_issues: Dict[str, str] | |
| task_description: str | |
| message: str | |
| step_count: int | |
| current_score: float # Running grader score 0.0-1.0 | |
| # --- Phase 2 additions --- | |
| dq_metrics: DataQualityMetrics # Live data quality vitals | |
| tried_operations: List[str] # e.g. ["fill_missing:age", "drop_duplicates"] | |
| plan: List[str] # Agent-facing recommended next 1-3 actions | |
| class DataCleaningState(BaseModel): | |
| episode_id: str | |
| task_id: int | |
| step_count: int | |
| max_steps: int | |
| total_errors: int | |
| errors_remaining: int | |
| class EpisodeReport(BaseModel): | |
| """Returned by GET /report β full cleaning episode summary.""" | |
| episode_id: str | |
| task_id: int | |
| task_name: str | |
| initial_score: float | |
| final_score: float | |
| score_improvement: float | |
| steps_taken: int | |
| max_steps: int | |
| step_efficiency_pct: float # How few steps used vs max (higher = better) | |
| operations_applied: List[str] # Ordered list of what was done | |
| issues_fixed: Dict[str, int] # e.g. {"nulls_filled": 40, "dupes_removed": 15} | |
| final_dq_metrics: DataQualityMetrics | |
| completed: bool # True if score >= 0.95 |