OpenEnv_hack / models.py
srishtichugh's picture
add ui
40fcf49
from typing import Any, Dict, List, Optional
from pydantic import BaseModel
class DataCleaningAction(BaseModel):
    """
    A single cleaning step to apply to the current dirty DataFrame.

    Supported values for ``operation``:
        fill_missing    – fill NaN values in a column
        drop_duplicates – remove duplicate rows
        fix_format      – standardise string formats (phone, date, country)
        replace_value   – replace a specific value with another
        drop_outliers   – remove rows where column value is a statistical outlier
        fix_dtype       – cast a column to the correct dtype
        align_schema    – rename / reorder columns to match target schema (Task 4)
        merge_sources   – merge the two aligned source DataFrames (Task 4)
    """

    # Name of the operation to run; the documented choices are listed above.
    # The field is a plain ``str``, so this model itself does not restrict it.
    operation: str

    # Column the operation targets; optional and defaults to None.
    column: Optional[str] = None

    # Extra operation-specific arguments; defaults to an empty dict.
    # NOTE(review): pydantic is expected to copy this mutable default for
    # each instance — confirm for the pydantic version pinned by the project.
    params: Dict[str, Any] = {}
class DataQualityMetrics(BaseModel):
    """Standard DQ dimensions — populated by /profile and embedded in every observation."""

    # % non-null cells across whole DataFrame
    completeness_pct: float

    # % rows that are not duplicates
    uniqueness_pct: float

    # % cells passing format / dtype / range constraints
    validity_pct: float

    # Raw counts accompanying the percentages above.
    total_cells: int
    null_cells: int
    duplicate_rows: int

    # format violations + dtype issues + out-of-range values
    invalid_cells: int
class DataCleaningObservation(BaseModel):
    """Observation payload describing the data-cleaning environment's current state."""

    # Episode-termination flag and reward value carried by this observation.
    done: bool
    reward: float

    # First 10 rows as CSV string
    data_preview: str

    # [rows, cols]
    data_shape: List[int]

    # NOTE(review): presumably keyed by column name — confirm against producer.
    missing_counts: Dict[str, int]

    duplicate_count: int

    # NOTE(review): presumably column name -> description of the dtype
    # problem — confirm against producer.
    dtype_issues: Dict[str, str]

    task_description: str
    message: str
    step_count: int

    # Running grader score 0.0-1.0
    current_score: float

    # --- Phase 2 additions ---

    # Live data quality vitals
    dq_metrics: DataQualityMetrics

    # e.g. ["fill_missing:age", "drop_duplicates"]
    tried_operations: List[str]

    # Agent-facing recommended next 1-3 actions
    plan: List[str]
class DataCleaningState(BaseModel):
    """Episode-level state: identifiers, step counters and error counters."""

    # Identifies the episode and the task it belongs to.
    episode_id: str
    task_id: int

    # Steps counted so far and the maximum number of steps.
    step_count: int
    max_steps: int

    # NOTE(review): presumably the number of data errors in the task and how
    # many are still unfixed — confirm against the environment code.
    total_errors: int
    errors_remaining: int
class EpisodeReport(BaseModel):
    """Returned by GET /report — full cleaning episode summary."""

    # Episode / task identification.
    episode_id: str
    task_id: int
    task_name: str

    # Score values reported for the episode.
    initial_score: float
    final_score: float
    score_improvement: float

    # Step usage.
    steps_taken: int
    max_steps: int

    # How few steps used vs max (higher = better)
    step_efficiency_pct: float

    # Ordered list of what was done
    operations_applied: List[str]

    # e.g. {"nulls_filled": 40, "dupes_removed": 15}
    issues_fixed: Dict[str, int]

    final_dq_metrics: DataQualityMetrics

    # True if score >= 0.95
    completed: bool