Spaces:

srishtichugh
/

OpenEnv_hack

Running

App Files Files Community

OpenEnv_hack / models.py

srishtichugh

add ui

40fcf49 2 days ago

raw

history blame contribute delete

2.89 kB

	from typing import Any, Dict, List, Optional
	from pydantic import BaseModel


	class DataCleaningAction(BaseModel):
	    """One cleaning step to apply to the current dirty DataFrame.

	    Supported values for ``operation``:

	    * ``fill_missing``    - fill NaN values in a column
	    * ``drop_duplicates`` - remove duplicate rows
	    * ``fix_format``      - standardise string formats (phone, date, country)
	    * ``replace_value``   - replace a specific value with another
	    * ``drop_outliers``   - remove rows where column value is a statistical outlier
	    * ``fix_dtype``       - cast a column to the correct dtype
	    * ``align_schema``    - rename / reorder columns to match target schema (Task 4)
	    * ``merge_sources``   - merge the two aligned source DataFrames (Task 4)
	    """

	    # Operation name. Typed as a plain ``str``, so this model does not itself
	    # reject names outside the list above.
	    operation: str

	    # Column the operation targets. Optional; presumably left as None for
	    # whole-frame operations such as drop_duplicates - confirm against the handler.
	    column: Optional[str] = None

	    # Extra operation-specific arguments. Pydantic copies field defaults per
	    # instance, so the literal ``{}`` is not shared between actions.
	    params: Dict[str, Any] = {}


	class DataQualityMetrics(BaseModel):
	    """Standard data-quality (DQ) dimensions, filled in by /profile and carried in every observation."""

	    # Percentage of non-null cells across the whole DataFrame.
	    completeness_pct: float

	    # Percentage of rows that are not duplicates.
	    uniqueness_pct: float

	    # Percentage of cells passing format / dtype / range constraints.
	    validity_pct: float

	    # Raw counts; presumably the inputs behind the percentages above - confirm
	    # against the code that computes them.
	    total_cells: int
	    null_cells: int
	    duplicate_rows: int

	    # Format violations + dtype issues + out-of-range values.
	    invalid_cells: int


	class DataCleaningObservation(BaseModel):
	    """Observation payload: episode status, a view of the current data, and agent guidance."""

	    done: bool
	    reward: float

	    # First 10 rows as a CSV string.
	    data_preview: str

	    # Two-element list: [rows, cols].
	    data_shape: List[int]

	    # Presumably column name -> number of missing values - confirm with producer.
	    missing_counts: Dict[str, int]
	    duplicate_count: int

	    # Presumably column name -> description of the dtype problem - confirm with producer.
	    dtype_issues: Dict[str, str]

	    task_description: str
	    message: str
	    step_count: int

	    # Running grader score, 0.0-1.0.
	    current_score: float

	    # --- Phase 2 additions ---

	    # Live data quality vitals.
	    dq_metrics: DataQualityMetrics

	    # Operations attempted so far, e.g. ["fill_missing:age", "drop_duplicates"].
	    tried_operations: List[str]

	    # Agent-facing recommended next 1-3 actions.
	    plan: List[str]


	class DataCleaningState(BaseModel):
	    """Bookkeeping for one cleaning episode: identifiers, step budget and error counts."""

	    # Identifiers for the episode and the task it runs.
	    episode_id: str
	    task_id: int

	    # Steps used so far and the step ceiling.
	    step_count: int
	    max_steps: int

	    # Presumably errors present at episode start vs. still unresolved -
	    # confirm against the environment that fills these in.
	    total_errors: int
	    errors_remaining: int


	class EpisodeReport(BaseModel):
	    """Full summary of a cleaning episode, as returned by GET /report."""

	    episode_id: str
	    task_id: int
	    task_name: str

	    # Scores at the start and end of the episode. score_improvement is
	    # presumably final_score - initial_score - confirm with the report builder.
	    initial_score: float
	    final_score: float
	    score_improvement: float

	    steps_taken: int
	    max_steps: int

	    # How few steps were used versus the maximum (higher = better).
	    step_efficiency_pct: float

	    # Ordered list of what was done.
	    operations_applied: List[str]

	    # Counts per fix type, e.g. {"nulls_filled": 40, "dupes_removed": 15}.
	    issues_fixed: Dict[str, int]

	    final_dq_metrics: DataQualityMetrics

	    # True if score >= 0.95.
	    completed: bool