Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +13 -0
- README.md +203 -5
- TEAMMATE_BASELINE.md +80 -0
- __init__.py +6 -0
- app.py +3 -0
- client.py +34 -0
- data/basic_cleaning.json +112 -0
- data/full_pipeline.json +817 -0
- data/moderate_cleaning.json +364 -0
- env/__init__.py +4 -0
- env/actions.py +180 -0
- env/environment.py +399 -0
- env/graders.py +13 -0
- env/models.py +40 -0
- env/quality.py +68 -0
- env/rewards.py +14 -0
- inference.py +181 -0
- models.py +5 -0
- openenv.yaml +29 -0
- pyproject.toml +31 -0
- requirements.txt +5 -0
- server/__init__.py +6 -0
- server/app.py +122 -0
- server/environment.py +5 -0
- server/requirements.txt +5 -0
- test_env.py +151 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
EXPOSE 7860
|
| 11 |
+
|
| 12 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 13 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,208 @@
|
|
| 1 |
---
|
| 2 |
-
title: Data Cleaning
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Data Cleaning OpenEnv Environment
|
| 3 |
+
emoji: 🧹
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Data Cleaning OpenEnv Environment
|
| 15 |
+
|
| 16 |
+
## Overview
|
| 17 |
+
|
| 18 |
+
This repository contains a real-world OpenEnv benchmark for interactive tabular data cleaning. The agent operates on messy employee-style datasets and must resolve common data preparation issues step by step: missing values, duplicate rows, wrong dtypes, inconsistent categorical values, and derived feature creation.
|
| 19 |
+
|
| 20 |
+
The implementation uses plain Python data structures instead of pandas so it stays lightweight for the hackathon constraints, Docker validation, and Hugging Face Spaces deployment.
|
| 21 |
+
|
| 22 |
+
The repository now follows the standard OpenEnv layout closely:
|
| 23 |
+
|
| 24 |
+
```text
|
| 25 |
+
openenv-data-cleaning/
|
| 26 |
+
├── client.py
|
| 27 |
+
├── models.py
|
| 28 |
+
├── openenv.yaml
|
| 29 |
+
├── pyproject.toml
|
| 30 |
+
├── server/
|
| 31 |
+
│ ├── app.py
|
| 32 |
+
│ ├── environment.py
|
| 33 |
+
│ └── requirements.txt
|
| 34 |
+
└── outputs/
|
| 35 |
+
├── evals/
|
| 36 |
+
└── logs/
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Environment Summary
|
| 40 |
+
|
| 41 |
+
- Domain: tabular data cleaning and preparation
|
| 42 |
+
- Mode: simulation environment with `reset()`, `step()`, and `state()`
|
| 43 |
+
- API: FastAPI on port `7860`
|
| 44 |
+
- Tasks: `basic_cleaning`, `moderate_cleaning`, `full_pipeline`
|
| 45 |
+
- Difficulty curve: easy -> medium -> hard
|
| 46 |
+
|
| 47 |
+
## Action Space
|
| 48 |
+
|
| 49 |
+
| Action | Target | Required params | Validation rules |
|
| 50 |
+
| --- | --- | --- | --- |
|
| 51 |
+
| `fill_missing` | Specific column | `{"strategy": "mean" \| "median" \| "zero" \| "mode" \| "unknown"}` | Numeric columns allow `mean`, `median`, `zero`; categorical columns allow `mode`, `unknown`. |
|
| 52 |
+
| `drop_duplicates` | `__all__` | `{}` | Only valid when duplicate rows are still present. |
|
| 53 |
+
| `convert_dtype` | Specific column | `{"target_dtype": "int" \| "float" \| "str" \| "bool"}` | Target dtype must match the task configuration and values must be convertible. |
|
| 54 |
+
| `normalize_category` | Categorical column | `{}` | Only valid when case-only category inconsistencies remain. |
|
| 55 |
+
| `create_feature` | Registered feature name | `{"feature_name": "<name>"}` | Feature must be required by the task and its source column must already be clean enough to use. |
|
| 56 |
+
|
| 57 |
+
Invalid actions leave the dataset unchanged, emit `{"error": "invalid_action"}` in `info`, consume a step, and return reward `-0.05`.
|
| 58 |
+
|
| 59 |
+
## Observation and State Space
|
| 60 |
+
|
| 61 |
+
Every `reset()`, `step()`, and `state()` call returns the same typed observation payload:
|
| 62 |
+
|
| 63 |
+
| Field | Type | Description |
|
| 64 |
+
| --- | --- | --- |
|
| 65 |
+
| `data_preview` | `list[dict[str, Any]]` | First five rows of the current dataset |
|
| 66 |
+
| `columns` | `list[ColumnInfo]` | Per-column dtype, null count, and unique count |
|
| 67 |
+
| `pending_issues` | `list[Issue]` | Remaining fixable issues |
|
| 68 |
+
| `resolved_issues` | `list[Issue]` | Issues already credited as solved |
|
| 69 |
+
| `action_history` | `list[dict[str, Any]]` | Previous actions with reward and optional error |
|
| 70 |
+
| `quality_score` | `float` | Current quality score in `[0.0, 1.0]` |
|
| 71 |
+
| `steps_remaining` | `int` | Remaining episode budget |
|
| 72 |
+
| `total_rows` | `int` | Current number of rows |
|
| 73 |
+
| `total_issues_at_start` | `int` | Issues detected immediately after `reset()` |
|
| 74 |
+
|
| 75 |
+
## Tasks
|
| 76 |
+
|
| 77 |
+
| Task | Difficulty | Rows | Main issue profile |
|
| 78 |
+
| --- | --- | --- | --- |
|
| 79 |
+
| `basic_cleaning` | Easy | 20 | Missing `age`, missing `salary` |
|
| 80 |
+
| `moderate_cleaning` | Medium | 50 | Missing `age`, missing `salary`, missing `years_exp`, duplicate rows, wrong `salary` dtype |
|
| 81 |
+
| `full_pipeline` | Hard | 100 | Missing values, duplicate rows, wrong `salary` and `rating` dtypes, inconsistent `city`, inconsistent `department`, required `age_group` feature |
|
| 82 |
+
|
| 83 |
+
The hardest task includes explicit dependency chains such as fixing missing salary values before dtype conversion and cleaning source columns before feature creation.
|
| 84 |
+
|
| 85 |
+
## Reward and Grading
|
| 86 |
+
|
| 87 |
+
Step reward:
|
| 88 |
+
|
| 89 |
+
```text
|
| 90 |
+
reward = (new_quality - old_quality) + ordering_bonus - 0.01
|
| 91 |
+
ordering_bonus = 0.05 if dependencies were already satisfied else 0.0
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Dataset quality score combines:
|
| 95 |
+
|
| 96 |
+
- Completeness: 40%
|
| 97 |
+
- Uniqueness: 30%
|
| 98 |
+
- Consistency: 30%
|
| 99 |
+
|
| 100 |
+
Task grader:
|
| 101 |
+
|
| 102 |
+
```text
|
| 103 |
+
correctness = issues_fixed / total_issues
|
| 104 |
+
efficiency = max(0, 1 - steps_taken / (2 * total_issues))
|
| 105 |
+
penalty = wrong_actions * 0.05
|
| 106 |
+
score = 0.8 * correctness + 0.2 * efficiency - penalty
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
Grader scores are deterministic, clamped to `[0.0, 1.0]`, and rounded to two decimals.
|
| 110 |
+
|
| 111 |
+
## Setup
|
| 112 |
+
|
| 113 |
+
### Python and install
|
| 114 |
+
|
| 115 |
+
The project requires Python `3.10+`. Python `3.11` is recommended.
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
python3.11 -m venv .venv
|
| 119 |
+
source .venv/bin/activate
|
| 120 |
+
pip install -r requirements.txt
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Run local checks
|
| 124 |
+
|
| 125 |
+
```bash
|
| 126 |
+
python test_env.py
|
| 127 |
+
openenv validate .
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
### Run the FastAPI app
|
| 131 |
+
|
| 132 |
+
```bash
|
| 133 |
+
uv run server
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
Equivalent direct command:
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### Run the baseline inference script
|
| 143 |
+
|
| 144 |
+
The hackathon evaluator expects these environment variables:
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
export HF_TOKEN=...
|
| 148 |
+
export API_BASE_URL=...
|
| 149 |
+
export MODEL_NAME=...
|
| 150 |
+
python inference.py
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
The script uses the OpenAI Python client and emits the required `[START]`, `[STEP]`, and `[END]` structured logs.
|
| 154 |
+
|
| 155 |
+
### Docker
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
docker build -t data-cleaning-env .
|
| 159 |
+
docker run -p 7860:7860 data-cleaning-env
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
## API Surface
|
| 163 |
+
|
| 164 |
+
- `GET /`
|
| 165 |
+
- `GET /health`
|
| 166 |
+
- `GET /metadata`
|
| 167 |
+
- `GET /tasks`
|
| 168 |
+
- `GET /schema`
|
| 169 |
+
- `POST /reset`
|
| 170 |
+
- `POST /step`
|
| 171 |
+
- `GET /state`
|
| 172 |
+
- `POST /mcp`
|
| 173 |
+
|
| 174 |
+
## Baseline Scores
|
| 175 |
+
|
| 176 |
+
Deterministic scripted benchmark from `test_env.py`:
|
| 177 |
+
|
| 178 |
+
- `basic_cleaning`: `0.90`
|
| 179 |
+
- `moderate_cleaning`: `0.90`
|
| 180 |
+
- `full_pipeline`: `0.90`
|
| 181 |
+
|
| 182 |
+
Model-based baseline from `inference.py`:
|
| 183 |
+
|
| 184 |
+
- `basic_cleaning`: `0.90`
|
| 185 |
+
- `moderate_cleaning`: `0.41`
|
| 186 |
+
- `full_pipeline`: `0.20`
|
| 187 |
+
|
| 188 |
+
These scores were produced on April 8, 2026 using `MODEL_NAME=Qwen/Qwen2.5-72B-Instruct` through the configured Hugging Face router. The run completed and emitted the required structured logs, but the provider returned HTTP `402` after the early steps, so the medium and hard tasks were penalized by fallback `parse_error` actions. For a stronger final baseline, top up credits or switch `API_BASE_URL` / `MODEL_NAME` to a provider with available quota and rerun `python inference.py`.
|
| 189 |
+
|
| 190 |
+
## Deployment
|
| 191 |
+
|
| 192 |
+
### Hugging Face Spaces
|
| 193 |
+
|
| 194 |
+
Deploy this repo as a Docker Space tagged with OpenEnv. After deployment, verify:
|
| 195 |
+
|
| 196 |
+
- the Space root responds with HTTP `200`
|
| 197 |
+
- `POST /reset` works on the live Space
|
| 198 |
+
- `openenv validate <space-url>` passes runtime validation
|
| 199 |
+
|
| 200 |
+
Recommended deploy command:
|
| 201 |
+
|
| 202 |
+
```bash
|
| 203 |
+
openenv push --repo-id kaustubhg73/data-cleaning-openenv --exclude .openenv-upload-ignore
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
Space link:
|
| 207 |
+
|
| 208 |
+
- https://huggingface.co/spaces/kaustubhg73/data-cleaning-openenv
|
TEAMMATE_BASELINE.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Teammate Baseline Runbook
|
| 2 |
+
|
| 3 |
+
Use this if the current Hugging Face router quota is exhausted and you want to rerun the official baseline with a different token or provider.
|
| 4 |
+
|
| 5 |
+
## What is already done
|
| 6 |
+
|
| 7 |
+
- The OpenEnv environment is implemented and validated.
|
| 8 |
+
- The Hugging Face Space is live:
|
| 9 |
+
- `https://huggingface.co/spaces/kaustubhg73/data-cleaning-openenv`
|
| 10 |
+
- Local validation, Docker validation, and live runtime validation already passed.
|
| 11 |
+
|
| 12 |
+
## What you need
|
| 13 |
+
|
| 14 |
+
Set these environment variables before running `inference.py`:
|
| 15 |
+
|
| 16 |
+
```bash
|
| 17 |
+
export HF_TOKEN=...
|
| 18 |
+
export API_BASE_URL=...
|
| 19 |
+
export MODEL_NAME=...
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Important:
|
| 23 |
+
|
| 24 |
+
- `inference.py` uses the OpenAI Python client.
|
| 25 |
+
- In this repo, `HF_TOKEN` is the actual API key variable used by the client.
|
| 26 |
+
- A standard Hugging Face router configuration is:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 30 |
+
export MODEL_NAME="openai/gpt-oss-120b"
|
| 31 |
+
export HF_TOKEN="HF_TOKEN"
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Local setup
|
| 35 |
+
|
| 36 |
+
From the repo root:
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
python3.11 -m venv .venv311
|
| 40 |
+
source .venv311/bin/activate
|
| 41 |
+
pip install -r requirements.txt
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Run the baseline
|
| 45 |
+
|
| 46 |
+
```bash
|
| 47 |
+
python inference.py
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
The required log format is:
|
| 51 |
+
|
| 52 |
+
- `[START]`
|
| 53 |
+
- `[STEP]`
|
| 54 |
+
- `[END]`
|
| 55 |
+
|
| 56 |
+
Do not change the log format before submission.
|
| 57 |
+
|
| 58 |
+
## Expected follow-up after a successful run
|
| 59 |
+
|
| 60 |
+
Update the model-based baseline section in `README.md` with:
|
| 61 |
+
|
| 62 |
+
- the final scores for all three tasks
|
| 63 |
+
- the model name used
|
| 64 |
+
- a short note that the run completed successfully
|
| 65 |
+
|
| 66 |
+
## Optional validation checks
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
python test_env.py
|
| 70 |
+
openenv validate .
|
| 71 |
+
openenv validate --url https://kaustubhg73-data-cleaning-openenv.hf.space
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## If you need to redeploy
|
| 75 |
+
|
| 76 |
+
Use the exclude file so the local `OpenEnv/` tutorial folder is not uploaded:
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
openenv push --repo-id kaustubhg73/data-cleaning-openenv --exclude .openenv-upload-ignore
|
| 80 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv data cleaning environment package exports."""
|
| 2 |
+
|
| 3 |
+
from client import DataCleaningEnvClient
|
| 4 |
+
from models import Action, Observation
|
| 5 |
+
|
| 6 |
+
__all__ = ["Action", "Observation", "DataCleaningEnvClient"]
|
app.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.app import app, main
|
| 2 |
+
|
| 3 |
+
__all__ = ["app", "main"]
|
client.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Thin client helpers for local development and OpenEnv packaging."""
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataCleaningEnvClient:
|
| 9 |
+
"""Minimal HTTP client for smoke-testing the environment locally."""
|
| 10 |
+
|
| 11 |
+
def __init__(self, base_url: str = "http://localhost:7860"):
|
| 12 |
+
self.base_url = base_url.rstrip("/")
|
| 13 |
+
self._client = httpx.Client(base_url=self.base_url, timeout=30.0)
|
| 14 |
+
|
| 15 |
+
def close(self) -> None:
|
| 16 |
+
self._client.close()
|
| 17 |
+
|
| 18 |
+
def reset(self, task_name: str = "basic_cleaning") -> dict[str, Any]:
|
| 19 |
+
response = self._client.post("/reset", json={"task_name": task_name})
|
| 20 |
+
response.raise_for_status()
|
| 21 |
+
return response.json()
|
| 22 |
+
|
| 23 |
+
def step(self, payload: dict[str, Any]) -> dict[str, Any]:
|
| 24 |
+
response = self._client.post("/step", json=payload)
|
| 25 |
+
response.raise_for_status()
|
| 26 |
+
return response.json()
|
| 27 |
+
|
| 28 |
+
def state(self) -> dict[str, Any]:
|
| 29 |
+
response = self._client.get("/state")
|
| 30 |
+
response.raise_for_status()
|
| 31 |
+
return response.json()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
__all__ = ["DataCleaningEnvClient"]
|
data/basic_cleaning.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"task_name": "basic_cleaning",
|
| 3 |
+
"max_steps": 6,
|
| 4 |
+
"expected_dtypes": {
|
| 5 |
+
"age": "int",
|
| 6 |
+
"salary": "int",
|
| 7 |
+
"city": "str"
|
| 8 |
+
},
|
| 9 |
+
"required_features": [],
|
| 10 |
+
"dataset": [
|
| 11 |
+
{
|
| 12 |
+
"age": 25,
|
| 13 |
+
"salary": 50000,
|
| 14 |
+
"city": "Mumbai"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"age": null,
|
| 18 |
+
"salary": 60000,
|
| 19 |
+
"city": "Delhi"
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"age": 30,
|
| 23 |
+
"salary": null,
|
| 24 |
+
"city": "Mumbai"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"age": 22,
|
| 28 |
+
"salary": 45000,
|
| 29 |
+
"city": "Bangalore"
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"age": null,
|
| 33 |
+
"salary": 55000,
|
| 34 |
+
"city": "Delhi"
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"age": 28,
|
| 38 |
+
"salary": 70000,
|
| 39 |
+
"city": "Mumbai"
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"age": 35,
|
| 43 |
+
"salary": null,
|
| 44 |
+
"city": "Bangalore"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"age": null,
|
| 48 |
+
"salary": 48000,
|
| 49 |
+
"city": "Delhi"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"age": 26,
|
| 53 |
+
"salary": 52000,
|
| 54 |
+
"city": "Mumbai"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"age": 31,
|
| 58 |
+
"salary": null,
|
| 59 |
+
"city": "Bangalore"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"age": 29,
|
| 63 |
+
"salary": 62000,
|
| 64 |
+
"city": "Delhi"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"age": null,
|
| 68 |
+
"salary": 43000,
|
| 69 |
+
"city": "Mumbai"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"age": 24,
|
| 73 |
+
"salary": 51000,
|
| 74 |
+
"city": "Bangalore"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"age": 33,
|
| 78 |
+
"salary": null,
|
| 79 |
+
"city": "Delhi"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"age": 27,
|
| 83 |
+
"salary": 58000,
|
| 84 |
+
"city": "Mumbai"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"age": null,
|
| 88 |
+
"salary": 47000,
|
| 89 |
+
"city": "Bangalore"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"age": 32,
|
| 93 |
+
"salary": 65000,
|
| 94 |
+
"city": "Delhi"
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"age": 23,
|
| 98 |
+
"salary": null,
|
| 99 |
+
"city": "Mumbai"
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"age": 36,
|
| 103 |
+
"salary": 72000,
|
| 104 |
+
"city": "Bangalore"
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"age": 28,
|
| 108 |
+
"salary": 53000,
|
| 109 |
+
"city": "Delhi"
|
| 110 |
+
}
|
| 111 |
+
]
|
| 112 |
+
}
|
data/full_pipeline.json
ADDED
|
@@ -0,0 +1,817 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"task_name": "full_pipeline",
|
| 3 |
+
"max_steps": 15,
|
| 4 |
+
"expected_dtypes": {
|
| 5 |
+
"age": "int",
|
| 6 |
+
"salary": "int",
|
| 7 |
+
"city": "str",
|
| 8 |
+
"department": "str",
|
| 9 |
+
"years_exp": "int",
|
| 10 |
+
"rating": "float"
|
| 11 |
+
},
|
| 12 |
+
"required_features": [
|
| 13 |
+
"age_group"
|
| 14 |
+
],
|
| 15 |
+
"dataset": [
|
| 16 |
+
{
|
| 17 |
+
"age": 24,
|
| 18 |
+
"salary": "42000",
|
| 19 |
+
"city": "mumbai",
|
| 20 |
+
"department": "engineering",
|
| 21 |
+
"years_exp": 2,
|
| 22 |
+
"rating": "3.6"
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"age": 27,
|
| 26 |
+
"salary": "43600",
|
| 27 |
+
"city": "Delhi",
|
| 28 |
+
"department": "ENGINEERING",
|
| 29 |
+
"years_exp": 4,
|
| 30 |
+
"rating": "3.9"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"age": 30,
|
| 34 |
+
"salary": "not_available",
|
| 35 |
+
"city": "Bangalore",
|
| 36 |
+
"department": "Sales",
|
| 37 |
+
"years_exp": 6,
|
| 38 |
+
"rating": "4.2"
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"age": null,
|
| 42 |
+
"salary": "46800",
|
| 43 |
+
"city": "pune",
|
| 44 |
+
"department": "sales",
|
| 45 |
+
"years_exp": 8,
|
| 46 |
+
"rating": "4.5"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"age": 36,
|
| 50 |
+
"salary": "47400",
|
| 51 |
+
"city": "Chennai",
|
| 52 |
+
"department": "MARKETING",
|
| 53 |
+
"years_exp": 10,
|
| 54 |
+
"rating": "3.6"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"age": 39,
|
| 58 |
+
"salary": "49000",
|
| 59 |
+
"city": "Hyderabad",
|
| 60 |
+
"department": "Marketing",
|
| 61 |
+
"years_exp": null,
|
| 62 |
+
"rating": "3.9"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"age": 42,
|
| 66 |
+
"salary": "50600",
|
| 67 |
+
"city": "Mumbai",
|
| 68 |
+
"department": "finance",
|
| 69 |
+
"years_exp": 14,
|
| 70 |
+
"rating": "4.2"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"age": 24,
|
| 74 |
+
"salary": "52200",
|
| 75 |
+
"city": "delhi",
|
| 76 |
+
"department": "FINANCE",
|
| 77 |
+
"years_exp": 16,
|
| 78 |
+
"rating": null
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"age": 27,
|
| 82 |
+
"salary": "52800",
|
| 83 |
+
"city": "BANGALORE",
|
| 84 |
+
"department": "Support",
|
| 85 |
+
"years_exp": 18,
|
| 86 |
+
"rating": "3.6"
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"age": 30,
|
| 90 |
+
"salary": "54400",
|
| 91 |
+
"city": "Pune",
|
| 92 |
+
"department": "support",
|
| 93 |
+
"years_exp": 3,
|
| 94 |
+
"rating": "3.9"
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"age": 33,
|
| 98 |
+
"salary": "56000",
|
| 99 |
+
"city": "chennai",
|
| 100 |
+
"department": "OPERATIONS",
|
| 101 |
+
"years_exp": 5,
|
| 102 |
+
"rating": "4.2"
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"age": null,
|
| 106 |
+
"salary": "57600",
|
| 107 |
+
"city": "HYDERABAD",
|
| 108 |
+
"department": "Operations",
|
| 109 |
+
"years_exp": 7,
|
| 110 |
+
"rating": "4.5"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"age": 39,
|
| 114 |
+
"salary": "58200",
|
| 115 |
+
"city": "Mumbai",
|
| 116 |
+
"department": "engineering",
|
| 117 |
+
"years_exp": 9,
|
| 118 |
+
"rating": "3.6"
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"age": 42,
|
| 122 |
+
"salary": "59800",
|
| 123 |
+
"city": "delhi",
|
| 124 |
+
"department": "ENGINEERING",
|
| 125 |
+
"years_exp": 11,
|
| 126 |
+
"rating": "3.9"
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"age": 24,
|
| 130 |
+
"salary": "61400",
|
| 131 |
+
"city": "BANGALORE",
|
| 132 |
+
"department": "Sales",
|
| 133 |
+
"years_exp": null,
|
| 134 |
+
"rating": "4.2"
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"age": 27,
|
| 138 |
+
"salary": "63000",
|
| 139 |
+
"city": "Pune",
|
| 140 |
+
"department": "sales",
|
| 141 |
+
"years_exp": 15,
|
| 142 |
+
"rating": "4.5"
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"age": 30,
|
| 146 |
+
"salary": "not_available",
|
| 147 |
+
"city": "chennai",
|
| 148 |
+
"department": "MARKETING",
|
| 149 |
+
"years_exp": 17,
|
| 150 |
+
"rating": "3.6"
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"age": 33,
|
| 154 |
+
"salary": "65200",
|
| 155 |
+
"city": "HYDERABAD",
|
| 156 |
+
"department": "Marketing",
|
| 157 |
+
"years_exp": 2,
|
| 158 |
+
"rating": "3.9"
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"age": 36,
|
| 162 |
+
"salary": "66800",
|
| 163 |
+
"city": "Mumbai",
|
| 164 |
+
"department": "finance",
|
| 165 |
+
"years_exp": 4,
|
| 166 |
+
"rating": null
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"age": null,
|
| 170 |
+
"salary": "68400",
|
| 171 |
+
"city": "delhi",
|
| 172 |
+
"department": "FINANCE",
|
| 173 |
+
"years_exp": 6,
|
| 174 |
+
"rating": "4.5"
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"age": 42,
|
| 178 |
+
"salary": "69000",
|
| 179 |
+
"city": "BANGALORE",
|
| 180 |
+
"department": "Support",
|
| 181 |
+
"years_exp": 8,
|
| 182 |
+
"rating": "3.6"
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"age": 24,
|
| 186 |
+
"salary": "70600",
|
| 187 |
+
"city": "Pune",
|
| 188 |
+
"department": "support",
|
| 189 |
+
"years_exp": 10,
|
| 190 |
+
"rating": "3.9"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"age": 27,
|
| 194 |
+
"salary": "72200",
|
| 195 |
+
"city": "chennai",
|
| 196 |
+
"department": "OPERATIONS",
|
| 197 |
+
"years_exp": null,
|
| 198 |
+
"rating": "4.2"
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"age": 30,
|
| 202 |
+
"salary": "73800",
|
| 203 |
+
"city": "HYDERABAD",
|
| 204 |
+
"department": "Operations",
|
| 205 |
+
"years_exp": 14,
|
| 206 |
+
"rating": "4.5"
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"age": 33,
|
| 210 |
+
"salary": "not_available",
|
| 211 |
+
"city": "Mumbai",
|
| 212 |
+
"department": "engineering",
|
| 213 |
+
"years_exp": 16,
|
| 214 |
+
"rating": "3.6"
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"age": 36,
|
| 218 |
+
"salary": "76000",
|
| 219 |
+
"city": "delhi",
|
| 220 |
+
"department": "ENGINEERING",
|
| 221 |
+
"years_exp": 18,
|
| 222 |
+
"rating": "3.9"
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"age": 39,
|
| 226 |
+
"salary": "77600",
|
| 227 |
+
"city": "BANGALORE",
|
| 228 |
+
"department": "Sales",
|
| 229 |
+
"years_exp": 3,
|
| 230 |
+
"rating": "4.2"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"age": null,
|
| 234 |
+
"salary": "79200",
|
| 235 |
+
"city": "Pune",
|
| 236 |
+
"department": "sales",
|
| 237 |
+
"years_exp": 5,
|
| 238 |
+
"rating": "4.5"
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"age": 24,
|
| 242 |
+
"salary": "79800",
|
| 243 |
+
"city": "chennai",
|
| 244 |
+
"department": "MARKETING",
|
| 245 |
+
"years_exp": 7,
|
| 246 |
+
"rating": "3.6"
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"age": 27,
|
| 250 |
+
"salary": "81400",
|
| 251 |
+
"city": "HYDERABAD",
|
| 252 |
+
"department": "Marketing",
|
| 253 |
+
"years_exp": 9,
|
| 254 |
+
"rating": null
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"age": 31,
|
| 258 |
+
"salary": "83000",
|
| 259 |
+
"city": "Mumbai",
|
| 260 |
+
"department": "finance",
|
| 261 |
+
"years_exp": null,
|
| 262 |
+
"rating": "4.2"
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"age": 34,
|
| 266 |
+
"salary": "84600",
|
| 267 |
+
"city": "delhi",
|
| 268 |
+
"department": "FINANCE",
|
| 269 |
+
"years_exp": 13,
|
| 270 |
+
"rating": "4.5"
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"age": 37,
|
| 274 |
+
"salary": "85200",
|
| 275 |
+
"city": "BANGALORE",
|
| 276 |
+
"department": "Support",
|
| 277 |
+
"years_exp": 15,
|
| 278 |
+
"rating": "3.6"
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"age": 40,
|
| 282 |
+
"salary": "not_available",
|
| 283 |
+
"city": "Pune",
|
| 284 |
+
"department": "support",
|
| 285 |
+
"years_exp": 17,
|
| 286 |
+
"rating": "3.9"
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"age": 43,
|
| 290 |
+
"salary": "88400",
|
| 291 |
+
"city": "chennai",
|
| 292 |
+
"department": "OPERATIONS",
|
| 293 |
+
"years_exp": 2,
|
| 294 |
+
"rating": "4.2"
|
| 295 |
+
},
|
| 296 |
+
{
|
| 297 |
+
"age": null,
|
| 298 |
+
"salary": "90000",
|
| 299 |
+
"city": "HYDERABAD",
|
| 300 |
+
"department": "Operations",
|
| 301 |
+
"years_exp": 4,
|
| 302 |
+
"rating": "4.5"
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"age": 28,
|
| 306 |
+
"salary": "90600",
|
| 307 |
+
"city": "Mumbai",
|
| 308 |
+
"department": "engineering",
|
| 309 |
+
"years_exp": 6,
|
| 310 |
+
"rating": "3.6"
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"age": 31,
|
| 314 |
+
"salary": "92200",
|
| 315 |
+
"city": "delhi",
|
| 316 |
+
"department": "ENGINEERING",
|
| 317 |
+
"years_exp": 8,
|
| 318 |
+
"rating": "3.9"
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"age": 34,
|
| 322 |
+
"salary": "93800",
|
| 323 |
+
"city": "BANGALORE",
|
| 324 |
+
"department": "Sales",
|
| 325 |
+
"years_exp": null,
|
| 326 |
+
"rating": "4.2"
|
| 327 |
+
},
|
| 328 |
+
{
|
| 329 |
+
"age": 37,
|
| 330 |
+
"salary": "95400",
|
| 331 |
+
"city": "Pune",
|
| 332 |
+
"department": "sales",
|
| 333 |
+
"years_exp": 12,
|
| 334 |
+
"rating": "4.5"
|
| 335 |
+
},
|
| 336 |
+
{
|
| 337 |
+
"age": 40,
|
| 338 |
+
"salary": "96000",
|
| 339 |
+
"city": "chennai",
|
| 340 |
+
"department": "MARKETING",
|
| 341 |
+
"years_exp": 14,
|
| 342 |
+
"rating": null
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"age": 43,
|
| 346 |
+
"salary": "97600",
|
| 347 |
+
"city": "HYDERABAD",
|
| 348 |
+
"department": "Marketing",
|
| 349 |
+
"years_exp": 16,
|
| 350 |
+
"rating": "3.9"
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"age": 25,
|
| 354 |
+
"salary": "99200",
|
| 355 |
+
"city": "Mumbai",
|
| 356 |
+
"department": "finance",
|
| 357 |
+
"years_exp": 18,
|
| 358 |
+
"rating": "4.2"
|
| 359 |
+
},
|
| 360 |
+
{
|
| 361 |
+
"age": null,
|
| 362 |
+
"salary": "100800",
|
| 363 |
+
"city": "delhi",
|
| 364 |
+
"department": "FINANCE",
|
| 365 |
+
"years_exp": 3,
|
| 366 |
+
"rating": "4.5"
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"age": 31,
|
| 370 |
+
"salary": "101400",
|
| 371 |
+
"city": "BANGALORE",
|
| 372 |
+
"department": "Support",
|
| 373 |
+
"years_exp": 5,
|
| 374 |
+
"rating": "3.6"
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"age": 34,
|
| 378 |
+
"salary": "103000",
|
| 379 |
+
"city": "Pune",
|
| 380 |
+
"department": "support",
|
| 381 |
+
"years_exp": 7,
|
| 382 |
+
"rating": "3.9"
|
| 383 |
+
},
|
| 384 |
+
{
|
| 385 |
+
"age": 37,
|
| 386 |
+
"salary": "104600",
|
| 387 |
+
"city": "chennai",
|
| 388 |
+
"department": "OPERATIONS",
|
| 389 |
+
"years_exp": null,
|
| 390 |
+
"rating": "4.2"
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"age": 40,
|
| 394 |
+
"salary": "106200",
|
| 395 |
+
"city": "HYDERABAD",
|
| 396 |
+
"department": "Operations",
|
| 397 |
+
"years_exp": 11,
|
| 398 |
+
"rating": "4.5"
|
| 399 |
+
},
|
| 400 |
+
{
|
| 401 |
+
"age": 43,
|
| 402 |
+
"salary": "not_available",
|
| 403 |
+
"city": "Mumbai",
|
| 404 |
+
"department": "engineering",
|
| 405 |
+
"years_exp": 13,
|
| 406 |
+
"rating": "3.6"
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"age": 25,
|
| 410 |
+
"salary": "108400",
|
| 411 |
+
"city": "delhi",
|
| 412 |
+
"department": "ENGINEERING",
|
| 413 |
+
"years_exp": 15,
|
| 414 |
+
"rating": "3.9"
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"age": 28,
|
| 418 |
+
"salary": "110000",
|
| 419 |
+
"city": "BANGALORE",
|
| 420 |
+
"department": "Sales",
|
| 421 |
+
"years_exp": 17,
|
| 422 |
+
"rating": "4.2"
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"age": null,
|
| 426 |
+
"salary": "111600",
|
| 427 |
+
"city": "Pune",
|
| 428 |
+
"department": "sales",
|
| 429 |
+
"years_exp": 2,
|
| 430 |
+
"rating": "4.5"
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"age": 34,
|
| 434 |
+
"salary": "112200",
|
| 435 |
+
"city": "chennai",
|
| 436 |
+
"department": "MARKETING",
|
| 437 |
+
"years_exp": 4,
|
| 438 |
+
"rating": null
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"age": 37,
|
| 442 |
+
"salary": "113800",
|
| 443 |
+
"city": "HYDERABAD",
|
| 444 |
+
"department": "Marketing",
|
| 445 |
+
"years_exp": 6,
|
| 446 |
+
"rating": "3.9"
|
| 447 |
+
},
|
| 448 |
+
{
|
| 449 |
+
"age": 40,
|
| 450 |
+
"salary": "115400",
|
| 451 |
+
"city": "Mumbai",
|
| 452 |
+
"department": "finance",
|
| 453 |
+
"years_exp": null,
|
| 454 |
+
"rating": "4.2"
|
| 455 |
+
},
|
| 456 |
+
{
|
| 457 |
+
"age": 43,
|
| 458 |
+
"salary": "117000",
|
| 459 |
+
"city": "delhi",
|
| 460 |
+
"department": "FINANCE",
|
| 461 |
+
"years_exp": 10,
|
| 462 |
+
"rating": "4.5"
|
| 463 |
+
},
|
| 464 |
+
{
|
| 465 |
+
"age": 25,
|
| 466 |
+
"salary": "117600",
|
| 467 |
+
"city": "BANGALORE",
|
| 468 |
+
"department": "Support",
|
| 469 |
+
"years_exp": 12,
|
| 470 |
+
"rating": "3.6"
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"age": 28,
|
| 474 |
+
"salary": "not_available",
|
| 475 |
+
"city": "Pune",
|
| 476 |
+
"department": "support",
|
| 477 |
+
"years_exp": 14,
|
| 478 |
+
"rating": "3.9"
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"age": 31,
|
| 482 |
+
"salary": "120800",
|
| 483 |
+
"city": "chennai",
|
| 484 |
+
"department": "OPERATIONS",
|
| 485 |
+
"years_exp": 16,
|
| 486 |
+
"rating": "4.2"
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"age": null,
|
| 490 |
+
"salary": "122400",
|
| 491 |
+
"city": "HYDERABAD",
|
| 492 |
+
"department": "Operations",
|
| 493 |
+
"years_exp": 18,
|
| 494 |
+
"rating": "4.5"
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"age": 38,
|
| 498 |
+
"salary": "123000",
|
| 499 |
+
"city": "Mumbai",
|
| 500 |
+
"department": "engineering",
|
| 501 |
+
"years_exp": 3,
|
| 502 |
+
"rating": "3.6"
|
| 503 |
+
},
|
| 504 |
+
{
|
| 505 |
+
"age": 41,
|
| 506 |
+
"salary": "124600",
|
| 507 |
+
"city": "delhi",
|
| 508 |
+
"department": "ENGINEERING",
|
| 509 |
+
"years_exp": 5,
|
| 510 |
+
"rating": "3.9"
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"age": 44,
|
| 514 |
+
"salary": "126200",
|
| 515 |
+
"city": "BANGALORE",
|
| 516 |
+
"department": "Sales",
|
| 517 |
+
"years_exp": null,
|
| 518 |
+
"rating": "4.2"
|
| 519 |
+
},
|
| 520 |
+
{
|
| 521 |
+
"age": 26,
|
| 522 |
+
"salary": "127800",
|
| 523 |
+
"city": "Pune",
|
| 524 |
+
"department": "sales",
|
| 525 |
+
"years_exp": 9,
|
| 526 |
+
"rating": "4.5"
|
| 527 |
+
},
|
| 528 |
+
{
|
| 529 |
+
"age": 29,
|
| 530 |
+
"salary": "128400",
|
| 531 |
+
"city": "chennai",
|
| 532 |
+
"department": "MARKETING",
|
| 533 |
+
"years_exp": 11,
|
| 534 |
+
"rating": null
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"age": 32,
|
| 538 |
+
"salary": "130000",
|
| 539 |
+
"city": "HYDERABAD",
|
| 540 |
+
"department": "Marketing",
|
| 541 |
+
"years_exp": 13,
|
| 542 |
+
"rating": "3.9"
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"age": 35,
|
| 546 |
+
"salary": "131600",
|
| 547 |
+
"city": "Mumbai",
|
| 548 |
+
"department": "finance",
|
| 549 |
+
"years_exp": 15,
|
| 550 |
+
"rating": "4.2"
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"age": null,
|
| 554 |
+
"salary": "133200",
|
| 555 |
+
"city": "delhi",
|
| 556 |
+
"department": "FINANCE",
|
| 557 |
+
"years_exp": 17,
|
| 558 |
+
"rating": "4.5"
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"age": 41,
|
| 562 |
+
"salary": "133800",
|
| 563 |
+
"city": "BANGALORE",
|
| 564 |
+
"department": "Support",
|
| 565 |
+
"years_exp": 2,
|
| 566 |
+
"rating": "3.6"
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"age": 44,
|
| 570 |
+
"salary": "not_available",
|
| 571 |
+
"city": "Pune",
|
| 572 |
+
"department": "support",
|
| 573 |
+
"years_exp": 4,
|
| 574 |
+
"rating": "3.9"
|
| 575 |
+
},
|
| 576 |
+
{
|
| 577 |
+
"age": 26,
|
| 578 |
+
"salary": "137000",
|
| 579 |
+
"city": "chennai",
|
| 580 |
+
"department": "OPERATIONS",
|
| 581 |
+
"years_exp": null,
|
| 582 |
+
"rating": "4.2"
|
| 583 |
+
},
|
| 584 |
+
{
|
| 585 |
+
"age": 29,
|
| 586 |
+
"salary": "138600",
|
| 587 |
+
"city": "HYDERABAD",
|
| 588 |
+
"department": "Operations",
|
| 589 |
+
"years_exp": 8,
|
| 590 |
+
"rating": "4.5"
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"age": 32,
|
| 594 |
+
"salary": "139200",
|
| 595 |
+
"city": "Mumbai",
|
| 596 |
+
"department": "engineering",
|
| 597 |
+
"years_exp": 10,
|
| 598 |
+
"rating": "3.6"
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"age": 35,
|
| 602 |
+
"salary": "140800",
|
| 603 |
+
"city": "delhi",
|
| 604 |
+
"department": "ENGINEERING",
|
| 605 |
+
"years_exp": 12,
|
| 606 |
+
"rating": "3.9"
|
| 607 |
+
},
|
| 608 |
+
{
|
| 609 |
+
"age": 38,
|
| 610 |
+
"salary": "142400",
|
| 611 |
+
"city": "BANGALORE",
|
| 612 |
+
"department": "Sales",
|
| 613 |
+
"years_exp": 14,
|
| 614 |
+
"rating": "4.2"
|
| 615 |
+
},
|
| 616 |
+
{
|
| 617 |
+
"age": null,
|
| 618 |
+
"salary": "144000",
|
| 619 |
+
"city": "Pune",
|
| 620 |
+
"department": "sales",
|
| 621 |
+
"years_exp": 16,
|
| 622 |
+
"rating": "4.5"
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"age": 44,
|
| 626 |
+
"salary": "144600",
|
| 627 |
+
"city": "chennai",
|
| 628 |
+
"department": "MARKETING",
|
| 629 |
+
"years_exp": 18,
|
| 630 |
+
"rating": "3.6"
|
| 631 |
+
},
|
| 632 |
+
{
|
| 633 |
+
"age": 26,
|
| 634 |
+
"salary": "146200",
|
| 635 |
+
"city": "HYDERABAD",
|
| 636 |
+
"department": "Marketing",
|
| 637 |
+
"years_exp": 3,
|
| 638 |
+
"rating": "3.9"
|
| 639 |
+
},
|
| 640 |
+
{
|
| 641 |
+
"age": 29,
|
| 642 |
+
"salary": "147800",
|
| 643 |
+
"city": "Mumbai",
|
| 644 |
+
"department": "finance",
|
| 645 |
+
"years_exp": 5,
|
| 646 |
+
"rating": null
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"age": 32,
|
| 650 |
+
"salary": "149400",
|
| 651 |
+
"city": "delhi",
|
| 652 |
+
"department": "FINANCE",
|
| 653 |
+
"years_exp": 7,
|
| 654 |
+
"rating": "4.5"
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"age": 35,
|
| 658 |
+
"salary": "150000",
|
| 659 |
+
"city": "BANGALORE",
|
| 660 |
+
"department": "Support",
|
| 661 |
+
"years_exp": 9,
|
| 662 |
+
"rating": "3.6"
|
| 663 |
+
},
|
| 664 |
+
{
|
| 665 |
+
"age": 38,
|
| 666 |
+
"salary": "not_available",
|
| 667 |
+
"city": "Pune",
|
| 668 |
+
"department": "support",
|
| 669 |
+
"years_exp": 11,
|
| 670 |
+
"rating": "3.9"
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"age": 41,
|
| 674 |
+
"salary": "153200",
|
| 675 |
+
"city": "chennai",
|
| 676 |
+
"department": "OPERATIONS",
|
| 677 |
+
"years_exp": 13,
|
| 678 |
+
"rating": "4.2"
|
| 679 |
+
},
|
| 680 |
+
{
|
| 681 |
+
"age": null,
|
| 682 |
+
"salary": "154800",
|
| 683 |
+
"city": "HYDERABAD",
|
| 684 |
+
"department": "Operations",
|
| 685 |
+
"years_exp": 15,
|
| 686 |
+
"rating": "4.5"
|
| 687 |
+
},
|
| 688 |
+
{
|
| 689 |
+
"age": 26,
|
| 690 |
+
"salary": "155400",
|
| 691 |
+
"city": "Mumbai",
|
| 692 |
+
"department": "engineering",
|
| 693 |
+
"years_exp": 17,
|
| 694 |
+
"rating": "3.6"
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"age": 29,
|
| 698 |
+
"salary": "157000",
|
| 699 |
+
"city": "delhi",
|
| 700 |
+
"department": "ENGINEERING",
|
| 701 |
+
"years_exp": 2,
|
| 702 |
+
"rating": "3.9"
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"age": 32,
|
| 706 |
+
"salary": "158600",
|
| 707 |
+
"city": "BANGALORE",
|
| 708 |
+
"department": "Sales",
|
| 709 |
+
"years_exp": null,
|
| 710 |
+
"rating": "4.2"
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"age": 35,
|
| 714 |
+
"salary": "160200",
|
| 715 |
+
"city": "Pune",
|
| 716 |
+
"department": "sales",
|
| 717 |
+
"years_exp": 6,
|
| 718 |
+
"rating": "4.5"
|
| 719 |
+
},
|
| 720 |
+
{
|
| 721 |
+
"age": 38,
|
| 722 |
+
"salary": "160800",
|
| 723 |
+
"city": "chennai",
|
| 724 |
+
"department": "MARKETING",
|
| 725 |
+
"years_exp": 8,
|
| 726 |
+
"rating": "3.6"
|
| 727 |
+
},
|
| 728 |
+
{
|
| 729 |
+
"age": null,
|
| 730 |
+
"salary": "162400",
|
| 731 |
+
"city": "HYDERABAD",
|
| 732 |
+
"department": "Marketing",
|
| 733 |
+
"years_exp": 10,
|
| 734 |
+
"rating": "3.9"
|
| 735 |
+
},
|
| 736 |
+
{
|
| 737 |
+
"age": 45,
|
| 738 |
+
"salary": "164000",
|
| 739 |
+
"city": "Mumbai",
|
| 740 |
+
"department": "finance",
|
| 741 |
+
"years_exp": 12,
|
| 742 |
+
"rating": null
|
| 743 |
+
},
|
| 744 |
+
{
|
| 745 |
+
"age": 27,
|
| 746 |
+
"salary": "165600",
|
| 747 |
+
"city": "delhi",
|
| 748 |
+
"department": "FINANCE",
|
| 749 |
+
"years_exp": 14,
|
| 750 |
+
"rating": "4.5"
|
| 751 |
+
},
|
| 752 |
+
{
|
| 753 |
+
"age": 36,
|
| 754 |
+
"salary": "47400",
|
| 755 |
+
"city": "chennai",
|
| 756 |
+
"department": "MARKETING",
|
| 757 |
+
"years_exp": 10,
|
| 758 |
+
"rating": "3.6"
|
| 759 |
+
},
|
| 760 |
+
{
|
| 761 |
+
"age": 42,
|
| 762 |
+
"salary": "59800",
|
| 763 |
+
"city": "delhi",
|
| 764 |
+
"department": "ENGINEERING",
|
| 765 |
+
"years_exp": 11,
|
| 766 |
+
"rating": "3.9"
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"age": 24,
|
| 770 |
+
"salary": "70600",
|
| 771 |
+
"city": "Pune",
|
| 772 |
+
"department": "support",
|
| 773 |
+
"years_exp": 10,
|
| 774 |
+
"rating": "3.9"
|
| 775 |
+
},
|
| 776 |
+
{
|
| 777 |
+
"age": 34,
|
| 778 |
+
"salary": "84600",
|
| 779 |
+
"city": "delhi",
|
| 780 |
+
"department": "FINANCE",
|
| 781 |
+
"years_exp": 13,
|
| 782 |
+
"rating": "4.5"
|
| 783 |
+
},
|
| 784 |
+
{
|
| 785 |
+
"age": 31,
|
| 786 |
+
"salary": "101400",
|
| 787 |
+
"city": "BANGALORE",
|
| 788 |
+
"department": "Support",
|
| 789 |
+
"years_exp": 5,
|
| 790 |
+
"rating": "3.6"
|
| 791 |
+
},
|
| 792 |
+
{
|
| 793 |
+
"age": 43,
|
| 794 |
+
"salary": "117000",
|
| 795 |
+
"city": "delhi",
|
| 796 |
+
"department": "FINANCE",
|
| 797 |
+
"years_exp": 10,
|
| 798 |
+
"rating": "4.5"
|
| 799 |
+
},
|
| 800 |
+
{
|
| 801 |
+
"age": 41,
|
| 802 |
+
"salary": "133800",
|
| 803 |
+
"city": "BANGALORE",
|
| 804 |
+
"department": "Support",
|
| 805 |
+
"years_exp": 2,
|
| 806 |
+
"rating": "3.6"
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"age": 32,
|
| 810 |
+
"salary": "149400",
|
| 811 |
+
"city": "delhi",
|
| 812 |
+
"department": "FINANCE",
|
| 813 |
+
"years_exp": 7,
|
| 814 |
+
"rating": "4.5"
|
| 815 |
+
}
|
| 816 |
+
]
|
| 817 |
+
}
|
data/moderate_cleaning.json
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"task_name": "moderate_cleaning",
|
| 3 |
+
"max_steps": 10,
|
| 4 |
+
"expected_dtypes": {
|
| 5 |
+
"age": "int",
|
| 6 |
+
"salary": "int",
|
| 7 |
+
"city": "str",
|
| 8 |
+
"department": "str",
|
| 9 |
+
"years_exp": "int"
|
| 10 |
+
},
|
| 11 |
+
"required_features": [],
|
| 12 |
+
"dataset": [
|
| 13 |
+
{
|
| 14 |
+
"age": 25,
|
| 15 |
+
"salary": "50000",
|
| 16 |
+
"city": "Mumbai",
|
| 17 |
+
"department": "Engineering",
|
| 18 |
+
"years_exp": 3
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"age": null,
|
| 22 |
+
"salary": "62000",
|
| 23 |
+
"city": "Delhi",
|
| 24 |
+
"department": "Sales",
|
| 25 |
+
"years_exp": 7
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"age": 29,
|
| 29 |
+
"salary": "54000",
|
| 30 |
+
"city": "Bangalore",
|
| 31 |
+
"department": "Marketing",
|
| 32 |
+
"years_exp": null
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"age": 41,
|
| 36 |
+
"salary": "not_available",
|
| 37 |
+
"city": "Pune",
|
| 38 |
+
"department": "Finance",
|
| 39 |
+
"years_exp": 14
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"age": 27,
|
| 43 |
+
"salary": "47000",
|
| 44 |
+
"city": "Chennai",
|
| 45 |
+
"department": "Support",
|
| 46 |
+
"years_exp": 4
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"age": 36,
|
| 50 |
+
"salary": "73000",
|
| 51 |
+
"city": "Hyderabad",
|
| 52 |
+
"department": "Operations",
|
| 53 |
+
"years_exp": 10
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"age": 30,
|
| 57 |
+
"salary": "56000",
|
| 58 |
+
"city": "Mumbai",
|
| 59 |
+
"department": "Engineering",
|
| 60 |
+
"years_exp": 6
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"age": null,
|
| 64 |
+
"salary": "68000",
|
| 65 |
+
"city": "Delhi",
|
| 66 |
+
"department": "Sales",
|
| 67 |
+
"years_exp": 9
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"age": 26,
|
| 71 |
+
"salary": "45000",
|
| 72 |
+
"city": "Bangalore",
|
| 73 |
+
"department": "Marketing",
|
| 74 |
+
"years_exp": 3
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"age": 38,
|
| 78 |
+
"salary": "79000",
|
| 79 |
+
"city": "Pune",
|
| 80 |
+
"department": "Finance",
|
| 81 |
+
"years_exp": null
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"age": 31,
|
| 85 |
+
"salary": "not_available",
|
| 86 |
+
"city": "Chennai",
|
| 87 |
+
"department": "Support",
|
| 88 |
+
"years_exp": 8
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"age": 28,
|
| 92 |
+
"salary": "52000",
|
| 93 |
+
"city": "Hyderabad",
|
| 94 |
+
"department": "Operations",
|
| 95 |
+
"years_exp": 5
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"age": 44,
|
| 99 |
+
"salary": "91000",
|
| 100 |
+
"city": "Mumbai",
|
| 101 |
+
"department": "Engineering",
|
| 102 |
+
"years_exp": 17
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"age": 33,
|
| 106 |
+
"salary": "66000",
|
| 107 |
+
"city": "Delhi",
|
| 108 |
+
"department": "Sales",
|
| 109 |
+
"years_exp": 9
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"age": null,
|
| 113 |
+
"salary": "43000",
|
| 114 |
+
"city": "Bangalore",
|
| 115 |
+
"department": "Marketing",
|
| 116 |
+
"years_exp": 2
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"age": 39,
|
| 120 |
+
"salary": "82000",
|
| 121 |
+
"city": "Pune",
|
| 122 |
+
"department": "Finance",
|
| 123 |
+
"years_exp": null
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"age": 35,
|
| 127 |
+
"salary": "71000",
|
| 128 |
+
"city": "Chennai",
|
| 129 |
+
"department": "Support",
|
| 130 |
+
"years_exp": 11
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"age": 29,
|
| 134 |
+
"salary": "55000",
|
| 135 |
+
"city": "Hyderabad",
|
| 136 |
+
"department": "Operations",
|
| 137 |
+
"years_exp": 6
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"age": 42,
|
| 141 |
+
"salary": "not_available",
|
| 142 |
+
"city": "Mumbai",
|
| 143 |
+
"department": "Engineering",
|
| 144 |
+
"years_exp": 16
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"age": 37,
|
| 148 |
+
"salary": "76000",
|
| 149 |
+
"city": "Delhi",
|
| 150 |
+
"department": "Sales",
|
| 151 |
+
"years_exp": 12
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"age": 27,
|
| 155 |
+
"salary": "46000",
|
| 156 |
+
"city": "Bangalore",
|
| 157 |
+
"department": "Marketing",
|
| 158 |
+
"years_exp": 4
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"age": 40,
|
| 162 |
+
"salary": "85000",
|
| 163 |
+
"city": "Pune",
|
| 164 |
+
"department": "Finance",
|
| 165 |
+
"years_exp": 15
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"age": null,
|
| 169 |
+
"salary": "63000",
|
| 170 |
+
"city": "Chennai",
|
| 171 |
+
"department": "Support",
|
| 172 |
+
"years_exp": 7
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"age": 30,
|
| 176 |
+
"salary": "58000",
|
| 177 |
+
"city": "Hyderabad",
|
| 178 |
+
"department": "Operations",
|
| 179 |
+
"years_exp": 5
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"age": 45,
|
| 183 |
+
"salary": "94000",
|
| 184 |
+
"city": "Mumbai",
|
| 185 |
+
"department": "Engineering",
|
| 186 |
+
"years_exp": null
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"age": 34,
|
| 190 |
+
"salary": "69000",
|
| 191 |
+
"city": "Delhi",
|
| 192 |
+
"department": "Sales",
|
| 193 |
+
"years_exp": 10
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"age": 25,
|
| 197 |
+
"salary": "44000",
|
| 198 |
+
"city": "Bangalore",
|
| 199 |
+
"department": "Marketing",
|
| 200 |
+
"years_exp": 3
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"age": 38,
|
| 204 |
+
"salary": "not_available",
|
| 205 |
+
"city": "Pune",
|
| 206 |
+
"department": "Finance",
|
| 207 |
+
"years_exp": 12
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"age": 31,
|
| 211 |
+
"salary": "60000",
|
| 212 |
+
"city": "Chennai",
|
| 213 |
+
"department": "Support",
|
| 214 |
+
"years_exp": 8
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"age": 28,
|
| 218 |
+
"salary": "51000",
|
| 219 |
+
"city": "Hyderabad",
|
| 220 |
+
"department": "Operations",
|
| 221 |
+
"years_exp": 4
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"age": 43,
|
| 225 |
+
"salary": "92000",
|
| 226 |
+
"city": "Mumbai",
|
| 227 |
+
"department": "Engineering",
|
| 228 |
+
"years_exp": 17
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"age": 36,
|
| 232 |
+
"salary": "74000",
|
| 233 |
+
"city": "Delhi",
|
| 234 |
+
"department": "Sales",
|
| 235 |
+
"years_exp": null
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"age": 26,
|
| 239 |
+
"salary": "45500",
|
| 240 |
+
"city": "Bangalore",
|
| 241 |
+
"department": "Marketing",
|
| 242 |
+
"years_exp": 3
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"age": 39,
|
| 246 |
+
"salary": "83000",
|
| 247 |
+
"city": "Pune",
|
| 248 |
+
"department": "Finance",
|
| 249 |
+
"years_exp": 14
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"age": 33,
|
| 253 |
+
"salary": "not_available",
|
| 254 |
+
"city": "Chennai",
|
| 255 |
+
"department": "Support",
|
| 256 |
+
"years_exp": 9
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"age": null,
|
| 260 |
+
"salary": "57000",
|
| 261 |
+
"city": "Hyderabad",
|
| 262 |
+
"department": "Operations",
|
| 263 |
+
"years_exp": 6
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"age": 41,
|
| 267 |
+
"salary": "87000",
|
| 268 |
+
"city": "Mumbai",
|
| 269 |
+
"department": "Engineering",
|
| 270 |
+
"years_exp": 15
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"age": 35,
|
| 274 |
+
"salary": "72000",
|
| 275 |
+
"city": "Delhi",
|
| 276 |
+
"department": "Sales",
|
| 277 |
+
"years_exp": 10
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"age": 24,
|
| 281 |
+
"salary": "42500",
|
| 282 |
+
"city": "Bangalore",
|
| 283 |
+
"department": "Marketing",
|
| 284 |
+
"years_exp": 2
|
| 285 |
+
},
|
| 286 |
+
{
|
| 287 |
+
"age": 37,
|
| 288 |
+
"salary": "77500",
|
| 289 |
+
"city": "Pune",
|
| 290 |
+
"department": "Finance",
|
| 291 |
+
"years_exp": null
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"age": null,
|
| 295 |
+
"salary": "59000",
|
| 296 |
+
"city": "Chennai",
|
| 297 |
+
"department": "Support",
|
| 298 |
+
"years_exp": 6
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"age": 27,
|
| 302 |
+
"salary": "not_available",
|
| 303 |
+
"city": "Hyderabad",
|
| 304 |
+
"department": "Operations",
|
| 305 |
+
"years_exp": 4
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"age": 44,
|
| 309 |
+
"salary": "93000",
|
| 310 |
+
"city": "Mumbai",
|
| 311 |
+
"department": "Engineering",
|
| 312 |
+
"years_exp": 18
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"age": null,
|
| 316 |
+
"salary": "64500",
|
| 317 |
+
"city": "Delhi",
|
| 318 |
+
"department": "Sales",
|
| 319 |
+
"years_exp": 8
|
| 320 |
+
},
|
| 321 |
+
{
|
| 322 |
+
"age": null,
|
| 323 |
+
"salary": "53500",
|
| 324 |
+
"city": "Bangalore",
|
| 325 |
+
"department": "Marketing",
|
| 326 |
+
"years_exp": 5
|
| 327 |
+
},
|
| 328 |
+
{
|
| 329 |
+
"age": 27,
|
| 330 |
+
"salary": "47000",
|
| 331 |
+
"city": "Chennai",
|
| 332 |
+
"department": "Support",
|
| 333 |
+
"years_exp": 4
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"age": 28,
|
| 337 |
+
"salary": "52000",
|
| 338 |
+
"city": "Hyderabad",
|
| 339 |
+
"department": "Operations",
|
| 340 |
+
"years_exp": 5
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"age": 37,
|
| 344 |
+
"salary": "76000",
|
| 345 |
+
"city": "Delhi",
|
| 346 |
+
"department": "Sales",
|
| 347 |
+
"years_exp": 12
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"age": 31,
|
| 351 |
+
"salary": "60000",
|
| 352 |
+
"city": "Chennai",
|
| 353 |
+
"department": "Support",
|
| 354 |
+
"years_exp": 8
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"age": 35,
|
| 358 |
+
"salary": "72000",
|
| 359 |
+
"city": "Delhi",
|
| 360 |
+
"department": "Sales",
|
| 361 |
+
"years_exp": 10
|
| 362 |
+
}
|
| 363 |
+
]
|
| 364 |
+
}
|
env/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .environment import DataCleaningEnv
|
| 2 |
+
from .graders import DataCleaningGrader
|
| 3 |
+
|
| 4 |
+
__all__ = ["DataCleaningEnv", "DataCleaningGrader"]
|
env/actions.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from env.models import Action, ColumnInfo, Issue
|
| 6 |
+
|
| 7 |
+
# Action vocabulary the environment will accept from an agent.
ALLOWED_ACTIONS = {
    "fill_missing",
    "drop_duplicates",
    "convert_dtype",
    "normalize_category",
    "create_feature",
}

# Imputation strategies permitted for each column family.
VALID_FILL_STRATEGIES = {
    "numeric": ["mean", "median", "zero"],
    "categorical": ["mode", "unknown"],
}

# Dtypes a convert_dtype action may target.
VALID_TARGET_DTYPES = {"int", "float", "str", "bool"}

# Engineered features the environment knows how to build, keyed by the
# feature name an agent supplies in a create_feature action.
FEATURE_REGISTRY = {
    "age_group": {
        "source": "age",
        "transform": "bin",
        "bins": [0, 18, 35, 50, 100],
        "labels": ["young", "adult", "middle", "senior"],
    },
    "salary_bracket": {
        "source": "salary",
        "transform": "bin",
        "bins": [0, 25000, 50000, 100000, float("inf")],
        "labels": ["low", "medium", "high", "very_high"],
    },
}

# Cell values treated as "missing" throughout the environment.
MISSING_SENTINELS = {None, "", "not_available"}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def is_missing(value: Any) -> bool:
    """Return True if *value* is one of the dataset's missing-value sentinels.

    Sentinels are ``None``, the empty string, and ``"not_available"``
    (see ``MISSING_SENTINELS``).

    Set membership raises ``TypeError`` for unhashable values (e.g. a list
    or dict cell coming straight out of raw JSON); such values can never be
    sentinels, so they are reported as not missing instead of crashing.
    """
    try:
        return value in MISSING_SENTINELS
    except TypeError:
        # Unhashable cell value — cannot be a sentinel by construction.
        return False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def infer_column_family(expected_dtype: str) -> str:
    """Map an expected dtype to its column family.

    Returns ``"numeric"`` for int/float columns and ``"categorical"``
    for everything else.
    """
    if expected_dtype in ("int", "float"):
        return "numeric"
    return "categorical"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def has_duplicates(dataset: list[dict[str, Any]]) -> bool:
    """Return True if any two rows in *dataset* have identical contents.

    Rows are fingerprinted by their sorted (key, value) pairs, so two rows
    with the same data but different key insertion order still compare
    equal.
    """
    observed: set[tuple[tuple[str, Any], ...]] = set()
    for record in dataset:
        fingerprint = tuple(sorted(record.items()))
        if fingerprint in observed:
            return True
        observed.add(fingerprint)
    return False
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _get_column_info(column_infos: list[ColumnInfo], column: str) -> ColumnInfo | None:
    """Return the ColumnInfo whose name equals *column*, or None if absent."""
    return next((info for info in column_infos if info.name == column), None)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _non_missing_values(dataset: list[dict[str, Any]], column: str) -> list[Any]:
    """Collect every non-missing value stored under *column* across *dataset*.

    Rows lacking the column contribute ``None`` via ``dict.get``, which the
    missing-sentinel check then filters out.
    """
    values: list[Any] = []
    for row in dataset:
        cell = row.get(column)
        if not is_missing(cell):
            values.append(cell)
    return values
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _is_convertible(value: Any, target_dtype: str) -> bool:
    """Check whether *value* could be cast to *target_dtype* without error.

    Missing sentinels are always reported convertible (they are imputed by
    a separate action before any dtype conversion happens). An unknown
    target dtype yields False.
    """
    if is_missing(value):
        return True
    try:
        if target_dtype == "int":
            # bool is a subclass of int, so accept it outright — casting
            # str(True) through int() would otherwise fail.
            if isinstance(value, bool):
                return True
            # Reject whitespace-only strings explicitly.
            if isinstance(value, str) and not value.strip():
                return False
            int(str(value))
            return True
        if target_dtype == "float":
            float(str(value))
            return True
        if target_dtype == "bool":
            return str(value).strip().lower() in {"true", "false", "1", "0", "yes", "no"}
        if target_dtype == "str":
            str(value)
            return True
    except (TypeError, ValueError):
        return False
    # Unrecognized target dtype: nothing we know how to convert to.
    return False
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def validate_action(
    dataset: list[dict[str, Any]],
    pending_issues: list[Issue],
    column_infos: list[ColumnInfo],
    expected_dtypes: dict[str, str],
    action: Action,
    resolved_issues: list[Issue],
) -> tuple[bool, str, Issue | None, bool]:
    """Check whether *action* is currently applicable to *dataset*.

    Returns a 4-tuple ``(valid, message, matched_issue, dependency_ok)``:
    - valid: False when the action is unsupported, mis-parameterized, or
      does not correspond to a pending issue.
    - message: human-readable rejection reason (empty when valid).
    - matched_issue: the pending Issue the action targets, when one exists.
    - dependency_ok: True when all of the matched issue's ``depends_on``
      ids are already resolved (always True when there are no deps).
    """
    if action.action_type not in ALLOWED_ACTIONS:
        return False, f"Unsupported action_type '{action.action_type}'", None, False

    # Index pending issues by (type, column) for O(1) matching.
    issue_lookup = {(issue.issue_type, issue.column): issue for issue in pending_issues}
    column_info = _get_column_info(column_infos, action.column) if action.column != "__all__" else None
    resolved_ids = {issue.issue_id for issue in resolved_issues}

    matched_issue: Issue | None = None
    if action.action_type == "fill_missing":
        matched_issue = issue_lookup.get(("missing", action.column))
        if matched_issue is None:
            return False, f"Column '{action.column}' does not have a pending missing-value issue", None, False
        if column_info is None:
            return False, f"Unknown column '{action.column}'", None, False
        # Strategy must suit the column family (numeric vs categorical).
        expected_dtype = expected_dtypes.get(action.column, column_info.dtype)
        family = infer_column_family(expected_dtype)
        strategy = action.params.get("strategy")
        if strategy not in VALID_FILL_STRATEGIES[family]:
            return False, f"Invalid fill strategy '{strategy}' for {family} column", None, False
        if not any(is_missing(row.get(action.column)) for row in dataset):
            return False, f"Column '{action.column}' has no missing values", None, False
    elif action.action_type == "drop_duplicates":
        matched_issue = issue_lookup.get(("duplicate", "__all__"))
        # drop_duplicates is dataset-wide: fixed column, no params.
        if action.column != "__all__":
            return False, "drop_duplicates must target column '__all__'", None, False
        if action.params:
            return False, "drop_duplicates does not accept params", None, False
        if matched_issue is None or not has_duplicates(dataset):
            return False, "Dataset does not have duplicate rows", None, False
    elif action.action_type == "convert_dtype":
        matched_issue = issue_lookup.get(("wrong_dtype", action.column))
        if matched_issue is None:
            return False, f"Column '{action.column}' does not have a pending wrong_dtype issue", None, False
        target_dtype = action.params.get("target_dtype")
        if target_dtype not in VALID_TARGET_DTYPES:
            return False, f"Invalid target dtype '{target_dtype}'", None, False
        # The agent must convert to the task's expected dtype, not an arbitrary one.
        if target_dtype != expected_dtypes.get(action.column):
            return False, f"Target dtype for '{action.column}' must be '{expected_dtypes.get(action.column)}'", None, False
        values = _non_missing_values(dataset, action.column)
        if any(not _is_convertible(value, target_dtype) for value in values):
            return False, f"Column '{action.column}' contains non-convertible values", None, False
        # Case variants such as "Not_Available" slip past is_missing; reject them here.
        if any(str(value).strip().lower() == "not_available" for value in values):
            return False, f"Column '{action.column}' still contains not_available placeholders", None, False
    elif action.action_type == "normalize_category":
        matched_issue = issue_lookup.get(("inconsistent_category", action.column))
        if matched_issue is None:
            return False, f"Column '{action.column}' does not have a pending inconsistent_category issue", None, False
        if action.params:
            return False, "normalize_category does not accept params", None, False
        # Require at least two surface forms that collide after lowercasing.
        values = [row.get(action.column) for row in dataset if not is_missing(row.get(action.column))]
        lowered = [str(value).lower() for value in values]
        if len(lowered) == len(set(lowered)):
            return False, f"Column '{action.column}' has no categorical inconsistencies", None, False
    elif action.action_type == "create_feature":
        matched_issue = issue_lookup.get(("missing_feature", action.column))
        feature_name = action.params.get("feature_name")
        if matched_issue is None:
            return False, f"Column '{action.column}' does not have a pending missing_feature issue", None, False
        if feature_name not in FEATURE_REGISTRY:
            return False, f"Unknown feature '{feature_name}'", None, False
        if action.column != feature_name:
            return False, f"create_feature column must match feature name '{feature_name}'", None, False
        # The feature's source column must exist, be numeric, and be fully convertible.
        source_column = FEATURE_REGISTRY[feature_name]["source"]
        if source_column not in dataset[0]:
            return False, f"Source column '{source_column}' is missing", None, False
        source_dtype = expected_dtypes.get(source_column)
        if source_dtype not in {"int", "float"}:
            return False, f"Source column '{source_column}' must be numeric", None, False
        source_values = _non_missing_values(dataset, source_column)
        if any(not _is_convertible(value, source_dtype) for value in source_values):
            return False, f"Source column '{source_column}' is not clean enough to create the feature", None, False

    # Dependency ordering: an issue with unresolved depends_on ids is rejected,
    # but the matched issue is still reported so callers can explain why.
    dependency_ok = True
    if matched_issue and matched_issue.depends_on:
        dependency_ok = all(dep_id in resolved_ids for dep_id in matched_issue.depends_on)
    if not dependency_ok:
        return False, f"Dependencies for issue '{matched_issue.issue_id}' are not resolved", matched_issue, False

    return True, "", matched_issue, dependency_ok
|
env/environment.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from statistics import median
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from env.actions import FEATURE_REGISTRY, is_missing, validate_action
|
| 10 |
+
from env.models import Action, ColumnInfo, Issue, Observation
|
| 11 |
+
from env.quality import compute_quality_score
|
| 12 |
+
from env.rewards import compute_reward
|
| 13 |
+
|
| 14 |
+
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DataCleaningEnv:
    """Episodic environment in which an agent cleans a small tabular dataset.

    A JSON task config under ``DATA_DIR`` supplies the raw rows, the expected
    dtype per column, optional required derived features, and a step budget.
    Each step the agent submits an :class:`Action`; the environment validates
    it with ``validate_action``, applies it, re-detects the remaining issues,
    and returns a reward proportional to the change in the quality score.
    """

    def __init__(self, task_name: str = "basic_cleaning"):
        # Name of the JSON task config under DATA_DIR; loaded lazily in reset().
        self.task_name = task_name
        self.task_config: dict[str, Any] = {}
        self.dataset: list[dict[str, Any]] = []
        self.original_dataset: list[dict[str, Any]] = []
        self.issues: list[Issue] = []
        self.pending_issues: list[Issue] = []
        self.resolved_issues: list[Issue] = []
        self.action_history: list[dict[str, Any]] = []
        self.steps_remaining = 0
        self.max_steps = 0
        self.total_issues_at_start = 0
        self.quality_score = 0.0
        self.expected_dtypes: dict[str, str] = {}
        self.required_features: list[str] = []
        # Maps (issue_type, column) -> stable id so re-detected issues keep their ids.
        self._issue_id_map: dict[tuple[str, str], str] = {}

    def reset(self) -> Observation:
        """Load the task config, rebuild all episode state, and return the initial observation."""
        config_path = DATA_DIR / f"{self.task_name}.json"
        with config_path.open("r", encoding="utf-8") as handle:
            self.task_config = json.load(handle)

        # Deep-copy so in-place cleaning never mutates the loaded config.
        self.dataset = copy.deepcopy(self.task_config["dataset"])
        self.original_dataset = copy.deepcopy(self.dataset)
        self.expected_dtypes = dict(self.task_config["expected_dtypes"])
        self.required_features = list(self.task_config.get("required_features", []))
        self.action_history = []
        self.resolved_issues = []
        self.max_steps = int(self.task_config["max_steps"])
        self.steps_remaining = self.max_steps

        self._issue_id_map = {}
        detected = self._detect_issues(self.dataset)
        self.pending_issues = detected
        self.issues = list(detected)
        self.total_issues_at_start = len(detected)
        self.quality_score = compute_quality_score(
            self.dataset,
            self._build_column_infos(),
            self.total_issues_at_start,
        )
        return self.state()

    def step(self, action: Action) -> tuple[Observation, float, bool, dict]:
        """Validate and apply *action*; return ``(observation, reward, done, info)``.

        Invalid actions cost a step, receive the invalid-action penalty, and
        are logged in the history with their rejection message. The episode
        ends when the step budget is exhausted or no issues remain pending.
        """
        if not self.dataset:
            self.reset()

        self.steps_remaining -= 1
        old_quality = self.quality_score
        columns = self._build_column_infos()
        action_valid, message, matched_issue, dependency_ok = validate_action(
            self.dataset,
            self.pending_issues,
            columns,
            self.expected_dtypes,
            action,
            self.resolved_issues,
        )

        info: dict[str, Any] = {}
        if not action_valid:
            # Quality is unchanged, so the reward collapses to the flat penalty.
            reward = compute_reward(old_quality, old_quality, False, False)
            info = {"error": "invalid_action", "message": message}
            self.action_history.append(
                {
                    "action_type": action.action_type,
                    "column": action.column,
                    "params": action.params,
                    "reward": reward,
                    "error": message,
                }
            )
            observation = self.state()
            done = self.steps_remaining <= 0 or len(self.pending_issues) == 0
            return observation, reward, done, info

        self._apply_action(action)
        # Re-scan from scratch: fixing one issue may reveal or clear others.
        redetected = self._detect_issues(self.dataset)
        self.pending_issues = redetected
        self.issues = list(redetected)

        # The targeted issue counts as resolved only if it no longer re-detects.
        if matched_issue and not self._issue_present(redetected, matched_issue.issue_type, matched_issue.column):
            self.resolved_issues.append(matched_issue)

        self.quality_score = compute_quality_score(
            self.dataset,
            self._build_column_infos(),
            self.total_issues_at_start,
        )
        reward = compute_reward(old_quality, self.quality_score, True, dependency_ok)
        self.action_history.append(
            {
                "action_type": action.action_type,
                "column": action.column,
                "params": action.params,
                "reward": reward,
                "error": None,
            }
        )
        observation = self.state()
        done = self.steps_remaining <= 0 or len(self.pending_issues) == 0
        return observation, reward, done, info

    def state(self) -> Observation:
        """Build an Observation snapshot; deep copies keep internals safe from callers."""
        return Observation(
            data_preview=copy.deepcopy(self.dataset[:5]),
            columns=self._build_column_infos(),
            pending_issues=copy.deepcopy(self.pending_issues),
            resolved_issues=copy.deepcopy(self.resolved_issues),
            action_history=copy.deepcopy(self.action_history),
            quality_score=self.quality_score,
            steps_remaining=self.steps_remaining,
            total_rows=len(self.dataset),
            total_issues_at_start=self.total_issues_at_start,
        )

    def _detect_issues(self, dataset: list[dict[str, Any]]) -> list[Issue]:
        """Scan *dataset* and return current Issues with stable ids and dependency links."""
        if not dataset:
            return []

        raw_issues: list[dict[str, Any]] = []
        columns = list(self.expected_dtypes.keys())

        # 1) Missing values, per expected column.
        for column in columns:
            missing_count = sum(1 for row in dataset if is_missing(row.get(column)))
            if missing_count:
                raw_issues.append(
                    {
                        "issue_type": "missing",
                        "column": column,
                        "description": f"Column '{column}' has {missing_count} missing values that should be filled.",
                    }
                )

        # 2) Exact duplicate rows (a single dataset-wide issue).
        if self._has_duplicates(dataset):
            raw_issues.append(
                {
                    "issue_type": "duplicate",
                    "column": "__all__",
                    "description": "Dataset contains duplicate rows that should be removed.",
                }
            )

        # 3) Runtime dtype differs from the task's expected numeric/bool dtype.
        for column in columns:
            expected_dtype = self.expected_dtypes[column]
            actual_dtype = self._infer_runtime_dtype(dataset, column)
            if expected_dtype in {"int", "float", "bool"} and actual_dtype != expected_dtype:
                raw_issues.append(
                    {
                        "issue_type": "wrong_dtype",
                        "column": column,
                        "description": (
                            f"Column '{column}' should be '{expected_dtype}' but is currently represented as '{actual_dtype}'."
                        ),
                    }
                )

        # 4) String columns whose categories differ only by casing.
        for column in columns:
            if self.expected_dtypes[column] != "str":
                continue
            if self._has_inconsistent_categories(dataset, column):
                raw_issues.append(
                    {
                        "issue_type": "inconsistent_category",
                        "column": column,
                        "description": f"Column '{column}' has inconsistent categorical values that differ only by casing.",
                    }
                )

        # 5) Required derived features not yet present on every row.
        for feature_name in self.required_features:
            if not all(feature_name in row for row in dataset):
                raw_issues.append(
                    {
                        "issue_type": "missing_feature",
                        "column": feature_name,
                        "description": f"Required feature '{feature_name}' has not been created yet.",
                    }
                )

        # Assign ids: a (type, column) signature keeps its id across re-detections.
        for raw_issue in raw_issues:
            signature = (raw_issue["issue_type"], raw_issue["column"])
            if signature not in self._issue_id_map:
                self._issue_id_map[signature] = f"issue_{len(self._issue_id_map) + 1:03d}"

        issues: list[Issue] = []
        signature_to_id = {signature: issue_id for signature, issue_id in self._issue_id_map.items()}

        for raw_issue in raw_issues:
            signature = (raw_issue["issue_type"], raw_issue["column"])
            depends_on: list[str] = []

            # salary/rating dtype fixes require their missing values handled first.
            if raw_issue["issue_type"] == "wrong_dtype" and raw_issue["column"] in {"salary", "rating"}:
                missing_signature = ("missing", raw_issue["column"])
                if missing_signature in signature_to_id:
                    depends_on.append(signature_to_id[missing_signature])

            # Feature creation depends on its source column having no missing/dtype issues.
            if raw_issue["issue_type"] == "missing_feature":
                feature_name = raw_issue["column"]
                source_column = FEATURE_REGISTRY[feature_name]["source"]
                for dependency_type in ("missing", "wrong_dtype"):
                    source_signature = (dependency_type, source_column)
                    if source_signature in signature_to_id:
                        depends_on.append(signature_to_id[source_signature])

            issues.append(
                Issue(
                    issue_id=signature_to_id[signature],
                    issue_type=raw_issue["issue_type"],
                    column=raw_issue["column"],
                    description=raw_issue["description"],
                    depends_on=depends_on,
                )
            )

        return issues

    def _build_column_infos(self) -> list[ColumnInfo]:
        """Summarize each column of the current dataset (dtype, nulls, distinct values)."""
        if not self.dataset:
            return []

        infos: list[ColumnInfo] = []
        for column in self.dataset[0].keys():
            values = [row.get(column) for row in self.dataset]
            non_missing = [value for value in values if not is_missing(value)]
            infos.append(
                ColumnInfo(
                    name=column,
                    dtype=self._infer_runtime_dtype(self.dataset, column),
                    null_count=sum(1 for value in values if is_missing(value)),
                    # Stringified so mixed-type columns still count distinct values.
                    unique_count=len({str(value) for value in non_missing}),
                )
            )
        return infos

    def _infer_runtime_dtype(self, dataset: list[dict[str, Any]], column: str) -> str:
        """Infer the dtype actually present in *column*; falls back to the expected dtype when all values are missing."""
        values = [row.get(column) for row in dataset if not is_missing(row.get(column))]
        if not values:
            return self.expected_dtypes.get(column, "str")
        # bool is checked first because bool is a subclass of int in Python.
        if all(isinstance(value, bool) for value in values):
            return "bool"
        if all(isinstance(value, int) and not isinstance(value, bool) for value in values):
            return "int"
        if all(isinstance(value, (int, float)) and not isinstance(value, bool) for value in values):
            return "float"
        return "str"

    def _has_duplicates(self, dataset: list[dict[str, Any]]) -> bool:
        """True when any two rows share identical (key, value) contents."""
        seen: set[tuple[tuple[str, Any], ...]] = set()
        for row in dataset:
            key = tuple(sorted(row.items()))
            if key in seen:
                return True
            seen.add(key)
        return False

    def _has_inconsistent_categories(self, dataset: list[dict[str, Any]], column: str) -> bool:
        """True when two non-missing values in *column* collide after lowercasing."""
        groups: dict[str, set[str]] = {}
        for row in dataset:
            value = row.get(column)
            if is_missing(value):
                continue
            normalized = str(value).lower()
            groups.setdefault(normalized, set()).add(str(value))
        return any(len(forms) > 1 for forms in groups.values())

    def _issue_present(self, issues: list[Issue], issue_type: str, column: str) -> bool:
        """True when *issues* still contains an issue with this (type, column) signature."""
        return any(issue.issue_type == issue_type and issue.column == column for issue in issues)

    def _apply_action(self, action: Action) -> None:
        """Mutate the dataset according to an already-validated action."""
        if action.action_type == "fill_missing":
            self._apply_fill_missing(action.column, action.params["strategy"])
        elif action.action_type == "drop_duplicates":
            # Keep the first occurrence of each distinct row.
            unique_rows: list[dict[str, Any]] = []
            seen: set[tuple[tuple[str, Any], ...]] = set()
            for row in self.dataset:
                key = tuple(sorted(row.items()))
                if key in seen:
                    continue
                seen.add(key)
                unique_rows.append(row)
            self.dataset = unique_rows
        elif action.action_type == "convert_dtype":
            target_dtype = action.params["target_dtype"]
            for row in self.dataset:
                value = row.get(action.column)
                if is_missing(value):
                    # Normalize all missing sentinels to None during conversion.
                    row[action.column] = None
                else:
                    row[action.column] = self._convert_value(value, target_dtype)
        elif action.action_type == "normalize_category":
            self._apply_normalize_category(action.column)
        elif action.action_type == "create_feature":
            self._apply_create_feature(action.params["feature_name"])

    def _apply_fill_missing(self, column: str, strategy: str) -> None:
        """Fill every missing cell in *column* using the given strategy.

        Numeric columns support mean/median (anything else fills 0);
        categorical columns support mode (anything else fills "unknown").
        """
        expected_dtype = self.expected_dtypes.get(column, "str")
        valid_values = [row.get(column) for row in self.dataset if not is_missing(row.get(column))]

        if expected_dtype in {"int", "float"}:
            numeric_values = [self._convert_value(value, expected_dtype) for value in valid_values]
            if strategy == "mean":
                fill_value = sum(numeric_values) / len(numeric_values)
            elif strategy == "median":
                fill_value = median(numeric_values)
            else:
                fill_value = 0
            if expected_dtype == "int":
                fill_value = int(round(fill_value))
        else:
            if strategy == "mode":
                fill_value = self._pick_mode([str(value) for value in valid_values])
            else:
                fill_value = "unknown"

        for row in self.dataset:
            if is_missing(row.get(column)):
                row[column] = fill_value

    def _apply_normalize_category(self, column: str) -> None:
        """Collapse case variants in *column* to one canonical surface form per lowercased group."""
        # Count each surface form within its lowercase group.
        groups: dict[str, dict[str, int]] = {}
        for row in self.dataset:
            value = row.get(column)
            if is_missing(value):
                continue
            surface = str(value)
            groups.setdefault(surface.lower(), {})
            groups[surface.lower()][surface] = groups[surface.lower()].get(surface, 0) + 1

        # Canonical form: most frequent, ties broken deterministically
        # (alphabetical, preferring all-lowercase forms).
        canonical: dict[str, str] = {}
        for lowered, counts in groups.items():
            canonical[lowered] = min(
                counts.items(),
                key=lambda item: (-item[1], item[0].lower(), 0 if item[0].islower() else 1, item[0]),
            )[0]

        for row in self.dataset:
            value = row.get(column)
            if is_missing(value):
                continue
            row[column] = canonical[str(value).lower()]

    def _apply_create_feature(self, feature_name: str) -> None:
        """Create a binned categorical feature from its registered numeric source column."""
        feature_config = FEATURE_REGISTRY[feature_name]
        source = feature_config["source"]
        bins = feature_config["bins"]
        labels = feature_config["labels"]

        for row in self.dataset:
            source_value = row.get(source)
            if is_missing(source_value):
                row[feature_name] = None
                continue

            numeric_value = float(source_value)
            assigned = None
            # Bins are half-open [lower, upper); the last bin is closed so the
            # max value still gets a label. Values outside all bins stay None.
            for index, label in enumerate(labels):
                lower = bins[index]
                upper = bins[index + 1]
                is_last = index == len(labels) - 1
                if (lower <= numeric_value < upper) or (is_last and lower <= numeric_value <= upper):
                    assigned = label
                    break
            row[feature_name] = assigned

    def _pick_mode(self, values: list[str]) -> str:
        """Most frequent value; ties broken alphabetically, preferring all-lowercase forms."""
        counts: dict[str, int] = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        return min(
            counts.items(),
            key=lambda item: (-item[1], item[0].lower(), 0 if item[0].islower() else 1, item[0]),
        )[0]

    def _convert_value(self, value: Any, target_dtype: str) -> Any:
        """Cast a non-missing value to *target_dtype*; unknown dtypes fall back to str."""
        if target_dtype == "int":
            # Via float so strings like "42.0" also convert.
            return int(float(str(value)))
        if target_dtype == "float":
            return float(str(value))
        if target_dtype == "bool":
            normalized = str(value).strip().lower()
            return normalized in {"true", "1", "yes"}
        return str(value)
|
env/graders.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class DataCleaningGrader:
    """Grades a finished episode from its final state and the task config."""

    def grade(self, final_state: dict, task_config: dict) -> float:
        """Return a score in [0, 1], rounded to two decimals.

        The score blends:
        - correctness (weight 0.8): fraction of the task's issues resolved
        - efficiency (weight 0.2): decays linearly to 0 as steps taken
          approach twice the issue count
        - a flat 0.05 penalty per action that ended in an error
        """
        total_issues = task_config["total_issues"]
        steps_taken = task_config["max_steps"] - final_state["steps_remaining"]
        fixed_count = len(final_state["resolved_issues"])
        error_count = sum(1 for entry in final_state["action_history"] if entry.get("error"))

        if total_issues > 0:
            correctness = fixed_count / total_issues
            efficiency = max(0, 1 - steps_taken / (2 * total_issues))
        else:
            # A task with no issues is trivially solved.
            correctness = 1.0
            efficiency = 1.0

        raw_score = 0.8 * correctness + 0.2 * efficiency - error_count * 0.05
        return round(max(0.0, min(1.0, raw_score)), 2)
|
env/models.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ColumnInfo(BaseModel):
    """Per-column summary shown to the agent in observations."""

    name: str  # column name as it appears in dataset rows
    dtype: str  # runtime dtype inferred from current values ("int", "float", "bool", or "str")
    null_count: int  # number of missing-sentinel cells in the column
    unique_count: int  # number of distinct non-missing values
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Issue(BaseModel):
    """One detected data-quality problem the agent is expected to fix."""

    issue_id: str  # stable id (e.g. "issue_001"), preserved across re-detections
    issue_type: str  # "missing", "duplicate", "wrong_dtype", "inconsistent_category", or "missing_feature"
    column: str  # target column name, or "__all__" for dataset-wide issues
    description: str  # human-readable explanation of the problem
    depends_on: list[str] = Field(default_factory=list)  # issue_ids that must be resolved first
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Observation(BaseModel):
    """Snapshot of the environment returned to the agent after reset/step."""

    data_preview: list[dict[str, Any]]  # first 5 rows of the current dataset
    columns: list[ColumnInfo]  # per-column summaries
    pending_issues: list[Issue]  # issues still detected in the data
    resolved_issues: list[Issue]  # issues confirmed fixed so far
    action_history: list[dict[str, Any]]  # past actions with their rewards and errors
    quality_score: float  # current data quality in [0, 1]
    steps_remaining: int  # actions left in the episode's budget
    total_rows: int  # current row count (shrinks after drop_duplicates)
    total_issues_at_start: int  # issue count detected at reset
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Action(BaseModel):
    """An agent action: what to do, on which column, with which parameters."""

    action_type: str  # one of: fill_missing, drop_duplicates, convert_dtype, normalize_category, create_feature
    column: str  # target column name, or "__all__" for drop_duplicates
    params: dict[str, str] = Field(default_factory=dict)  # e.g. {"strategy": "mean"} or {"target_dtype": "int"}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class Reward(BaseModel):
    """Scalar reward wrapper."""

    value: float  # reward for the most recent step
|
env/quality.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from env.actions import is_missing
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _is_numeric_value(value: Any, dtype: str) -> bool:
    """Return True when *value* is present and parses as the given numeric dtype.

    Only "int" and "float" are recognized dtypes; anything else, and any
    missing sentinel, yields False.
    """
    if is_missing(value):
        return False
    if dtype == "int":
        caster = int
    elif dtype == "float":
        caster = float
    else:
        return False
    try:
        caster(str(value))
    except (TypeError, ValueError):
        return False
    return True
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _compute_consistency(dataset: list[dict], column_infos: list) -> float:
    """Fraction of consistency checks that pass across all columns.

    Numeric columns contribute one check per cell (a missing cell counts as
    a failed check, since _is_numeric_value rejects missing values). Other
    columns contribute a single check: whether any two non-missing values
    collide after lowercasing. Returns 1.0 when there is nothing to check.
    """
    if not dataset or not column_infos:
        return 1.0

    valid_checks = 0
    total_checks = 0

    for info in column_infos:
        values = [row.get(info.name) for row in dataset]
        if info.dtype in {"int", "float"}:
            # Per-cell check: every value must parse as the column's numeric dtype.
            for value in values:
                total_checks += 1
                if _is_numeric_value(value, info.dtype):
                    valid_checks += 1
        else:
            # Per-column check: no case-variant duplicates among non-missing values.
            non_missing = [str(value) for value in values if not is_missing(value)]
            if not non_missing:
                continue
            lowered = {}
            for value in non_missing:
                lowered.setdefault(value.lower(), set()).add(value)
            has_inconsistency = any(len(forms) > 1 for forms in lowered.values())
            total_checks += 1
            if not has_inconsistency:
                valid_checks += 1

    return valid_checks / total_checks if total_checks else 1.0
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def compute_quality_score(dataset: list[dict], column_infos: list, original_issues_count: int) -> float:
    """Score dataset quality in [0, 1].

    Weighted blend of completeness (0.4), row uniqueness (0.3) and
    consistency (0.3), rounded to four decimals. A task that started with
    zero issues is trivially perfect.
    """
    if original_issues_count == 0:
        return 1.0

    # Completeness: share of cells that are not missing sentinels.
    total_cells = len(dataset) * len(dataset[0]) if dataset else 1
    missing_cells = 0
    for row in dataset:
        for value in row.values():
            if value is None or value == "" or value == "not_available":
                missing_cells += 1
    completeness = 1.0 - missing_cells / total_cells

    # Uniqueness: share of rows that are distinct (key order ignored).
    row_count = len(dataset)
    distinct_rows = {str(sorted(row.items())) for row in dataset}
    uniqueness = len(distinct_rows) / row_count if row_count > 0 else 1.0

    consistency = _compute_consistency(dataset, column_infos)

    return round(0.4 * completeness + 0.3 * uniqueness + 0.3 * consistency, 4)
|
env/rewards.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def compute_reward(
|
| 2 |
+
old_quality: float,
|
| 3 |
+
new_quality: float,
|
| 4 |
+
action_valid: bool,
|
| 5 |
+
resolved_dependency_correctly: bool,
|
| 6 |
+
) -> float:
|
| 7 |
+
if not action_valid:
|
| 8 |
+
return -0.05
|
| 9 |
+
|
| 10 |
+
progress = new_quality - old_quality
|
| 11 |
+
ordering_bonus = 0.05 if resolved_dependency_correctly else 0.0
|
| 12 |
+
step_cost = -0.01
|
| 13 |
+
|
| 14 |
+
return round(progress + ordering_bonus + step_cost, 4)
|
inference.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
STDOUT FORMAT (must match exactly):
|
| 3 |
+
[START] task=<task_name> env=data_cleaning_env model=<model_name>
|
| 4 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 5 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
|
| 13 |
+
from env.environment import DataCleaningEnv
|
| 14 |
+
from env.graders import DataCleaningGrader
|
| 15 |
+
from env.models import Action
|
| 16 |
+
|
| 17 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 18 |
+
API_BASE_URL = os.getenv("API_BASE_URL")
|
| 19 |
+
MODEL_NAME = os.getenv("MODEL_NAME")
|
| 20 |
+
BENCHMARK = "data_cleaning_env"
|
| 21 |
+
|
| 22 |
+
TASKS = ["basic_cleaning", "moderate_cleaning", "full_pipeline"]
|
| 23 |
+
|
| 24 |
+
SYSTEM_PROMPT = """You are an AI agent performing data cleaning on a tabular dataset.
|
| 25 |
+
|
| 26 |
+
You will receive an observation containing:
|
| 27 |
+
- data_preview: first 5 rows of the current dataset
|
| 28 |
+
- columns: column info (name, dtype, null_count, unique_count)
|
| 29 |
+
- pending_issues: list of issues to fix (each has issue_id, issue_type, column, description, depends_on)
|
| 30 |
+
- resolved_issues: issues already fixed
|
| 31 |
+
- action_history: your previous actions
|
| 32 |
+
- quality_score: current data quality (0.0-1.0)
|
| 33 |
+
- steps_remaining: how many actions you have left
|
| 34 |
+
|
| 35 |
+
You must respond with EXACTLY one JSON object representing your action:
|
| 36 |
+
{
|
| 37 |
+
"action_type": "<one of: fill_missing, drop_duplicates, convert_dtype, normalize_category, create_feature>",
|
| 38 |
+
"column": "<target column name or __all__ for drop_duplicates>",
|
| 39 |
+
"params": {<strategy-specific params>}
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
Rules:
|
| 43 |
+
- fill_missing: params must have "strategy" key. Use "mean"/"median"/"zero" for numeric columns, "mode"/"unknown" for categorical.
|
| 44 |
+
- drop_duplicates: column = "__all__", params = {}
|
| 45 |
+
- convert_dtype: params must have "target_dtype" key (one of: int, float, str, bool)
|
| 46 |
+
- normalize_category: params = {}
|
| 47 |
+
- create_feature: params must have "feature_name" key (e.g., "age_group")
|
| 48 |
+
|
| 49 |
+
IMPORTANT: Fix dependencies first! Check the "depends_on" field of each issue. For example, fill missing string values in a column BEFORE converting its dtype.
|
| 50 |
+
|
| 51 |
+
Respond with ONLY the JSON object. No explanation, no markdown, no code blocks."""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def parse_action(response_text: str) -> Action:
    """Parse a model reply into an ``Action``, tolerating markdown fences."""
    body = response_text.strip()
    # Strip an opening ``` fence: keep everything after the first newline,
    # or just drop the three backticks when the reply is a single line.
    if body.startswith("```"):
        fence_split = body.split("\n", 1)
        body = fence_split[1] if len(fence_split) > 1 else body[3:]
    if body.endswith("```"):
        body = body[:-3]
    body = body.strip()
    # Remove a leftover "json" language tag from the fence line.
    if body.startswith("json"):
        body = body[4:].strip()
    return Action(**json.loads(body))
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def require_env(name: str, value: str | None) -> str:
|
| 69 |
+
if value:
|
| 70 |
+
return value
|
| 71 |
+
raise RuntimeError(f"Missing required environment variable: {name}")
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def safe_log_value(value: str | None) -> str:
|
| 75 |
+
if not value:
|
| 76 |
+
return "null"
|
| 77 |
+
return str(value).replace("\n", "_").replace("\r", "_").replace("\t", "_").replace(" ", "_")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def log_start(task, env, model):
    """Emit the [START] marker line, flushed so log consumers see it immediately."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def log_step(step, action_str, reward, done, error):
    """Emit one [STEP] marker line in the exact benchmark stdout format."""
    fields = (
        f"step={step}",
        f"action={safe_log_value(action_str)}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={safe_log_value(error)}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def log_end(success, steps, score, rewards):
    """Emit the [END] summary line with comma-joined per-step rewards."""
    reward_csv = ",".join("{:.2f}".format(r) for r in rewards)
    summary = "[END] success={} steps={} score={:.2f} rewards={}".format(
        str(success).lower(), steps, score, reward_csv
    )
    print(summary, flush=True)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def run_task(task_name: str):
    """Run one benchmark task end-to-end and return its grader score.

    Drives the model/environment loop: serialises each observation into the
    chat history, asks the model for a JSON action, applies it via
    ``env.step``, and emits [START]/[STEP]/[END] marker lines in the format
    documented at the top of this module.
    """
    client = OpenAI(
        base_url=require_env("API_BASE_URL", API_BASE_URL),
        api_key=require_env("HF_TOKEN", HF_TOKEN),
    )
    env = DataCleaningEnv(task_name=task_name)
    obs = env.reset()
    log_start(task_name, BENCHMARK, require_env("MODEL_NAME", MODEL_NAME))

    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    rewards_list = []
    step_count = 0
    done = False
    # The step budget reported by the first observation bounds the loop.
    max_possible_steps = obs.steps_remaining
    task_score = 0.0

    while not done and step_count < max_possible_steps:
        # Support both pydantic v2 (model_dump) and v1 (dict) observations.
        obs_dict = obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()
        messages.append(
            {
                "role": "user",
                "content": f"Current observation:\n{json.dumps(obs_dict, indent=2, default=str)}\n\nChoose your next action.",
            }
        )

        try:
            response = client.chat.completions.create(
                model=require_env("MODEL_NAME", MODEL_NAME),
                messages=messages,
                temperature=0.3,
                max_tokens=200,
            )
            response_text = response.choices[0].message.content or ""
            messages.append({"role": "assistant", "content": response_text})

            action = parse_action(response_text)
            obs, reward, done, info = env.step(action)
            step_count += 1
            last_error = info.get("error")
            rewards_list.append(reward)

            action_str = f"{action.action_type}({action.column})"
            log_step(step_count, action_str, reward, done, last_error)

        except Exception as exc:
            # API/parse failures still consume a step and take the flat
            # penalty; the model is asked to retry with plain JSON.
            step_count += 1
            rewards_list.append(-0.05)
            log_step(step_count, "parse_error", -0.05, False, str(exc))
            messages.append(
                {
                    "role": "user",
                    "content": f"Your response could not be parsed. Error: {str(exc)}. Respond with ONLY a valid JSON action object.",
                }
            )
            if step_count >= max_possible_steps:
                break

    # Success means the final observation has no pending issues left.
    success = hasattr(obs, "pending_issues") and len(obs.pending_issues) == 0
    final_state = obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()
    task_score = DataCleaningGrader().grade(
        final_state,
        {
            "total_issues": final_state["total_issues_at_start"],
            "max_steps": max_possible_steps,
        },
    )
    log_end(success, step_count, task_score, rewards_list)
    return task_score
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def main():
    """Validate required configuration, then run every benchmark task in order."""
    for env_name, env_value in (
        ("HF_TOKEN", HF_TOKEN),
        ("API_BASE_URL", API_BASE_URL),
        ("MODEL_NAME", MODEL_NAME),
    ):
        require_env(env_name, env_value)
    # Tasks run sequentially; the mapping preserves TASKS order.
    return {task: run_task(task) for task in TASKS}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility exports for OpenEnv tooling."""
|
| 2 |
+
|
| 3 |
+
from env.models import Action, ColumnInfo, Issue, Observation, Reward
|
| 4 |
+
|
| 5 |
+
__all__ = ["Action", "ColumnInfo", "Issue", "Observation", "Reward"]
|
openenv.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: data_cleaning_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 7860
|
| 7 |
+
description: "RL environment for interactive tabular data cleaning and preparation. Agent must identify and fix data quality issues including missing values, duplicates, wrong dtypes, inconsistent categories, and feature creation."
|
| 8 |
+
version: "1.0.0"
|
| 9 |
+
|
| 10 |
+
observation_space:
|
| 11 |
+
type: dict
|
| 12 |
+
description: "Contains data_preview, columns, pending_issues, resolved_issues, action_history, quality_score, steps_remaining"
|
| 13 |
+
|
| 14 |
+
action_space:
|
| 15 |
+
type: dict
|
| 16 |
+
description: "Action with action_type, column, and params fields"
|
| 17 |
+
|
| 18 |
+
reward_range: [-0.05, 1.0]
|
| 19 |
+
|
| 20 |
+
tasks:
|
| 21 |
+
- name: basic_cleaning
|
| 22 |
+
description: "Easy: fill missing values in a small dataset (20 rows, 2 issues)"
|
| 23 |
+
difficulty: easy
|
| 24 |
+
- name: moderate_cleaning
|
| 25 |
+
description: "Medium: handle missing values, duplicates, and wrong dtypes (50 rows, 5 issues in practice)"
|
| 26 |
+
difficulty: medium
|
| 27 |
+
- name: full_pipeline
|
| 28 |
+
description: "Hard: full cleaning pipeline with category normalization and feature creation (100 rows, 10 issues in practice)"
|
| 29 |
+
difficulty: hard
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-data-cleaning"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "OpenEnv environment for interactive tabular data cleaning."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"openenv-core>=0.2.0",
|
| 13 |
+
"fastapi>=0.110.0",
|
| 14 |
+
"openai>=1.0",
|
| 15 |
+
"pydantic>=2.0",
|
| 16 |
+
"uvicorn>=0.30.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.optional-dependencies]
|
| 20 |
+
dev = [
|
| 21 |
+
"httpx>=0.28.0",
|
| 22 |
+
"pytest>=8.0.0",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
[project.scripts]
|
| 26 |
+
server = "server.app:main"
|
| 27 |
+
|
| 28 |
+
[tool.setuptools]
|
| 29 |
+
include-package-data = true
|
| 30 |
+
packages = ["env", "server"]
|
| 31 |
+
py-modules = ["app", "client", "models"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic>=2.0
|
| 2 |
+
openai>=1.0
|
| 3 |
+
uvicorn
|
| 4 |
+
fastapi
|
| 5 |
+
openenv-core>=0.2.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server package for OpenEnv-compatible app entrypoints."""
|
| 2 |
+
|
| 3 |
+
from .app import app, main
|
| 4 |
+
from .environment import DataCleaningEnv
|
| 5 |
+
|
| 6 |
+
__all__ = ["app", "main", "DataCleaningEnv"]
|
server/app.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from typing import Any, Literal
|
| 5 |
+
|
| 6 |
+
import uvicorn
|
| 7 |
+
from fastapi import Body, FastAPI
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
from models import Action, Observation
|
| 11 |
+
|
| 12 |
+
from .environment import DataCleaningEnv
|
| 13 |
+
|
| 14 |
+
TASKS = ["basic_cleaning", "moderate_cleaning", "full_pipeline"]
|
| 15 |
+
ENV_NAME = "data_cleaning_env"
|
| 16 |
+
ENV_DESCRIPTION = (
|
| 17 |
+
"RL environment for interactive tabular data cleaning and preparation. "
|
| 18 |
+
"Agents must fix missing values, duplicates, dtype issues, category inconsistencies, "
|
| 19 |
+
"and derived-feature requirements."
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
app = FastAPI(title="Data Cleaning OpenEnv", version="1.0.0")
|
| 23 |
+
ENV = DataCleaningEnv()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ResetRequest(BaseModel):
    """Body for POST /reset; selects which benchmark task to load."""

    # Defaults to the easiest task when no body (or no field) is supplied.
    task_name: Literal["basic_cleaning", "moderate_cleaning", "full_pipeline"] = "basic_cleaning"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _metadata() -> dict[str, Any]:
    """Build the static metadata payload shared by the / and /metadata routes."""
    return dict(
        name=ENV_NAME,
        description=ENV_DESCRIPTION,
        version="1.0.0",
        tasks=TASKS,
        mode="simulation",
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@app.get("/")
|
| 41 |
+
def root() -> dict[str, Any]:
|
| 42 |
+
payload = _metadata()
|
| 43 |
+
payload["status"] = "ok"
|
| 44 |
+
return payload
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@app.get("/health")
|
| 48 |
+
def health() -> dict[str, str]:
|
| 49 |
+
return {"status": "healthy"}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.get("/metadata")
|
| 53 |
+
def metadata() -> dict[str, Any]:
|
| 54 |
+
return _metadata()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@app.get("/tasks")
|
| 58 |
+
def list_tasks() -> dict[str, list[str]]:
|
| 59 |
+
return {"tasks": TASKS}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@app.get("/schema")
|
| 63 |
+
def schema() -> dict[str, Any]:
|
| 64 |
+
observation_schema = Observation.model_json_schema()
|
| 65 |
+
return {
|
| 66 |
+
"action": Action.model_json_schema(),
|
| 67 |
+
"observation": observation_schema,
|
| 68 |
+
"state": observation_schema,
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@app.post("/mcp")
|
| 73 |
+
def mcp(payload: dict[str, Any] = Body(default_factory=dict)) -> dict[str, Any]:
|
| 74 |
+
return {
|
| 75 |
+
"jsonrpc": "2.0",
|
| 76 |
+
"id": payload.get("id"),
|
| 77 |
+
"error": {
|
| 78 |
+
"code": -32601,
|
| 79 |
+
"message": "MCP methods are not implemented for this benchmark.",
|
| 80 |
+
},
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@app.post("/reset")
|
| 85 |
+
def reset(request: ResetRequest | None = None) -> dict[str, Any]:
|
| 86 |
+
effective_request = request or ResetRequest()
|
| 87 |
+
ENV.task_name = effective_request.task_name
|
| 88 |
+
observation = ENV.reset()
|
| 89 |
+
return observation.model_dump()
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.post("/step")
|
| 93 |
+
def step(action: Action) -> dict[str, Any]:
|
| 94 |
+
observation, reward, done, info = ENV.step(action)
|
| 95 |
+
return {
|
| 96 |
+
"observation": observation.model_dump(),
|
| 97 |
+
"reward": reward,
|
| 98 |
+
"done": done,
|
| 99 |
+
"info": info,
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@app.get("/state")
|
| 104 |
+
def state() -> dict[str, Any]:
|
| 105 |
+
if not ENV.dataset:
|
| 106 |
+
ENV.reset()
|
| 107 |
+
return ENV.state().model_dump()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def main(host: str | None = None, port: int | None = None) -> None:
    """Launch uvicorn, filling missing host/port from the command line.

    Defaults: ``--host 0.0.0.0 --port 7860`` (the HF Spaces port).
    """
    if host is None or port is None:
        parser = argparse.ArgumentParser()
        parser.add_argument("--host", default="0.0.0.0")
        parser.add_argument("--port", type=int, default=7860)
        # NOTE(review): parse_args() reads sys.argv, so calling main()
        # programmatically while unrelated CLI flags are present could
        # fail on unknown arguments — confirm callers.
        args = parser.parse_args()
        # Explicit keyword arguments take precedence over CLI values.
        host = args.host if host is None else host
        port = args.port if port is None else port
    uvicorn.run(app, host=host, port=port)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
if __name__ == "__main__":
|
| 122 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Canonical server-side environment entrypoint for the data cleaning benchmark."""
|
| 2 |
+
|
| 3 |
+
from env.environment import DataCleaningEnv
|
| 4 |
+
|
| 5 |
+
__all__ = ["DataCleaningEnv"]
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core>=0.2.0
|
| 2 |
+
fastapi>=0.110.0
|
| 3 |
+
openai>=1.0
|
| 4 |
+
pydantic>=2.0
|
| 5 |
+
uvicorn>=0.30.0
|
test_env.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from fastapi.testclient import TestClient
|
| 5 |
+
|
| 6 |
+
from app import app
|
| 7 |
+
from env.environment import DataCleaningEnv
|
| 8 |
+
from env.graders import DataCleaningGrader
|
| 9 |
+
from env.models import Action
|
| 10 |
+
|
| 11 |
+
ROOT = Path(__file__).resolve().parent
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def assert_invalid_action_consumes_step() -> None:
    """An invalid action must cost a step and return the flat -0.05 penalty."""
    environment = DataCleaningEnv("basic_cleaning")
    initial_obs = environment.reset()
    premature_convert = Action(
        action_type="convert_dtype", column="age", params={"target_dtype": "int"}
    )
    _, reward, _, info = environment.step(premature_convert)
    assert reward == -0.05
    assert info["error"] == "invalid_action"
    # The step budget still shrinks even though the action was rejected.
    assert environment.steps_remaining == initial_obs.steps_remaining - 1
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def assert_dependency_gate() -> None:
    """Converting a dtype before its prerequisite issue is fixed must fail."""
    environment = DataCleaningEnv("moderate_cleaning")
    environment.reset()
    out_of_order = Action(
        action_type="convert_dtype", column="salary", params={"target_dtype": "int"}
    )
    _, reward, _, info = environment.step(out_of_order)
    assert reward == -0.05
    assert info["error"] == "invalid_action"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def assert_api_contract() -> None:
    """Smoke-test every HTTP endpoint of the FastAPI app via TestClient."""
    client = TestClient(app)

    # Root endpoint returns metadata plus status.
    root_response = client.get("/")
    assert root_response.status_code == 200
    assert root_response.json()["name"] == "data_cleaning_env"

    assert client.get("/health").json()["status"] == "healthy"

    # /metadata mirrors the root payload's identity fields.
    metadata_response = client.get("/metadata")
    assert metadata_response.status_code == 200
    metadata_payload = metadata_response.json()
    assert metadata_payload["name"] == "data_cleaning_env"
    assert "description" in metadata_payload

    # /schema must expose action, observation, and state schemas.
    schema_response = client.get("/schema")
    assert schema_response.status_code == 200
    schema_payload = schema_response.json()
    assert {"action", "observation", "state"} <= set(schema_payload.keys())

    # /reset returns the initial observation for the requested task.
    reset_response = client.post("/reset", json={"task_name": "basic_cleaning"})
    assert reset_response.status_code == 200
    assert "pending_issues" in reset_response.json()

    # /step returns a full transition tuple as a JSON object.
    step_response = client.post(
        "/step",
        json={"action_type": "fill_missing", "column": "age", "params": {"strategy": "mean"}},
    )
    assert step_response.status_code == 200
    assert {"observation", "reward", "done", "info"} <= set(step_response.json().keys())

    state_response = client.get("/state")
    assert state_response.status_code == 200
    assert "quality_score" in state_response.json()

    # /mcp is a stub that echoes a JSON-RPC error envelope.
    mcp_response = client.post("/mcp", json={"jsonrpc": "2.0", "id": "smoke"})
    assert mcp_response.status_code == 200
    assert mcp_response.json()["jsonrpc"] == "2.0"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def run_sequence(task_name: str, actions: list[Action], expected_issues: int) -> tuple[dict, float]:
    """Replay *actions* against a fresh env; return (final_state, grader_score).

    Asserts that the task starts with *expected_issues* pending issues, that
    every scripted action is accepted, and that quality never regresses.
    """
    env = DataCleaningEnv(task_name)
    obs = env.reset()
    assert len(obs.pending_issues) == expected_issues, (task_name, len(obs.pending_issues), expected_issues)
    initial_quality = obs.quality_score

    for action in actions:
        obs, reward, done, info = env.step(action)
        # Any "error" key means the scripted action was rejected.
        assert "error" not in info, (task_name, action, info)
        if done:
            break

    assert obs.quality_score >= initial_quality
    final_state = obs.model_dump()
    # Grade with the step budget from the task's JSON config on disk.
    config = json.loads((ROOT / "data" / f"{task_name}.json").read_text(encoding="utf-8"))
    score = DataCleaningGrader().grade(
        final_state,
        {
            "total_issues": expected_issues,
            "max_steps": config["max_steps"],
        },
    )
    return final_state, score
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main() -> None:
    """Run all environment smoke tests, then the three scripted task solutions."""
    assert_invalid_action_consumes_step()
    assert_dependency_gate()
    assert_api_contract()

    # Per-task golden action sequences and expected initial issue counts.
    sequences = {
        "basic_cleaning": (
            [
                Action(action_type="fill_missing", column="age", params={"strategy": "mean"}),
                Action(action_type="fill_missing", column="salary", params={"strategy": "median"}),
            ],
            2,
        ),
        "moderate_cleaning": (
            [
                Action(action_type="fill_missing", column="age", params={"strategy": "mean"}),
                Action(action_type="fill_missing", column="years_exp", params={"strategy": "median"}),
                Action(action_type="fill_missing", column="salary", params={"strategy": "median"}),
                Action(action_type="convert_dtype", column="salary", params={"target_dtype": "int"}),
                Action(action_type="drop_duplicates", column="__all__", params={}),
            ],
            5,
        ),
        "full_pipeline": (
            [
                Action(action_type="fill_missing", column="age", params={"strategy": "mean"}),
                Action(action_type="fill_missing", column="years_exp", params={"strategy": "median"}),
                Action(action_type="fill_missing", column="rating", params={"strategy": "mean"}),
                Action(action_type="fill_missing", column="salary", params={"strategy": "median"}),
                Action(action_type="convert_dtype", column="salary", params={"target_dtype": "int"}),
                Action(action_type="convert_dtype", column="rating", params={"target_dtype": "float"}),
                Action(action_type="normalize_category", column="city", params={}),
                Action(action_type="normalize_category", column="department", params={}),
                Action(action_type="create_feature", column="age_group", params={"feature_name": "age_group"}),
                Action(action_type="drop_duplicates", column="__all__", params={}),
            ],
            10,
        ),
    }

    for task_name, (actions, expected_issues) in sequences.items():
        final_state, score = run_sequence(task_name, actions, expected_issues)
        pending = len(final_state["pending_issues"])
        resolved = len(final_state["resolved_issues"])
        print(
            f"{task_name}: pending={pending} resolved={resolved} "
            f"steps_remaining={final_state['steps_remaining']} grader_score={score}"
        )
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
if __name__ == "__main__":
|
| 151 |
+
main()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|