Graheet committed
Commit 2922c03 · 1 Parent(s): 1301c85

Deploy dataops-env Space

.dataops_policy_cache.json ADDED
@@ -0,0 +1,247 @@
+ {
+   "patterns": {
+     "0fa3f35b7f17ba183b55874ed34f34b9dfd42a96": {
+       "actions": {
+         "fill_missing(row_id=34, column='city', value='Denver')": {
+           "attempts": 1,
+           "cumulative_reward": 0.47,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "11ef7431147857285b2046abba4c938d39542f56": {
+       "actions": {
+         "normalize_column(column='email')": {
+           "attempts": 1,
+           "cumulative_reward": 0.2738,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "468a9ffd1e161a3b120e50a2d283b5a2bea91b78": {
+       "actions": {
+         "delete_row(row_id=23)": {
+           "attempts": 1,
+           "cumulative_reward": 0.13,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "47741633a233a9bfecedd98ec5d8a0cafed7aa58": {
+       "actions": {
+         "normalize_column(column='city')": {
+           "attempts": 1,
+           "cumulative_reward": 0.4363,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "84f1a657f98b24e5c81396ebede79cb10b4bd759": {
+       "actions": {
+         "remove_duplicate(row_id=33)": {
+           "attempts": 1,
+           "cumulative_reward": 0.43,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "a4f3a4b48581add24c1541d266ca386ca1048501": {
+       "actions": {
+         "normalize_column(column='name')": {
+           "attempts": 1,
+           "cumulative_reward": 0.2363,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "b2557fa133ca411bea9c1d016bb1022ef521dd12": {
+       "actions": {
+         "delete_row(row_id=26)": {
+           "attempts": 1,
+           "cumulative_reward": 0.2657,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "bac5c03eaec86e5eca3602c18787e7b8dd71ce77": {
+       "actions": {
+         "fill_missing(row_id=35, column='email', value='peak.systems@example.com')": {
+           "attempts": 1,
+           "cumulative_reward": 0.24,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "c9a1695ddb1fd7a56fcd89d0855f3e6c3cc0539f": {
+       "actions": {
+         "remove_duplicate(row_id=13)": {
+           "attempts": 1,
+           "cumulative_reward": 0.3737,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "e133231c2a2fe412ce1265f027ce61ff2616d497": {
+       "actions": {
+         "remove_duplicate(row_id=22)": {
+           "attempts": 1,
+           "cumulative_reward": 0.3443,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     }
+   },
+   "states": {
+     "0cc9851ce3eac00c3a8ba813a0bbe5086b6b28c7": {
+       "actions": {
+         "fill_missing(row_id=35, column='email', value='peak.systems@example.com')": {
+           "attempts": 1,
+           "cumulative_reward": 0.24,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "338f581f2bb3d33b1ca9113338487576d7221075": {
+       "actions": {
+         "remove_duplicate(row_id=13)": {
+           "attempts": 1,
+           "cumulative_reward": 0.3737,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "3641040a720ad30712929dfc63af936f954c7048": {
+       "actions": {
+         "delete_row(row_id=26)": {
+           "attempts": 1,
+           "cumulative_reward": 0.2657,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "4cdcb94d06f82ed5c380e36d4737c98095c1d8e5": {
+       "actions": {
+         "fill_missing(row_id=34, column='city', value='Denver')": {
+           "attempts": 1,
+           "cumulative_reward": 0.47,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "72db3bd5a5ba3a4f64815e20b59ccf3b9ce6b2ad": {
+       "actions": {
+         "remove_duplicate(row_id=33)": {
+           "attempts": 1,
+           "cumulative_reward": 0.43,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "a3e1f7d06da6a2a3bfc3ca13a3a9ae20ff116705": {
+       "actions": {
+         "delete_row(row_id=23)": {
+           "attempts": 1,
+           "cumulative_reward": 0.13,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "be2f2def033a20dab83499d31cbe7ac8ba296d8e": {
+       "actions": {
+         "normalize_column(column='name')": {
+           "attempts": 1,
+           "cumulative_reward": 0.2363,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "c02de8edbfcfa72f061a9ac1827b98c070f1208e": {
+       "actions": {
+         "remove_duplicate(row_id=22)": {
+           "attempts": 1,
+           "cumulative_reward": 0.3443,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     },
+     "d30ee334a5d05193f192a491cf4e069c1b494b21": {
+       "actions": {
+         "normalize_column(column='city')": {
+           "attempts": 1,
+           "cumulative_reward": 0.4363,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 0,
+           "successes": 1
+         }
+       }
+     },
+     "f1c2df4babd9a79a88196afded5618bb0ce3d72b": {
+       "actions": {
+         "normalize_column(column='email')": {
+           "attempts": 1,
+           "cumulative_reward": 0.2738,
+           "failures": 0,
+           "last_error": null,
+           "progresses": 1,
+           "successes": 0
+         }
+       }
+     }
+   },
+   "version": 1
+ }
.dockerignore ADDED
@@ -0,0 +1,13 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .git/
+ .venv/
+ venv/
+ dist/
+ build/
+ .pytest_cache/
+ .mypy_cache/
+ agent-tools/
+ terminals/
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.10-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONPATH=/app
+ ENV PORT=7860
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ # Non-root user
+ RUN useradd -m appuser
+ USER appuser
+
+ EXPOSE 7860
+
+ CMD ["sh", "-c", "uvicorn server.app:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.md CHANGED
@@ -1,10 +1,344 @@
  ---
+
  title: Dataops Env
  emoji: 📊
  colorFrom: indigo
  colorTo: gray
  sdk: docker
+ app_port: 7860
  pinned: false
+
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # `dataops-env`
+
+ `dataops-env` is an OpenEnv benchmark for training and evaluating agents on
+ multi-step data operations work. Instead of a single obvious cleanup action, an
+ agent must inspect messy business tables, choose corrective actions in the right
+ order, preserve valid-but-unusual records, and know when the table is truly
+ ready for validation.
+
+ It exposes the standard `reset()`, `step(action)`, and `state()` interface,
+ ships with a production-ready FastAPI server and Docker image, and includes a
+ reproducible OpenAI-compatible baseline runner.
+
+ ## Benchmark Purpose
+
+ Many toy data-cleaning tasks reward shallow pattern matching. Real operational
+ data work is harder:
+
+ - duplicates may be safe to remove, but conflicting rows require judgment
+ - some malformed values should be normalized, while unusual valid values must be preserved
+ - deletion is often the riskiest action, not the default fix
+ - agents need partial credit for progress, but strong penalties for repeated mistakes
+
+ `dataops-env` is designed to capture those decisions in a compact benchmark that
+ is still easy to run, validate, and deploy in the OpenEnv ecosystem.
+
+ ## Why It Feels Real
+
+ The environment models common enterprise data quality problems:
+
+ - exact duplicates in customer or vendor master data
+ - missing required fields
+ - inconsistent casing in names and locations
+ - invalid email and phone formats
+ - conflicting records for the same real-world entity
+ - uniqueness constraints such as shared-email violations
+ - trap rows that look suspicious but are actually valid
+
+ Agents are rewarded for minimal corrective behavior and punished for destructive
+ or repetitive actions. That makes the environment useful for both learning and
+ evaluation.
+
+ ## Task Families
+
+ The benchmark keeps the hackathon-friendly `easy`, `medium`, and `hard` task
+ structure, while each family now contains deterministic variants so policies
+ cannot overfit a single table.
+
+ 1. `easy`
+    Remove duplicates and fill missing required fields.
+ 2. `medium`
+    Remove duplicates, normalize casing, and repair invalid emails.
+ 3. `hard`
+    Resolve conflicts, enforce unique-email constraints, fix invalid formats,
+    and preserve valid trap rows.
+
+ Each task definition includes the following fields; an illustrative shape
+ follows the list.
+
+ - `goal`
+ - `difficulty`
+ - `variant_id`
+ - `required_columns`
+ - `hidden_issues`
+ - `constraints`
+ - `expected_outcome`
+ - `max_steps`
+
+ ## Learning Signals
80
+
81
+ The environment provides both dense rewards and a deterministic final score:
82
+
83
+ - partial rewards for duplicate removal, normalization, and filling missing values
84
+ - step costs and no-progress penalties to discourage random actions
85
+ - escalating penalties for repeated mistakes
86
+ - destructive-action penalties for harmful deletions
87
+ - proactive hints after recurring failures
88
+ - final task scoring on a strict `0.0` to `1.0` scale
89
+
90
+ The final task score and the visible validation failures are produced from the
91
+ same explicit rule set, reducing mismatch between what the agent sees and how it
92
+ is ultimately judged.
93
+
94
+ ## Action Space
95
+
96
+ Agents interact with the environment through a typed `Action` object.
97
+
98
+ Supported action types:
99
+
100
+ - `remove_duplicate`
101
+ Remove one row from an exact duplicate group. Can be called with an explicit
102
+ `row_id`, or the environment can choose the default duplicate target.
103
+ - `fill_missing`
104
+ Fill a missing field on a target row. Requires `column` and `value`, and may
105
+ also include `row_id`.
106
+ - `normalize_column`
107
+ Apply deterministic normalization to a supported column such as `name`,
108
+ `city`, `email`, or `phone`.
109
+ - `delete_row`
110
+ Delete a row when doing so resolves a structural issue like a conflict or a
111
+ uniqueness violation. Requires `row_id`.
112
+ - `validate`
113
+ Signal that the agent believes the table is ready for completion.
114
+ - `noop`
115
+ Explicitly take no action. This is allowed but penalized when unresolved
116
+ issues remain.
117
+
118
+ Typed action schema:
119
+
120
+ - `action_id: Optional[str]`
121
+ - `action_type: Literal["remove_duplicate", "fill_missing", "normalize_column", "delete_row", "validate", "noop"]`
122
+ - `column: Optional[str]`
123
+ - `row_id: Optional[int]`
124
+ - `value: Optional[str]`
125
+
126
+ Validation rules:
127
+
128
+ - `delete_row` requires `row_id`
129
+ - `normalize_column` requires `column`
130
+ - `fill_missing` requires `column` and `value`
131
+
132
+ Example actions:
133
+
134
+ ```json
135
+ {"action_id":"step-001","action_type":"remove_duplicate","row_id":33}
136
+ {"action_id":"step-002","action_type":"fill_missing","row_id":35,"column":"email","value":"peak.systems@example.com"}
137
+ {"action_id":"step-003","action_type":"normalize_column","column":"email"}
138
+ {"action_id":"step-004","action_type":"validate"}
139
+ ```
140
+
141
+ ## Observation Space
142
+
143
+ The environment returns a typed `Observation` object after `reset()` and each
144
+ call to `step()`.
145
+
146
+ Observation fields:
147
+
148
+ - `goal: str`
149
+ Natural-language description of what the agent should accomplish.
150
+ - `table: List[Dict[str, Any]]`
151
+ Current JSON-serializable table snapshot.
152
+ - `issues: List[str]`
153
+ Human-readable unresolved issues and validation failures.
154
+ - `history: List[str]`
155
+ Ordered record of previous actions/events in the current episode.
156
+ - `mistakes: Dict[str, int]`
157
+ Counts of repeated mistake categories tracked during the episode.
158
+ - `hints: List[str]`
159
+ Proactive or reactive guidance derived from issue state and prior failures.
160
+ - `progress: float`
161
+ Normalized progress estimate in `[0.0, 1.0]`.
162
+ - `steps_remaining: int`
163
+ Number of remaining actions before the episode terminates.
164
+
165
+ Example observation shape:
166
+
167
+ ```json
168
+ {
169
+ "goal": "Normalize the dataset by fixing casing, removing duplicates, and correcting invalid email formats.",
170
+ "table": [
171
+ {"row_id": 10, "customer_id": "C100", "name": "jane miller", "city": "new york", "email": "jane.miller@example.com"}
172
+ ],
173
+ "issues": [
174
+ "Rows 11 and 13 are duplicates and only one should remain."
175
+ ],
176
+ "history": [],
177
+ "mistakes": {},
178
+ "hints": [],
179
+ "progress": 0.0,
180
+ "steps_remaining": 9
181
+ }
182
+ ```
183
+
184
+ ## Expected Agent Behavior
185
+
186
+ A strong agent should behave roughly like this:
187
+
188
+ 1. inspect the visible table and unresolved issues
189
+ 2. remove safe duplicates first
190
+ 3. repair missing or malformed values without over-editing valid rows
191
+ 4. resolve structural conflicts carefully, especially in hard tasks
192
+ 5. validate only when the remaining issue list is empty
193
+
194
+ Example successful baseline trace:
195
+
196
+ ```text
197
+ [START] task=medium env=dataops-env model=your-model
198
+ [STEP] step=1 action=remove_duplicate(row_id=13) reward=0.37 done=false error=null
199
+ [STEP] step=2 action=normalize_column(column='email') reward=0.27 done=false error=null
200
+ [STEP] step=3 action=normalize_column(column='name') reward=0.24 done=false error=null
201
+ [STEP] step=4 action=normalize_column(column='city') reward=0.44 done=true error=null
202
+ [END] success=true steps=4 rewards=0.37,0.27,0.24,0.44
203
+ ```
204
+
205
+ ## Project Layout
206
+
207
+ - `env.py`: core `DataOpsEnv` implementation
208
+ - `task.py`: task families and deterministic variants
209
+ - `models.py`: typed `Action`, `Observation`, and `Reward` contracts
210
+ - `grader.py`: dense rewards, explicit validation checks, and final task scoring
211
+ - `server/app.py`: FastAPI runtime API
212
+ - `inference.py`: hybrid heuristic/model baseline runner
213
+ - `openenv.yaml`: OpenEnv metadata and task registration
214
+ - `pyproject.toml`: package metadata and server script entry point
215
+ - `Dockerfile`: production container image
216
+
217
+ ## Local Setup
218
+
219
+ ```bash
220
+ pip install -r requirements.txt
221
+ openenv validate
222
+ ```
223
+
224
+ Run the FastAPI server:
225
+
226
+ ```bash
227
+ python -m server.app
228
+ ```
229
+
230
+ By default, the local server runs on port `8000`.
231
+
232
+ Or use the packaged entry point:
233
+
234
+ ```bash
235
+ server
236
+ ```
237
+
238
+ ## API
239
+
240
+ Health check:
241
+
242
+ ```bash
243
+ curl http://localhost:8000/health
244
+ ```
245
+
246
+ Create a session with an optional seed and task selection:
247
+
248
+ ```bash
249
+ curl -X POST http://localhost:8000/reset \
250
+ -H "Content-Type: application/json" \
251
+ -d '{"seed": 0, "task_name": "easy"}'
252
+ ```
253
+
254
+ Step the environment:
255
+
256
+ ```bash
257
+ curl -X POST "http://localhost:8000/step" \
258
+ -H "Content-Type: application/json" \
259
+ -d '{"action_id":"step-001","action_type":"validate"}'
260
+ ```
261
+
262
+ Read internal state:
263
+
264
+ ```bash
265
+ curl "http://localhost:8000/state"
266
+ ```
267
+
268
+ ## Baseline Inference
269
+
270
+ The baseline runner now combines deterministic local planning with optional
271
+ model arbitration. The local planner proposes ranked candidate actions from the
272
+ visible table state, and the model is constrained to choose only from those
273
+ candidates. This avoids many common failure modes such as invalid actions,
274
+ repeated no-op loops, and reckless deletion choices.
275
+
276
+ Run it with an OpenAI-compatible endpoint:
277
+
278
+ ```bash
279
+ set HF_TOKEN=your_token
280
+ set MODEL_NAME=your_model
281
+ set API_BASE_URL=https://router.huggingface.co/v1
282
+ python inference.py
283
+ ```
284
+
285
+ Key properties:
286
+
287
+ - strict `[START]`, `[STEP]`, and `[END]` output formatting
288
+ - fixed task ordering for reproducibility
289
+ - retry logic for invalid or blocked model suggestions
290
+ - strong heuristic fallback when the model is unavailable
291
+ - action filtering based on prior no-progress or errorful behavior
292
+
293
+ ## Docker
294
+
295
+ Build:
296
+
297
+ ```bash
298
+ docker build -t dataops-env .
299
+ ```
300
+
301
+ Run locally:
302
+
303
+ ```bash
304
+ docker run -p 8000:8000 dataops-env
305
+ ```
306
+
307
+ ## Hugging Face Spaces Notes
308
+
309
+ For Hugging Face `Docker` Spaces, the container should normally listen on port
310
+ `7860`, or the Space must be explicitly configured to expect a different
311
+ internal port.
312
+
313
+ If you keep the current container on port `8000`, make sure your Space is
314
+ configured with:
315
+
316
+ ```yaml
317
+ app_port: 8000
318
+ ```
319
+
320
+ If you want the simplest Hugging Face Spaces setup, change the container to use
321
+ port `7860` instead:
322
+
323
+ ```dockerfile
324
+ EXPOSE 7860
325
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
326
+ ```
327
+
328
+ Then local Docker testing would become:
329
+
330
+ ```bash
331
+ docker run -p 7860:7860 dataops-env
332
+ curl http://localhost:7860/health
333
+ ```
334
+
335
+ ## Submission Notes
336
+
337
+ - `openenv validate` passes
338
+ - the server and Docker image run successfully
339
+ - the packaged benchmark supports multi-mode deployment
340
+ - the default baseline now completes the public task families deterministically
341
+
342
+ Leaderboard performance will still depend on the quality of the external model,
343
+ but the repository is now structured and documented like a serious benchmark
344
+ submission rather than a starter scaffold.
__pycache__/env.cpython-313.pyc ADDED
Binary file (37.9 kB).
 
__pycache__/grader.cpython-313.pyc ADDED
Binary file (23.8 kB).
 
__pycache__/inference.cpython-313.pyc ADDED
Binary file (47.4 kB).
 
__pycache__/models.cpython-313.pyc ADDED
Binary file (5.49 kB).
 
__pycache__/task.cpython-313.pyc ADDED
Binary file (13.4 kB).
 
env.py ADDED
@@ -0,0 +1,756 @@
+ """OpenEnv environment entrypoint for ``dataops-gym``.
+
+ This module is responsible for declaring top-level environment metadata,
+ configuration wiring, and lifecycle integration points for the OpenEnv runtime.
+ """
+
+ from __future__ import annotations
+
+ from copy import deepcopy
+ import random
+ import re
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple
+
+ from grader import grade_step_details, grade_task_result, task_failure_messages
+ from models import Action, Observation
+ from task import (
+     HiddenIssue,
+     TaskDefinition,
+     easy_cleaning_task,
+     hard_conflict_resolution_task,
+     medium_normalization_task,
+ )
+
+
+ EMAIL_PATTERN = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
+
+
+ class DataOpsEnv:
+     """Deterministic multi-step data-cleaning environment for OpenEnv."""
+
+     def __init__(self, seed: int = 0, task_name: Optional[str] = None) -> None:
+         """Initialize the environment with deterministic task sampling."""
+
+         self._seed = seed
+         self._rng = random.Random(seed)
+         self._task_registry: List[Tuple[str, Any]] = [
+             ("easy", easy_cleaning_task),
+             ("medium", medium_normalization_task),
+             ("hard", hard_conflict_resolution_task),
+         ]
+         self._fixed_task_name = task_name
+         self._global_mistake_memory: Dict[str, int] = {}
+         self._state_data: Dict[str, Any] = {}
+
+     def reset(self) -> Observation:
+         """Load a random task, initialize episode state, and return an observation."""
+
+         task_name, task_factory = self._select_task_factory()
+         variant_count = max(1, int(getattr(task_factory, "variant_count", 1)))
+         variant_index = self._rng.randrange(variant_count)
+         task_definition = deepcopy(task_factory(variant=variant_index))
+         initial_table = deepcopy(task_definition["initial_table"])
+         initial_table_by_row_id = self._table_by_row_id(initial_table)
+
+         self._state_data = {
+             "seed": self._seed,
+             "task_name": task_name,
+             "task_variant": task_definition.get("variant_id", f"{task_name}_variant_{variant_index}"),
+             "task": task_definition,
+             "table": initial_table,
+             "history": [],
+             "mistakes": {},
+             "mistake_memory": [],
+             "hints": [],
+             "steps_taken": 0,
+             "steps_remaining": task_definition["max_steps"],
+             "done": False,
+             "last_reward_components": {},
+             "last_info": {},
+             "last_task_score": 0.0,
+             "initial_issue_count": 1,
+             "initial_table_by_row_id": initial_table_by_row_id,
+         }
+         initial_issue_count = len(self._current_issue_messages(initial_table, task_definition))
+         self._state_data["initial_issue_count"] = max(1, initial_issue_count)
+         return self._build_observation()
+
+     def step(
+         self, action: Action | Mapping[str, Any]
+     ) -> Tuple[Observation, float, bool, Dict[str, Any]]:
+         """Apply one action, score it, update state, and return a gym-style step tuple."""
+
+         if not self._state_data:
+             raise RuntimeError("Environment must be reset before calling step().")
+         if self._state_data.get("done", False):
+             raise RuntimeError("Episode is finished. Call reset() before stepping again.")
+
+         parsed_action, action_error = self._coerce_action(action)
+         task_definition: TaskDefinition = self._state_data["task"]
+         table_before = deepcopy(self._state_data["table"])
+         issues_before = self._current_issue_messages(table_before, task_definition)
+
+         result: Dict[str, Any] = {
+             "mistake_keys": [],
+             "error_type": "general",
+         }
+
+         if action_error is not None:
+             parsed_action = Action(action_type="noop")
+             result["noop"] = True
+             result["unnecessary_action"] = True
+             result["error_type"] = "invalid_action"
+             result["mistake_keys"].append("invalid_action:general")
+             history_entry = f"invalid_action({action_error})"
+         else:
+             history_entry = self._apply_action(parsed_action, result)
+
+         self._state_data["history"].append(history_entry)
+         self._state_data["steps_taken"] += 1
+         self._state_data["steps_remaining"] = max(
+             0, task_definition["max_steps"] - self._state_data["steps_taken"]
+         )
+
+         table_after = deepcopy(self._state_data["table"])
+         issues_after = self._current_issue_messages(table_after, task_definition)
+         self._populate_result_signals(
+             parsed_action,
+             table_before,
+             table_after,
+             issues_before,
+             issues_after,
+             result,
+         )
+
+         reward, components = grade_step_details(
+             self._state_data, parsed_action.model_dump(), result
+         )
+         self._record_mistake_memory(parsed_action, result)
+         self._update_hints(result, issues_after)
+
+         done = not issues_after or self._state_data["steps_remaining"] <= 0
+         self._state_data["done"] = done
+         task_score = grade_task_result(
+             task_definition, self._state_data["table"], self._state_data
+         )
+         self._state_data["last_task_score"] = task_score
+
+         observation = self._build_observation()
+         info = {
+             "task_name": self._state_data["task_name"],
+             "task_variant": self._state_data["task_variant"],
+             "difficulty": task_definition["difficulty"],
+             "reward_components": components,
+             "mistakes": deepcopy(self._state_data["mistakes"]),
+             "hints": list(self._state_data["hints"]),
+             "issues_remaining": len(issues_after),
+             "done_reason": "resolved" if not issues_after else "max_steps" if done else None,
+             "task_score": task_score,
+             "result": deepcopy(result),
+         }
+         self._state_data["last_reward_components"] = deepcopy(components)
+         self._state_data["last_info"] = deepcopy(info)
+         return observation, reward, done, info
+
+     def state(self) -> Dict[str, Any]:
+         """Return a deep copy of the internal environment state."""
+
+         return deepcopy(self._state_data)
+
+     def close(self) -> None:
+         """Release environment state for callers using explicit lifecycle cleanup."""
+
+         self._state_data = {}
+
+     def _select_task_factory(self) -> Tuple[str, Any]:
+         """Pick the configured task factory deterministically."""
+
+         if self._fixed_task_name is None:
+             return self._rng.choice(self._task_registry)
+
+         for task_name, task_factory in self._task_registry:
+             if self._fixed_task_name in {task_name, task_factory.__name__}:
+                 return task_name, task_factory
+
+         raise ValueError(f"Unknown task_name: {self._fixed_task_name}")
+
+     def _coerce_action(
+         self, action: Action | Mapping[str, Any]
+     ) -> Tuple[Optional[Action], Optional[str]]:
+         """Convert user input into an ``Action`` model without raising outward."""
+
+         if isinstance(action, Action):
+             return action, None
+
+         try:
+             return Action(**dict(action)), None
+         except Exception as exc:  # pragma: no cover - defensive runtime boundary
+             return None, str(exc)
+
+     def _apply_action(self, action: Action, result: MutableMapping[str, Any]) -> str:
+         """Apply a single action to the current table and capture side effects."""
+
+         if action.action_type == "noop":
+             result["noop"] = True
+             result["mistake_keys"].append(f"{action.action_type}:noop")
+             return self._format_history(action)
+
+         if action.action_type == "remove_duplicate":
+             self._remove_duplicate(action, result)
+             return self._format_history(action)
+
+         if action.action_type == "delete_row":
+             self._delete_row(action, result)
+             return self._format_history(action)
+
+         if action.action_type == "fill_missing":
+             self._fill_missing(action, result)
+             return self._format_history(action)
+
+         if action.action_type == "normalize_column":
+             self._normalize_column(action, result)
+             return self._format_history(action)
+
+         if action.action_type == "validate":
+             return self._format_history(action)
+
+         result["unnecessary_action"] = True
+         result["error_type"] = "unsupported_action"
+         result["mistake_keys"].append(f"{action.action_type}:unsupported_action")
+         return self._format_history(action)
+
+     def _remove_duplicate(
+         self, action: Action, result: MutableMapping[str, Any]
+     ) -> None:
+         """Remove a duplicate row when the target belongs to a duplicate issue."""
+
+         duplicate_groups = [
+             issue
+             for issue in self._state_data["task"]["hidden_issues"]
+             if issue["type"] == "duplicate" and self._is_issue_unresolved(issue, self._state_data["table"])
+         ]
+         if not duplicate_groups:
+             result["unnecessary_action"] = True
+             result["error_type"] = "no_duplicate_available"
+             return
+
+         candidate_rows = set(duplicate_groups[0].get("rows", []))
+         target_row_id = action.row_id or max(candidate_rows)
+
+         if target_row_id not in candidate_rows:
+             result["unnecessary_action"] = True
+             result["error_type"] = "invalid_duplicate_target"
+             return
+
+         removed = self._remove_row_by_id(target_row_id)
+         if not removed:
+             result["unnecessary_action"] = True
+             result["error_type"] = "missing_row"
+
+     def _delete_row(self, action: Action, result: MutableMapping[str, Any]) -> None:
+         """Delete a row and mark destructive behavior when the target is unsafe."""
+
+         target_row = self._get_row_by_id(action.row_id)
+         if target_row is None:
+             result["unnecessary_action"] = True
+             result["error_type"] = "missing_row"
+             return
+
+         if self._row_is_protected(action.row_id):
+             result["wrong_deletion"] = True
+             result["destructive_action"] = True
+             result["error_type"] = "protected_row"
+             result["mistake_keys"].append(f"{action.action_type}:protected_row")
+         elif not self._row_belongs_to_removable_issue(action.row_id):
+             result["wrong_deletion"] = True
+             result["destructive_action"] = True
+             result["error_type"] = "wrong_deletion"
+             result["mistake_keys"].append(f"{action.action_type}:wrong_deletion")
+
+         self._remove_row_by_id(action.row_id)
+
+     def _fill_missing(self, action: Action, result: MutableMapping[str, Any]) -> None:
+         """Fill a missing field on the target row or the first matching missing cell."""
+
+         target_row = self._resolve_missing_target_row(action.row_id, action.column)
+         if target_row is None or action.column is None:
+             result["unnecessary_action"] = True
+             result["error_type"] = "missing_target"
+             return
+
+         if not self._is_missing_value(target_row.get(action.column)):
+             result["unnecessary_action"] = True
+             result["error_type"] = "cell_not_missing"
+             return
+
+         target_row[action.column] = action.value
+
+     def _normalize_column(self, action: Action, result: MutableMapping[str, Any]) -> None:
+         """Normalize a supported column using deterministic, minimal edits."""
+
+         if action.column is None:
+             result["unnecessary_action"] = True
+             result["error_type"] = "missing_column"
+             return
+
+         changed_rows = 0
+         for row in self._state_data["table"]:
+             original = row.get(action.column)
+             normalized = self._normalized_value(action.column, original)
+             if normalized is None or normalized == original:
+                 continue
+
+             # Keep trap rows stable unless the value is actually invalid.
+             if self._row_is_protected(row.get("row_id")) and self._value_is_valid(
+                 action.column, original
+             ):
+                 continue
+
+             row[action.column] = normalized
+             changed_rows += 1
+
+         if changed_rows == 0:
+             result["unnecessary_action"] = True
+             result["error_type"] = "no_normalization_needed"
+
+     def _populate_result_signals(
+         self,
+         action: Action,
+         table_before: List[Dict[str, Any]],
+         table_after: List[Dict[str, Any]],
+         issues_before: List[str],
+         issues_after: List[str],
+         result: MutableMapping[str, Any],
+     ) -> None:
+         """Derive reward signals from before/after state transitions."""
+
+         task_definition: TaskDefinition = self._state_data["task"]
+         hidden_before = self._issue_type_counts(table_before, task_definition)
+         hidden_after = self._issue_type_counts(table_after, task_definition)
+
+         if hidden_after.get("duplicate", 0) < hidden_before.get("duplicate", 0):
+             result["correct_duplicate_removal"] = True
+
+         if hidden_after.get("missing_value", 0) < hidden_before.get("missing_value", 0):
+             result["fixed_missing_value"] = True
+
+         normalization_before = hidden_before.get("inconsistent_casing", 0) + hidden_before.get(
+             "invalid_format", 0
+         )
+         normalization_after = hidden_after.get("inconsistent_casing", 0) + hidden_after.get(
+             "invalid_format", 0
+         )
+         if (
+             action.action_type == "normalize_column"
+             and normalization_after < normalization_before
+         ):
+             result["correct_normalization"] = True
+
+         if action.action_type == "validate" and not issues_after:
+             result["validation_success"] = True
+             result["task_completed"] = True
+
+         if not issues_after:
+             result["task_completed"] = True
+
+         issue_delta = max(0, len(issues_before) - len(issues_after))
+         result["progress_delta"] = round(
+             issue_delta / float(self._state_data["initial_issue_count"]),
+             4,
+         )
+
+         if issue_delta > 0 and any(self._state_data["mistakes"].values()):
+             result["corrected_previous_mistake"] = True
+
+         if action.action_type == "noop" and issues_after:
+             result["unnecessary_action"] = True
+             result["error_type"] = result.get("error_type", "noop")
+
+     def _build_observation(self) -> Observation:
+         """Construct the typed observation returned to callers."""
+
+         task_definition: TaskDefinition = self._state_data["task"]
+         issue_messages = self._current_issue_messages(self._state_data["table"], task_definition)
+         progress = self._compute_progress(issue_messages)
+         return Observation(
+             goal=task_definition["goal"],
+             table=deepcopy(self._state_data["table"]),
+             issues=issue_messages,
+             history=list(self._state_data["history"]),
+             mistakes=deepcopy(self._state_data["mistakes"]),
+             hints=list(self._state_data["hints"]),
+             progress=progress,
+             steps_remaining=int(self._state_data["steps_remaining"]),
+         )
+
+     def _compute_progress(self, issue_messages: List[str]) -> float:
+         """Estimate progress from the current unresolved issue count."""
+
+         baseline = float(self._state_data["initial_issue_count"])
+         remaining = min(len(issue_messages), self._state_data["initial_issue_count"])
+         resolved_fraction = 1.0 - (remaining / baseline)
+         return round(max(0.0, min(1.0, resolved_fraction)), 4)
+
+     def _current_issue_messages(
+         self, table: List[Dict[str, Any]], task_definition: TaskDefinition
+     ) -> List[str]:
+         """Return unresolved issue descriptions plus validation-rule failures."""
+
+         messages: List[str] = []
+         for issue in task_definition["hidden_issues"]:
+             if self._is_issue_unresolved(issue, table):
+                 description = issue.get("description")
+                 if description:
+                     messages.append(description)
+
+         messages.extend(self._validation_failures(table, task_definition))
+         return messages
+
+     def _validation_failures(
+         self, table: List[Dict[str, Any]], task_definition: TaskDefinition
+     ) -> List[str]:
+         """Evaluate rule-based outcome constraints beyond the hidden issue list."""
+
+         return task_failure_messages(task_definition, table, self._state_data)
+
+     def _issue_type_counts(
+         self, table: List[Dict[str, Any]], task_definition: TaskDefinition
+     ) -> Dict[str, int]:
+         """Count unresolved hidden issues by type."""
+
+         counts: Dict[str, int] = {}
+         for issue in task_definition["hidden_issues"]:
+             if self._is_issue_unresolved(issue, table):
+                 issue_type = issue["type"]
+                 counts[issue_type] = counts.get(issue_type, 0) + 1
+         return counts
+
+     def _is_issue_unresolved(self, issue: HiddenIssue, table: List[Dict[str, Any]]) -> bool:
+         """Determine whether a hidden issue is still unresolved."""
+
+         issue_type = issue["type"]
+         table_by_row_id = self._table_by_row_id(table)
+
+         if issue_type == "valid_trap":
+             return False
+
+         if issue_type in {"duplicate", "conflict"}:
+             rows = issue.get("rows", [])
+             return all(row_id in table_by_row_id for row_id in rows)
+
+         if issue_type == "missing_value":
+             row = table_by_row_id.get(issue.get("row"))
+             column = issue.get("column")
+             return row is not None and column is not None and self._is_missing_value(row.get(column))
+
+         if issue_type == "inconsistent_casing":
+             column = issue.get("column")
+             return any(
+                 row_id in table_by_row_id
+                 and self._needs_title_case(str(table_by_row_id[row_id].get(column, "")))
+                 for row_id in issue.get("rows", [])
+             )
+
+         if issue_type == "invalid_format":
+             row = table_by_row_id.get(issue.get("row"))
+             column = issue.get("column")
+             return row is not None and column is not None and not self._value_is_valid(
+                 column, row.get(column)
+             )
+
+         if issue_type == "constraint_violation" and issue.get("constraint") == "unique_email":
+             rows = issue.get("rows", [])
+             emails = [
+                 table_by_row_id[row_id].get("email")
+                 for row_id in rows
+                 if row_id in table_by_row_id
+             ]
+             return len(emails) != len(set(emails))
+
+         return False
+
+     def _update_hints(self, result: Mapping[str, Any], issues_after: List[str]) -> None:
+         """Add deterministic hints when the agent stalls or accumulates mistakes."""
+
+         if not issues_after:
+             return
+
+         global_wrong_deletion_count = sum(
+             count
+             for key, count in self._global_mistake_memory.items()
+             if key == "wrong_deletion" or key.endswith(":wrong_deletion")
+         )
+         if global_wrong_deletion_count >= 3:
+             hint = (
+                 "You are repeatedly deleting valid rows. Try resolving issues "
+                 "instead of deleting."
+             )
+             if hint not in self._state_data["hints"]:
+                 self._state_data["hints"].append(hint)
+
+         total_mistakes = sum(self._state_data["mistakes"].values())
+         should_hint = bool(result.get("unnecessary_action")) or bool(
+             result.get("wrong_deletion")
+         ) or total_mistakes >= 2 or float(result.get("progress_delta", 0.0)) == 0.0
+
+         if not should_hint:
+             return
+
+         next_hint = self._build_hint(issues_after[0])
+         if next_hint not in self._state_data["hints"]:
+             self._state_data["hints"].append(next_hint)
+
+     def _build_hint(self, issue_message: str) -> str:
+         """Map unresolved issue descriptions to small, actionable hints."""
+
+         lowered = issue_message.lower()
+         if "duplicate" in lowered:
+             return "Look for rows that describe the same entity and keep only one representative record."
+         if "missing" in lowered:
+             return "A required field is still empty. Fill the missing value instead of deleting the row."
+         if "email" in lowered and "format" in lowered:
+             return "Normalize only the invalid email values; valid addresses should be preserved."
+         if "phone" in lowered:
+             return "Repair only phone values that are actually malformed."
+         if "title-case" in lowered or "casing" in lowered:
+             return "Normalize text columns to a consistent title-case style."
+         if "unchanged" in lowered:
+             return "Some unusual-looking rows are valid traps and should be preserved."
+         return "Focus on the first unresolved issue and prefer minimal corrective actions."
+
+     def _record_mistake_memory(
+         self, action: Action, result: Mapping[str, Any]
+     ) -> None:
+         """Persist mistake events so hinting can look at prior failures."""
+
+         for key, count in self._state_data["mistakes"].items():
+             if count <= 0:
+                 continue
+             if action.action_id:
+                 memory_entry = f"{action.action_id}:{key}:{count}"
+             else:
+                 memory_entry = f"{action.action_type}:{key}:{count}"
+             if memory_entry not in self._state_data["mistake_memory"]:
+                 self._state_data["mistake_memory"].append(memory_entry)
+
+             self._global_mistake_memory[key] = (
+                 self._global_mistake_memory.get(key, 0) + 1
+             )
+             category_key = key.split(":")[-1]
+             self._global_mistake_memory[category_key] = (
+                 self._global_mistake_memory.get(category_key, 0) + 1
+             )
+
+         if result.get("destructive_action"):
+             entry = f"{action.action_type}:destructive_action"
+             if entry not in self._state_data["mistake_memory"]:
+                 self._state_data["mistake_memory"].append(entry)
+
+     def _resolve_missing_target_row(
+         self, row_id: Optional[int], column: Optional[str]
+     ) -> Optional[Dict[str, Any]]:
+         """Choose the requested row or the first matching missing-value row."""
+
+         if row_id is not None:
+             return self._get_row_by_id(row_id)
+
+         if column is None:
+             return None
+
+         for row in self._state_data["table"]:
+             if self._is_missing_value(row.get(column)):
+                 return row
+         return None
+
+     def _normalized_value(self, column: str, value: Any) -> Any:
+         """Return a normalized value for supported columns."""
+
+         if not isinstance(value, str):
+             return value
+
+         if column in {"name", "city"}:
+             return value.title()
+
+         if column == "email" and not self._is_valid_email(value):
+             normalized = value.strip().lower()
+             normalized = normalized.replace("[at]", "@").replace(" at ", "@")
+             if "@" not in normalized and normalized.endswith(".example.com"):
+                 normalized = normalized.replace(".example.com", "@example.com", 1)
+             if "@" in normalized and "." not in normalized.split("@", 1)[1]:
+                 normalized = normalized + ".com"
+             return normalized
+
+         if column == "phone" and not self._is_valid_phone(value):
+             digits = re.sub(r"\D", "", value)
+             if len(digits) == 11 and digits.startswith("1"):
+                 digits = digits[1:]
+             if len(digits) == 10:
+                 return f"{digits[0:3]}-{digits[3:6]}-{digits[6:10]}"
+
+         return value
+
+     def _value_is_valid(self, column: str, value: Any) -> bool:
+         """Validate known column types used by the tasks."""
+
+         if value is None:
+             return False
+         if column == "email":
+             return self._is_valid_email(str(value))
+         if column == "phone":
+             return self._is_valid_phone(str(value))
+         if column in {"name", "city"}:
+             return not self._needs_title_case(str(value))
+         return True
+
+     def _is_valid_email(self, value: str) -> bool:
+         """Return whether the supplied email string looks valid."""
+
+         return bool(EMAIL_PATTERN.match(value.strip()))
+
+     def _is_valid_phone(self, value: str) -> bool:
+         """Return whether the supplied phone value is valid for this environment."""
+
+         digits = re.sub(r"\D", "", value)
+         return len(digits) == 10 or (len(digits) == 11 and digits.startswith("1"))
+
+     def _needs_title_case(self, value: str) -> bool:
+         """Detect whether a string still needs title-case normalization."""
+
+         cleaned = value.strip()
+         return bool(cleaned) and cleaned != cleaned.title()
+
+     def _has_missing_required_values(
+         self, table: Iterable[Dict[str, Any]], required_columns: Iterable[str]
+     ) -> bool:
+         """Check whether any required field remains missing."""
+
+         for row in table:
+             for column in required_columns:
+                 if self._is_missing_value(row.get(column)):
+                     return True
+         return False
+
+     def _has_duplicates(self, table: Iterable[Dict[str, Any]], column: str) -> bool:
+         """Check whether a column contains duplicate non-empty values."""
+
+         values = [row.get(column) for row in table if row.get(column) not in (None, "")]
+         return len(values) != len(set(values))
+
+     def _column_has_invalid_email(
+         self, table: Iterable[Dict[str, Any]], column: str
+     ) -> bool:
+         """Check whether any remaining email value is invalid."""
+
+         return any(
+             row.get(column) not in (None, "") and not self._is_valid_email(str(row.get(column)))
+             for row in table
+         )
+
+     def _column_has_invalid_phone(
+         self, table: Iterable[Dict[str, Any]], column: str
+     ) -> bool:
+         """Check whether any remaining phone value is invalid."""
+
+         return any(
+             row.get(column) not in (None, "") and not self._is_valid_phone(str(row.get(column)))
+             for row in table
+         )
+
+     def _column_needs_title_case(
+         self, table: Iterable[Dict[str, Any]], column: str
+     ) -> bool:
+         """Check whether any remaining value still violates title-case normalization."""
+
+         return any(
+             isinstance(row.get(column), str) and self._needs_title_case(str(row.get(column)))
+             for row in table
+         )
+
+     def _row_has_changed_from_initial(
+         self, row_id: int, current_table: List[Dict[str, Any]]
+     ) -> bool:
+         """Check whether a protected row has changed relative to the task start."""
+
+         current_row = self._table_by_row_id(current_table).get(row_id)
+         initial_row = self._state_data["initial_table_by_row_id"].get(row_id)
+         if current_row is None or initial_row is None:
+             return True
+         return current_row != initial_row
+
+     def _row_is_protected(self, row_id: Optional[int]) -> bool:
+         """Return whether a row is marked as a valid trap in the current task."""
+
+         if row_id is None:
+             return False
+         for issue in self._state_data["task"]["hidden_issues"]:
+             if issue["type"] == "valid_trap" and issue.get("row") == row_id:
+                 return True
+         return False
+
+     def _row_belongs_to_removable_issue(self, row_id: Optional[int]) -> bool:
+         """Return whether deleting a row could plausibly resolve a structural issue."""
+
+         if row_id is None:
+             return False
+         for issue in self._state_data["task"]["hidden_issues"]:
+             if issue["type"] in {"duplicate", "conflict", "constraint_violation"} and row_id in issue.get(
+                 "rows", []
+             ):
+                 return True
+         return False
+
+     def _remove_row_by_id(self, row_id: Optional[int]) -> bool:
+         """Remove a row by id and report whether a row was deleted."""
+
+         if row_id is None:
+             return False
+         table = self._state_data["table"]
+         for index, row in enumerate(table):
+             if row.get("row_id") == row_id:
+                 del table[index]
+                 return True
+         return False
+
+     def _get_row_by_id(self, row_id: Optional[int]) -> Optional[Dict[str, Any]]:
+         """Return a mutable row reference by id."""
+
+         if row_id is None:
+             return None
+         for row in self._state_data["table"]:
+             if row.get("row_id") == row_id:
+                 return row
+         return None
+
+     def _table_by_row_id(self, table: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
+         """Index a table by row id."""
+
+         return {
+             int(row["row_id"]): deepcopy(row)
+             for row in table
+             if row.get("row_id") is not None
+         }
+
+     def _is_missing_value(self, value: Any) -> bool:
+         """Return whether a cell should be treated as missing."""
+
+         return value is None or value == ""
+
+     def _format_history(self, action: Action) -> str:
+         """Return a compact history entry for the applied action."""
+
+         details = []
+         if action.row_id is not None:
+             details.append(f"row_id={action.row_id}")
+         if action.column is not None:
+             details.append(f"column={action.column}")
+         if action.value is not None:
+             details.append(f"value={action.value}")
+         detail_text = ", ".join(details)
+         return f"{action.action_type}({detail_text})" if detail_text else action.action_type
+
+
+ class DataOpsGymEnv(DataOpsEnv):
+     """Compatibility wrapper matching the configured OpenEnv entrypoint."""
+
+     pass
+
+
+ __all__ = ["DataOpsEnv", "DataOpsGymEnv"]
grader.py ADDED
@@ -0,0 +1,592 @@
1
+ """Evaluation and grading interfaces for ``dataops-gym``.
2
+
3
+ This module is responsible for validating outputs, scoring task results, and
4
+ capturing assessment metadata independently from task execution logic.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from typing import Any, Dict, Iterable, Mapping, MutableMapping, Optional, Tuple
11
+
12
+
13
+ # Dense reward values are intentionally small and additive so the agent receives
14
+ # feedback for intermediate progress without requiring full task completion.
15
+ CORRECT_DUPLICATE_REMOVAL_REWARD = 0.3
16
+ CORRECT_NORMALIZATION_REWARD = 0.2
17
+ FIX_MISSING_VALUE_REWARD = 0.2
18
+ VALIDATION_SUCCESS_REWARD = 0.2
19
+ EFFICIENCY_BONUS = 0.2
20
+ RECOVERY_BONUS = 0.25
21
+ STEP_PENALTY = -0.02
22
+ PROGRESS_REWARD_SCALE = 0.3
23
+
24
+ # Penalties are split into:
25
+ # 1. a direct penalty for the current bad action, and
26
+ # 2. an escalating repetition penalty if the same mistake keeps happening.
27
+ WRONG_DELETION_PENALTY = -0.3
28
+ UNNECESSARY_ACTION_PENALTY = -0.1
29
+ NOOP_PENALTY = -0.05
30
+ DESTRUCTIVE_ACTION_PENALTY = -0.4
31
+
32
+ FIRST_REPEAT_PENALTY = -0.1
33
+ SECOND_REPEAT_PENALTY = -0.2
34
+ THIRD_OR_MORE_REPEAT_PENALTY = -0.4
35
+ EMAIL_PATTERN = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
36
+
37
+
38
+ def detect_repeated_mistake(mistakes: Mapping[str, int], mistake_key: str) -> int:
39
+ """Return how many times a mistake has already occurred before this step."""
40
+
41
+ return int(mistakes.get(mistake_key, 0))
42
+
43
+
44
+ def track_mistake(state: MutableMapping[str, Any], mistake_key: str) -> int:
45
+ """Update the mistake counter in state and return the new occurrence count."""
46
+
47
+ mistakes = state.setdefault("mistakes", {})
48
+ if not isinstance(mistakes, dict):
49
+ raise ValueError("state['mistakes'] must be a dictionary for mistake tracking")
50
+
51
+ current_count = int(mistakes.get(mistake_key, 0))
52
+ new_count = current_count + 1
53
+ mistakes[mistake_key] = new_count
54
+ return new_count
55
+
56
+
57
+ def repeated_mistake_penalty(occurrence_count: int) -> float:
58
+ """Return the escalating penalty for repeated mistakes."""
59
+
60
+ if occurrence_count <= 1:
61
+ return FIRST_REPEAT_PENALTY
62
+ if occurrence_count == 2:
63
+ return SECOND_REPEAT_PENALTY
64
+ return THIRD_OR_MORE_REPEAT_PENALTY
65
+
66
+
67
+ def _to_bool(mapping: Mapping[str, Any], key: str) -> bool:
68
+ """Normalize truthy result flags into deterministic boolean checks."""
69
+
70
+ return bool(mapping.get(key, False))
71
+
72
+
73
+ def _mistake_key(
74
+ action: Mapping[str, Any],
75
+ result: Mapping[str, Any],
76
+ fallback_key: str,
77
+ ) -> str:
78
+ """Build an action-specific mistake key with a safe fallback."""
79
+
80
+ action_type = action.get("action_type")
81
+ error_type = result.get("error_type", "general")
82
+
83
+ if action_type:
84
+ return f"{action_type}:{error_type}"
85
+ return fallback_key
86
+
87
+
88
+ def _clamp_reward(value: float) -> float:
89
+ """Keep rewards in the required [-1.0, 1.0] range."""
90
+
91
+ return max(-1.0, min(1.0, round(value, 4)))
92
+
93
+
94
+ def _clamp_score(value: float) -> float:
95
+ """Keep task-level scores in the required [0.0, 1.0] range."""
96
+
97
+ return max(0.0, min(1.0, round(value, 4)))
98
+
99
+
100
+ def _is_missing_value(value: Any) -> bool:
101
+ """Return whether a cell should be considered missing."""
102
+
103
+ return value is None or value == ""
104
+
105
+
106
+ def _is_valid_email(value: str) -> bool:
107
+ """Validate email formatting used by task graders."""
108
+
109
+ return bool(EMAIL_PATTERN.match(value.strip()))
110
+
111
+
112
+ def _is_valid_phone(value: str) -> bool:
113
+ """Validate phone formatting used by task graders."""
114
+
115
+ digits = re.sub(r"\D", "", value)
116
+ return len(digits) == 10 or (len(digits) == 11 and digits.startswith("1"))
117
+
118
+
119
+ def _needs_title_case(value: str) -> bool:
120
+ """Return whether text still violates title-case normalization."""
121
+
122
+ cleaned = value.strip()
123
+ return bool(cleaned) and cleaned != cleaned.title()
124
+
125
+
126
+ def _has_duplicates(table: Iterable[Dict[str, Any]], column: str) -> bool:
127
+ """Check whether a column contains duplicate non-empty values."""
128
+
129
+ values = [row.get(column) for row in table if row.get(column) not in (None, "")]
130
+ return len(values) != len(set(values))
131
+
132
+
133
+ def _table_by_row_id(table: Iterable[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
134
+ """Index a table by ``row_id`` for deterministic issue evaluation."""
135
+
136
+ return {
137
+ int(row["row_id"]): dict(row)
138
+ for row in table
139
+ if row.get("row_id") is not None
140
+ }
141
+
142
+
143
+ def _is_issue_resolved(issue: Mapping[str, Any], table_by_row_id: Dict[int, Dict[str, Any]]) -> bool:
144
+ """Return whether a structured hidden issue has been resolved."""
145
+
146
+ issue_type = issue.get("type")
147
+
148
+ if issue_type == "valid_trap":
149
+ return True
150
+
151
+ if issue_type in {"duplicate", "conflict"}:
152
+ rows = issue.get("rows", [])
153
+ return not all(row_id in table_by_row_id for row_id in rows)
154
+
155
+ if issue_type == "missing_value":
156
+ row = table_by_row_id.get(issue.get("row"))
157
+ column = issue.get("column")
158
+ return row is None or column is None or not _is_missing_value(row.get(column))
159
+
160
+ if issue_type == "inconsistent_casing":
161
+ column = issue.get("column")
162
+ rows = issue.get("rows", [])
163
+ return not any(
164
+ row_id in table_by_row_id
165
+ and isinstance(table_by_row_id[row_id].get(column), str)
166
+ and _needs_title_case(str(table_by_row_id[row_id].get(column)))
167
+ for row_id in rows
168
+ )
169
+
170
+ if issue_type == "invalid_format":
171
+ row = table_by_row_id.get(issue.get("row"))
172
+ column = issue.get("column")
173
+ if row is None or column is None:
174
+ return True
175
+ value = row.get(column)
176
+ if column == "email":
177
+ return _is_valid_email(str(value))
178
+ if column == "phone":
179
+ return _is_valid_phone(str(value))
180
+ return True
181
+
182
+ if issue_type == "constraint_violation" and issue.get("constraint") == "unique_email":
183
+ rows = issue.get("rows", [])
184
+ emails = [
185
+ table_by_row_id[row_id].get("email")
186
+ for row_id in rows
187
+ if row_id in table_by_row_id
188
+ ]
189
+ return len(emails) == len(set(emails))
190
+
191
+ return True
192
+
193
+
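For concreteness, here is a minimal stand-alone sketch of the `missing_value` branch above, with the helper restated inline so it runs on its own; the row id and cell values are invented for the example.

```python
from typing import Any, Dict, Mapping


def _is_missing_value(value: Any) -> bool:
    return value is None or value == ""


def missing_value_resolved(issue: Mapping[str, Any], table_by_row_id: Dict[int, Dict[str, Any]]) -> bool:
    # Mirrors the "missing_value" branch: resolved once the cell is filled,
    # or vacuously resolved if the row or column no longer exists.
    row = table_by_row_id.get(issue.get("row"))
    column = issue.get("column")
    return row is None or column is None or not _is_missing_value(row.get(column))


issue = {"type": "missing_value", "row": 5, "column": "city"}
print(missing_value_resolved(issue, {5: {"row_id": 5, "city": ""}}))        # False
print(missing_value_resolved(issue, {5: {"row_id": 5, "city": "Lisbon"}}))  # True
```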
194
+ def _task_check_results(
195
+ task_definition: Mapping[str, Any],
196
+ table: Iterable[Dict[str, Any]],
197
+ state: Optional[Mapping[str, Any]] = None,
198
+ ) -> list[Dict[str, Any]]:
199
+ """Build explicit pass/fail checks for final grading and validation."""
200
+
201
+ rows = [dict(row) for row in table]
202
+ table_by_row_id = _table_by_row_id(rows)
203
+ expected_outcome = dict(task_definition.get("expected_outcome", {}))
204
+ checks: list[Dict[str, Any]] = []
205
+
206
+ expected_row_count = expected_outcome.get("expected_row_count")
207
+ if expected_row_count is not None:
208
+ checks.append(
209
+ {
210
+ "name": "expected_row_count",
211
+ "passed": len(rows) == expected_row_count,
212
+ "message": f"Expected exactly {expected_row_count} rows in the cleaned table.",
213
+ }
214
+ )
215
+
216
+ expected_row_range = expected_outcome.get("expected_row_count_range")
217
+ if expected_row_range is not None:
218
+ checks.append(
219
+ {
220
+ "name": "expected_row_count_range",
221
+ "passed": expected_row_range["min"] <= len(rows) <= expected_row_range["max"],
222
+ "message": (
223
+ "Expected the cleaned table to contain between "
224
+ f"{expected_row_range['min']} and {expected_row_range['max']} rows."
225
+ ),
226
+ }
227
+ )
228
+
229
+ required_columns = expected_outcome.get(
230
+ "required_non_null_columns", task_definition.get("required_columns", [])
231
+ )
232
+ if required_columns:
233
+ checks.append(
234
+ {
235
+ "name": "required_non_null_columns",
236
+ "passed": not any(
237
+ _is_missing_value(row.get(column))
238
+ for row in rows
239
+ for column in required_columns
240
+ ),
241
+ "message": "Required columns must be populated for all remaining rows.",
242
+ }
243
+ )
244
+
245
+ for unique_column in expected_outcome.get("unique_by", []):
246
+ checks.append(
247
+ {
248
+ "name": f"unique_by:{unique_column}",
249
+ "passed": not _has_duplicates(rows, unique_column),
250
+ "message": f"Values in '{unique_column}' must remain unique.",
251
+ }
252
+ )
253
+
254
+ for column, rule in expected_outcome.get("normalized_columns", {}).items():
255
+ if rule == "title_case":
256
+ checks.append(
257
+ {
258
+ "name": f"normalized_column:{column}",
259
+ "passed": not any(
260
+ isinstance(row.get(column), str)
261
+ and _needs_title_case(str(row.get(column)))
262
+ for row in rows
263
+ ),
264
+ "message": f"Column '{column}' should use a consistent title-case style.",
265
+ }
266
+ )
267
+
268
+ for column, rule in expected_outcome.get("format_rules", {}).items():
269
+ if rule == "valid_email":
270
+ checks.append(
271
+ {
272
+ "name": f"valid_email:{column}",
273
+ "passed": not any(
274
+ row.get(column) not in (None, "")
275
+ and not _is_valid_email(str(row.get(column)))
276
+ for row in rows
277
+ ),
278
+ "message": "All remaining email values must use a valid email format.",
279
+ }
280
+ )
281
+ if rule == "normalized_phone":
282
+ checks.append(
283
+ {
284
+ "name": f"normalized_phone:{column}",
285
+ "passed": not any(
286
+ row.get(column) not in (None, "")
287
+ and not _is_valid_phone(str(row.get(column)))
288
+ for row in rows
289
+ ),
290
+ "message": "All remaining phone values must use a consistent valid format.",
291
+ }
292
+ )
293
+
294
+ initial_rows = {}
295
+ if state is not None:
296
+ initial_rows = dict(state.get("initial_table_by_row_id", {}))
297
+
298
+ for row_id in expected_outcome.get("must_preserve_valid_rows", []):
299
+ current_row = table_by_row_id.get(row_id)
300
+ checks.append(
301
+ {
302
+ "name": f"preserve_valid_row:{row_id}",
303
+ "passed": current_row is not None and current_row == initial_rows.get(row_id),
304
+ "message": f"Valid row {row_id} should remain logically unchanged.",
305
+ }
306
+ )
307
+
308
+ for row_group in expected_outcome.get("exactly_one_of_rows", []):
309
+ surviving = [row_id for row_id in row_group if row_id in table_by_row_id]
310
+ checks.append(
311
+ {
312
+ "name": f"exactly_one_of_rows:{','.join(str(row_id) for row_id in row_group)}",
313
+ "passed": len(surviving) == 1,
314
+ "message": f"Exactly one of rows {row_group} should remain in the cleaned table.",
315
+ }
316
+ )
317
+
318
+ for row_id in expected_outcome.get("rows_must_survive", []):
319
+ checks.append(
320
+ {
321
+ "name": f"rows_must_survive:{row_id}",
322
+ "passed": row_id in table_by_row_id,
323
+ "message": f"Row {row_id} must still be present in the cleaned table.",
324
+ }
325
+ )
326
+
327
+ for row_id in expected_outcome.get("rows_must_be_removed", []):
328
+ checks.append(
329
+ {
330
+ "name": f"rows_must_be_removed:{row_id}",
331
+ "passed": row_id not in table_by_row_id,
332
+ "message": f"Row {row_id} should not remain in the cleaned table.",
333
+ }
334
+ )
335
+
336
+ for issue in task_definition.get("hidden_issues", []):
337
+ if issue.get("type") == "valid_trap":
338
+ continue
339
+ message = issue.get("description") or f"Issue '{issue.get('type')}' must be resolved."
340
+ checks.append(
341
+ {
342
+ "name": f"hidden_issue:{issue.get('type')}",
343
+ "passed": _is_issue_resolved(issue, table_by_row_id),
344
+ "message": message,
345
+ }
346
+ )
347
+
348
+ return checks
349
+
350
+
351
+ def _calculate_reward(
352
+ state: MutableMapping[str, Any],
353
+ action: Mapping[str, Any],
354
+ result: MutableMapping[str, Any],
355
+ ) -> float:
356
+ """Compute the deterministic scalar reward for a single environment step."""
357
+
358
+ reward = 0.0
359
+
360
+ # Every step incurs a small cost so the agent is encouraged to solve the
361
+ # task quickly instead of exploring indefinitely.
362
+ reward += STEP_PENALTY
363
+
364
+ # Intermediate rewards encourage the agent to make progress even when the
365
+ # dataset is not fully clean yet.
366
+ if _to_bool(result, "correct_duplicate_removal"):
367
+ reward += CORRECT_DUPLICATE_REMOVAL_REWARD
368
+
369
+ if _to_bool(result, "correct_normalization"):
370
+ reward += CORRECT_NORMALIZATION_REWARD
371
+
372
+ if _to_bool(result, "fixed_missing_value") or _to_bool(
373
+ result, "fixing_missing_values"
374
+ ):
375
+ reward += FIX_MISSING_VALUE_REWARD
376
+
377
+ if _to_bool(result, "validation_success"):
378
+ reward += VALIDATION_SUCCESS_REWARD
379
+
380
+ if _to_bool(result, "corrected_previous_mistake"):
381
+ reward += RECOVERY_BONUS
382
+
383
+ if _to_bool(result, "noop"):
384
+ reward += NOOP_PENALTY
385
+
386
+ if _to_bool(result, "destructive_action"):
387
+ reward += DESTRUCTIVE_ACTION_PENALTY
388
+
389
+ # Progress-based shaping provides a smoother learning signal for partial
390
+ # improvement, even when a step does not fully resolve a visible issue.
391
+ progress_delta = float(result.get("progress_delta", 0.0))
392
+ progress_delta = max(0.0, min(1.0, progress_delta))
393
+ reward += progress_delta * PROGRESS_REWARD_SCALE
394
+
395
+ # Explicitly penalize steps that fail to improve task progress so agents do
396
+ # not learn that random but harmless actions are equivalent to useful ones.
397
+ if progress_delta == 0.0:
398
+ reward -= 0.05
399
+
400
+ # Direct penalties handle obviously harmful moves. Repetition is tracked
401
+ # separately so the same bad behavior becomes more expensive over time.
402
+ if _to_bool(result, "wrong_deletion"):
403
+ reward += WRONG_DELETION_PENALTY
404
+ mistake_key = _mistake_key(action, result, "wrong_deletion")
405
+ occurrence_count = track_mistake(state, mistake_key)
406
+ reward += repeated_mistake_penalty(occurrence_count)
407
+
408
+ if _to_bool(result, "unnecessary_action"):
409
+ reward += UNNECESSARY_ACTION_PENALTY
410
+ mistake_key = _mistake_key(action, result, "unnecessary_action")
411
+ occurrence_count = track_mistake(state, mistake_key)
412
+ reward += repeated_mistake_penalty(occurrence_count)
413
+
414
+ # Support arbitrary custom mistake keys in addition to the built-in ones.
415
+ for mistake_key in result.get("mistake_keys", []):
416
+ if mistake_key not in {"wrong_deletion", "unnecessary_action"}:
417
+ occurrence_count = track_mistake(state, str(mistake_key))
418
+ reward += repeated_mistake_penalty(occurrence_count)
419
+
420
+ # Reward early completion only when the task finishes with steps still
421
+ # available. This creates a simple deterministic efficiency incentive.
422
+ if _to_bool(result, "task_completed") and int(state.get("steps_remaining", 0)) > 0:
423
+ reward += EFFICIENCY_BONUS
424
+
425
+ return _clamp_reward(reward)
426
+
427
+
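A worked example of the shaping arithmetic above. The constant values here are illustrative only; the real `STEP_PENALTY`, `CORRECT_DUPLICATE_REMOVAL_REWARD`, and `PROGRESS_REWARD_SCALE` are defined elsewhere in this module and may differ.

```python
# Illustrative constants; the module-level values may differ.
STEP_PENALTY = -0.02
CORRECT_DUPLICATE_REMOVAL_REWARD = 0.30
PROGRESS_REWARD_SCALE = 0.20

result = {"correct_duplicate_removal": True, "progress_delta": 0.25}

reward = STEP_PENALTY                                   # fixed per-step cost
reward += CORRECT_DUPLICATE_REMOVAL_REWARD              # event-based bonus
progress = max(0.0, min(1.0, float(result["progress_delta"])))
reward += progress * PROGRESS_REWARD_SCALE              # dense shaping term
# No no-progress penalty applies because progress_delta > 0.
print(max(-1.0, min(1.0, round(reward, 4))))  # 0.33
```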
428
+ def grade_step(
429
+ state: MutableMapping[str, Any],
430
+ action: Mapping[str, Any],
431
+ result: MutableMapping[str, Any],
432
+ ) -> float:
433
+ """Compute a deterministic dense reward for a single environment step."""
434
+
435
+ return _calculate_reward(state, action, result)
436
+
437
+
438
+ def grade_step_details(
439
+ state: MutableMapping[str, Any],
440
+ action: Mapping[str, Any],
441
+ result: MutableMapping[str, Any],
442
+ ) -> Tuple[float, Dict[str, Any]]:
443
+ """Compute reward plus a structured component breakdown for debugging."""
444
+
445
+ previous_mistakes = {
446
+ key: int(value)
447
+ for key, value in state.get("mistakes", {}).items()
448
+ }
449
+ reward = grade_step(state, action, result)
450
+
451
+ wrong_deletion_repeat_penalty = 0.0
452
+ if result.get("wrong_deletion"):
453
+ mistake_key = _mistake_key(action, result, "wrong_deletion")
454
+ occurrence_count = int(state.get("mistakes", {}).get(mistake_key, 0))
455
+ if occurrence_count > int(previous_mistakes.get(mistake_key, 0)):
456
+ wrong_deletion_repeat_penalty = repeated_mistake_penalty(occurrence_count)
457
+
458
+ unnecessary_repeat_penalty = 0.0
459
+ if result.get("unnecessary_action"):
460
+ mistake_key = _mistake_key(action, result, "unnecessary_action")
461
+ occurrence_count = int(state.get("mistakes", {}).get(mistake_key, 0))
462
+ if occurrence_count > int(previous_mistakes.get(mistake_key, 0)):
463
+ unnecessary_repeat_penalty = repeated_mistake_penalty(occurrence_count)
464
+
465
+ components: Dict[str, Any] = {
466
+ "step_penalty": STEP_PENALTY,
467
+ "duplicate_reward": (
468
+ CORRECT_DUPLICATE_REMOVAL_REWARD
469
+ if result.get("correct_duplicate_removal")
470
+ else 0.0
471
+ ),
472
+ "normalization_reward": (
473
+ CORRECT_NORMALIZATION_REWARD
474
+ if result.get("correct_normalization")
475
+ else 0.0
476
+ ),
477
+ "missing_value_reward": (
478
+ FIX_MISSING_VALUE_REWARD if (result.get("fixed_missing_value") or result.get("fixing_missing_values")) else 0.0
479
+ ),
480
+ "validation_reward": (
481
+ VALIDATION_SUCCESS_REWARD if result.get("validation_success") else 0.0
482
+ ),
483
+ "penalties": {
484
+ "wrong_deletion": (
485
+ WRONG_DELETION_PENALTY if result.get("wrong_deletion") else 0.0
486
+ ),
487
+ "unnecessary_action": (
488
+ UNNECESSARY_ACTION_PENALTY if result.get("unnecessary_action") else 0.0
489
+ ),
490
+ "wrong_deletion_repeat": wrong_deletion_repeat_penalty,
491
+ "unnecessary_action_repeat": unnecessary_repeat_penalty,
492
+ "noop": NOOP_PENALTY if result.get("noop") else 0.0,
493
+ "destructive_action": (
494
+ DESTRUCTIVE_ACTION_PENALTY
495
+ if result.get("destructive_action")
496
+ else 0.0
497
+ ),
498
+ },
499
+ "progress_reward": round(
500
+ max(0.0, min(1.0, float(result.get("progress_delta", 0.0))))
501
+ * PROGRESS_REWARD_SCALE,
502
+ 4,
503
+ ),
504
+ "recovery_bonus": (
505
+ RECOVERY_BONUS if result.get("corrected_previous_mistake") else 0.0
506
+ ),
507
+ "efficiency_bonus": (
508
+ EFFICIENCY_BONUS
509
+ if result.get("task_completed") and int(state.get("steps_remaining", 0)) > 0
510
+ else 0.0
511
+ ),
512
+ }
513
+
514
+ if float(result.get("progress_delta", 0.0)) == 0.0:
515
+ components["no_progress_penalty"] = -0.05
516
+
517
+ result["reward_components"] = components
518
+ result["reward_total"] = reward
519
+ return reward, components
520
+
521
+
522
+ def grade_task_result(
523
+ task_definition: Mapping[str, Any],
524
+ table: Iterable[Dict[str, Any]],
525
+ state: Optional[Mapping[str, Any]] = None,
526
+ ) -> float:
527
+ """Compute a deterministic final task score between 0.0 and 1.0."""
528
+
529
+ checks = _task_check_results(task_definition, table, state)
530
+ if not checks:
531
+ return 0.0
532
+ return _clamp_score(
533
+ sum(1.0 for check in checks if check["passed"]) / len(checks)
534
+ )
535
+
536
+
537
+ def task_failure_messages(
538
+ task_definition: Mapping[str, Any],
539
+ table: Iterable[Dict[str, Any]],
540
+ state: Optional[Mapping[str, Any]] = None,
541
+ ) -> list[str]:
542
+ """Return explicit failure messages for unresolved outcome checks."""
543
+
544
+ return [
545
+ str(check["message"])
546
+ for check in _task_check_results(task_definition, table, state)
547
+ if not bool(check["passed"])
548
+ ]
549
+
550
+
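An end-to-end usage sketch of the graders above, assuming this module is importable as `grader` per the repository layout; the toy task definition and table are invented for the example.

```python
from grader import grade_task_result, task_failure_messages

task_definition = {
    "expected_outcome": {"expected_row_count": 2, "unique_by": ["email"]},
    "hidden_issues": [],
}
table = [
    {"row_id": 1, "email": "a@example.com"},
    {"row_id": 2, "email": "b@example.com"},
]
# Two checks (row count, email uniqueness), both passing.
print(grade_task_result(task_definition, table))      # 1.0
print(task_failure_messages(task_definition, table))  # []
```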
551
+ def grade_easy_cleaning_task(
552
+ task_definition: Mapping[str, Any],
553
+ table: Iterable[Dict[str, Any]],
554
+ state: Optional[Mapping[str, Any]] = None,
555
+ ) -> float:
556
+ """Grade the easy cleaning task on a 0.0–1.0 scale."""
557
+
558
+ return grade_task_result(task_definition, table, state)
559
+
560
+
561
+ def grade_medium_normalization_task(
562
+ task_definition: Mapping[str, Any],
563
+ table: Iterable[Dict[str, Any]],
564
+ state: Optional[Mapping[str, Any]] = None,
565
+ ) -> float:
566
+ """Grade the medium normalization task on a 0.0–1.0 scale."""
567
+
568
+ return grade_task_result(task_definition, table, state)
569
+
570
+
571
+ def grade_hard_conflict_resolution_task(
572
+ task_definition: Mapping[str, Any],
573
+ table: Iterable[Dict[str, Any]],
574
+ state: Optional[Mapping[str, Any]] = None,
575
+ ) -> float:
576
+ """Grade the hard conflict-resolution task on a 0.0–1.0 scale."""
577
+
578
+ return grade_task_result(task_definition, table, state)
579
+
580
+
581
+ __all__ = [
582
+ "detect_repeated_mistake",
583
+ "grade_step",
584
+ "grade_step_details",
585
+ "grade_task_result",
586
+ "task_failure_messages",
587
+ "grade_easy_cleaning_task",
588
+ "grade_medium_normalization_task",
589
+ "grade_hard_conflict_resolution_task",
590
+ "repeated_mistake_penalty",
591
+ "track_mistake",
592
+ ]
inference.py ADDED
@@ -0,0 +1,989 @@
1
+ """Inference entrypoints for ``dataops-gym``.
2
+
3
+ This runner keeps the hackathon-required OpenAI-compatible model interface, but
4
+ adds a stronger local planner so baseline behavior is still competitive and
5
+ reproducible when the model is weak, unavailable, or partially aligned.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+ from collections import Counter, defaultdict
12
+ import hashlib
13
+ import json
14
+ import os
15
+ import re
16
+ import textwrap
17
+ from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple
18
+
19
+ from openai import OpenAI
20
+
21
+ from env import DataOpsEnv
22
+
23
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
24
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen3-VL-30B-A3B-Instruct:novita")
25
+ HF_TOKEN = os.getenv("HF_TOKEN")
26
+ BENCHMARK = os.getenv("OPENENV_BENCHMARK", "dataops-env")
27
+ MAX_STEPS = 10
28
+ TEMPERATURE = 0.0
29
+ MAX_TOKENS = 160
30
+ MODEL_RETRIES = 2
31
+ FALLBACK_ACTION = "noop()"
32
+ ACTION_PREFIX_RE = re.compile(r"^(action|next action)\s*[:\-]\s*", re.IGNORECASE)
33
+ EMAIL_PATTERN = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
34
+ TASK_ORDER = ["easy", "medium", "hard"]
35
+ IDENTIFIER_COLUMNS = ("customer_id", "vendor_id", "partner_id")
36
+ POLICY_CACHE_PATH = os.getenv("POLICY_CACHE_PATH", ".dataops_policy_cache.json")
37
+ POLICY_CACHE_VERSION = 1
38
+
39
+ SYSTEM_PROMPT = textwrap.dedent(
40
+ """
41
+ You control a data-cleaning environment.
42
+ Reply with exactly one action string and nothing else.
43
+
44
+ Only choose from the candidate actions provided by the user prompt.
45
+ Favor actions that remove visible issues quickly and avoid actions that were
46
+ already blocked because they caused errors or no progress.
47
+ Use single quotes for string arguments.
48
+ """
49
+ ).strip()
50
+
51
+
52
+ class PolicyMemory:
53
+ """Persistent lightweight experience cache used across episodes and runs."""
54
+
55
+ def __init__(self, path: str) -> None:
56
+ self.path = path
57
+ self.data: Dict[str, Any] = {
58
+ "version": POLICY_CACHE_VERSION,
59
+ "states": {},
60
+ "patterns": {},
61
+ }
62
+ self._load()
63
+
64
+ def _load(self) -> None:
65
+ """Load cache from disk if it exists and is compatible."""
66
+
67
+ if not os.path.exists(self.path):
68
+ return
69
+ try:
70
+ with open(self.path, "r", encoding="utf-8") as handle:
71
+ payload = json.load(handle)
72
+ except (OSError, json.JSONDecodeError):
73
+ return
74
+ if not isinstance(payload, dict):
75
+ return
76
+ if int(payload.get("version", 0)) != POLICY_CACHE_VERSION:
77
+ return
78
+ self.data = payload
79
+
80
+ def save(self) -> None:
81
+ """Persist the current cache contents to disk."""
82
+
83
+ temp_path = f"{self.path}.tmp"
84
+ with open(temp_path, "w", encoding="utf-8") as handle:
85
+ json.dump(self.data, handle, indent=2, sort_keys=True)
86
+ os.replace(temp_path, self.path)
87
+
88
+ def _bucket(self, bucket_name: str, key: str) -> Dict[str, Any]:
89
+ """Return the mutable bucket for an exact state or a problem pattern."""
90
+
91
+ return self.data.setdefault(bucket_name, {}).setdefault(key, {"actions": {}})
92
+
93
+ def _action_stats(self, bucket_name: str, key: str, action_text: str) -> Dict[str, Any]:
94
+ """Return mutable stats for an action within a memory bucket."""
95
+
96
+ actions = self._bucket(bucket_name, key).setdefault("actions", {})
97
+ return actions.setdefault(
98
+ action_text,
99
+ {
100
+ "attempts": 0,
101
+ "successes": 0,
102
+ "progresses": 0,
103
+ "failures": 0,
104
+ "cumulative_reward": 0.0,
105
+ "last_error": None,
106
+ },
107
+ )
108
+
109
+ def update(
110
+ self,
111
+ *,
112
+ state_key: str,
113
+ pattern_key: str,
114
+ action_text: str,
115
+ reward: float,
116
+ progress_delta: float,
117
+ error: Optional[str],
118
+ done: bool,
119
+ task_score: float,
120
+ ) -> None:
121
+ """Record one step outcome for exact-state and problem-pattern memory."""
122
+
123
+ was_success = bool(done and task_score >= 0.95 and error is None)
124
+ made_progress = bool(progress_delta > 0.0 or reward > 0.0)
125
+ was_failure = bool(error is not None or (progress_delta == 0.0 and reward <= 0.0))
126
+
127
+ for bucket_name, key in (("states", state_key), ("patterns", pattern_key)):
128
+ stats = self._action_stats(bucket_name, key, action_text)
129
+ stats["attempts"] += 1
130
+ stats["cumulative_reward"] = round(
131
+ float(stats["cumulative_reward"]) + float(reward),
132
+ 4,
133
+ )
134
+ stats["last_error"] = error
135
+ if was_success:
136
+ stats["successes"] += 1
137
+ elif made_progress:
138
+ stats["progresses"] += 1
139
+ if was_failure:
140
+ stats["failures"] += 1
141
+
142
+ def _combined_stats(self, state_key: str, pattern_key: str, action_text: str) -> Dict[str, float]:
143
+ """Merge exact-state and pattern-level stats into one weighted view."""
144
+
145
+ combined = {
146
+ "attempts": 0.0,
147
+ "successes": 0.0,
148
+ "progresses": 0.0,
149
+ "failures": 0.0,
150
+ "cumulative_reward": 0.0,
151
+ }
152
+ for bucket_name, key, weight in (
153
+ ("states", state_key, 1.0),
154
+ ("patterns", pattern_key, 0.5),
155
+ ):
156
+ stats = self.data.get(bucket_name, {}).get(key, {}).get("actions", {}).get(action_text)
157
+ if not isinstance(stats, dict):
158
+ continue
159
+ for field in combined:
160
+ combined[field] += float(stats.get(field, 0.0)) * weight
161
+ return combined
162
+
163
+ def score_action(self, state_key: str, pattern_key: str, action_text: str) -> float:
164
+ """Score a candidate action using remembered prior outcomes."""
165
+
166
+ stats = self._combined_stats(state_key, pattern_key, action_text)
167
+ attempts = max(1.0, stats["attempts"])
168
+ average_reward = stats["cumulative_reward"] / attempts
169
+ return round(
170
+ (stats["successes"] * 3.0)
171
+ + (stats["progresses"] * 1.25)
172
+ + average_reward
173
+ - (stats["failures"] * 2.0),
174
+ 4,
175
+ )
176
+
177
+ def blocked_actions(self, state_key: str, pattern_key: str) -> set[str]:
178
+ """Return actions that repeatedly failed for the same state or pattern."""
179
+
180
+ blocked: set[str] = set()
181
+ for bucket_name, key in (("states", state_key), ("patterns", pattern_key)):
182
+ actions = self.data.get(bucket_name, {}).get(key, {}).get("actions", {})
183
+ for action_text, stats in actions.items():
184
+ attempts = int(stats.get("attempts", 0))
185
+ failures = int(stats.get("failures", 0))
186
+ successes = int(stats.get("successes", 0))
187
+ progresses = int(stats.get("progresses", 0))
188
+ if attempts >= 2 and failures >= attempts and successes == 0 and progresses == 0:
189
+ blocked.add(action_text)
190
+ return blocked
191
+
192
+
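A usage sketch for `PolicyMemory`, assuming this file is importable as `inference` and that writing a throwaway cache under `/tmp` is acceptable; the keys, action text, and outcomes are invented for the example.

```python
from inference import PolicyMemory

memory = PolicyMemory("/tmp/policy_cache_demo.json")
for _ in range(2):
    memory.update(
        state_key="state-demo",
        pattern_key="pattern-demo",
        action_text="delete_row(row_id=4)",
        reward=-0.1,
        progress_delta=0.0,
        error="row_not_found",
        done=False,
        task_score=0.0,
    )

# Two attempts, two failures, no successes or progress: the action is now
# blocked for this state/pattern, and its memory score is strongly negative.
print(memory.blocked_actions("state-demo", "pattern-demo"))
print(memory.score_action("state-demo", "pattern-demo", "delete_row(row_id=4)"))
memory.save()
```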
193
+ def log_start(task: str, env: str, model: str) -> None:
194
+ """Emit the required episode start line."""
195
+
196
+ print(f"[START] task={task} env={env} model={model}", flush=True)
197
+
198
+
199
+ def log_step(
200
+ step: int, action: str, reward: float, done: bool, error: Optional[str]
201
+ ) -> None:
202
+ """Emit the required per-step line."""
203
+
204
+ error_value = error if error else "null"
205
+ print(
206
+ f"[STEP] step={step} action={action} reward={reward:.2f} "
207
+ f"done={str(done).lower()} error={error_value}",
208
+ flush=True,
209
+ )
210
+
211
+
212
+ def log_end(success: bool, steps: int, rewards: List[float]) -> None:
213
+ """Emit the required episode end line."""
214
+
215
+ rewards_text = ",".join(f"{reward:.2f}" for reward in rewards)
216
+ print(
217
+ f"[END] success={str(success).lower()} steps={steps} rewards={rewards_text}",
218
+ flush=True,
219
+ )
220
+
221
+
222
+ def build_history_lines(history: Sequence[str]) -> str:
223
+ """Render the last few steps for the model prompt."""
224
+
225
+ if not history:
226
+ return "None"
227
+ return "\n".join(history[-5:])
228
+
229
+
230
+ def _stable_json(value: Any) -> str:
231
+ """Serialize a value deterministically for memory key generation."""
232
+
233
+ return json.dumps(value, sort_keys=True, separators=(",", ":"))
234
+
235
+
236
+ def _hash_key(payload: Mapping[str, Any]) -> str:
237
+ """Build a compact deterministic memory key."""
238
+
239
+ return hashlib.sha1(_stable_json(payload).encode("utf-8")).hexdigest()
240
+
241
+
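The key scheme above is simply stable JSON serialization followed by SHA-1, so equal payloads always produce the same key; a self-contained demonstration:

```python
import hashlib
import json


def stable_key(payload: dict) -> str:
    # Sorted keys and fixed separators make the serialization deterministic.
    text = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


a = stable_key({"task": "easy", "rows": 3})
b = stable_key({"rows": 3, "task": "easy"})  # same payload, different order
print(a == b)  # True: key generation ignores insertion order
```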
242
+ def _normalize_issue_text(issue: str) -> str:
243
+ """Remove row-specific numbers so pattern memory generalizes better."""
244
+
245
+ lowered = issue.lower().strip()
246
+ return re.sub(r"\d+", "#", lowered)
247
+
248
+
249
+ def _table_summary(table: Sequence[Mapping[str, Any]]) -> Dict[str, Any]:
250
+ """Extract a compact problem summary from the visible table state."""
251
+
252
+ present_columns = sorted({key for row in table for key in row.keys() if key != "row_id"})
253
+ missing_counts: Dict[str, int] = {}
254
+ for column in present_columns:
255
+ count = sum(1 for row in table if _is_missing(row.get(column)))
256
+ if count > 0:
257
+ missing_counts[column] = count
258
+
259
+ duplicate_counts: Dict[str, int] = {}
260
+ for column in list(IDENTIFIER_COLUMNS) + ["email"]:
261
+ values = [row.get(column) for row in table if row.get(column) not in (None, "")]
262
+ if values and len(values) != len(set(values)):
263
+ duplicate_counts[column] = len(values) - len(set(values))
264
+
265
+ return {
266
+ "row_count": len(table),
267
+ "present_columns": present_columns,
268
+ "missing_counts": missing_counts,
269
+ "duplicate_counts": duplicate_counts,
270
+ "invalid_email_count": sum(
271
+ 1
272
+ for row in table
273
+ if row.get("email") not in (None, "") and not _is_valid_email(row.get("email"))
274
+ ),
275
+ "invalid_phone_count": sum(
276
+ 1
277
+ for row in table
278
+ if row.get("phone") not in (None, "") and not _is_valid_phone(row.get("phone"))
279
+ ),
280
+ "title_case_columns": sorted(
281
+ column
282
+ for column in ("name", "city")
283
+ if any(_needs_title_case(row.get(column)) for row in table)
284
+ ),
285
+ }
286
+
287
+
288
+ def build_memory_keys(
289
+ task_name: str,
290
+ task_variant: str,
291
+ goal: str,
292
+ observation: Mapping[str, Any],
293
+ ) -> Tuple[str, str]:
294
+ """Build exact-state and generalized problem-pattern keys."""
295
+
296
+ table = list(observation.get("table", []))
297
+ normalized_issues = sorted(_normalize_issue_text(str(issue)) for issue in observation.get("issues", []))
298
+ state_key = _hash_key(
299
+ {
300
+ "task_name": task_name,
301
+ "task_variant": task_variant,
302
+ "goal": goal,
303
+ "table": [
304
+ {key: row.get(key) for key in sorted(row.keys())}
305
+ for row in sorted(table, key=lambda row: int(row.get("row_id", 0)))
306
+ ],
307
+ "issues": normalized_issues,
308
+ }
309
+ )
310
+ pattern_key = _hash_key(
311
+ {
312
+ "task_name": task_name,
313
+ "goal": goal,
314
+ "summary": _table_summary(table),
315
+ "issues": normalized_issues,
316
+ }
317
+ )
318
+ return state_key, pattern_key
319
+
320
+
321
+ def _is_missing(value: Any) -> bool:
322
+ """Return whether a value is missing."""
323
+
324
+ return value is None or value == ""
325
+
326
+
327
+ def _needs_title_case(value: Any) -> bool:
328
+ """Return whether a string still needs title-case normalization."""
329
+
330
+ if not isinstance(value, str):
331
+ return False
332
+ cleaned = value.strip()
333
+ return bool(cleaned) and cleaned != cleaned.title()
334
+
335
+
336
+ def _is_valid_email(value: Any) -> bool:
337
+ """Return whether an email-like string is valid."""
338
+
339
+ return isinstance(value, str) and bool(EMAIL_PATTERN.match(value.strip()))
340
+
341
+
342
+ def _is_valid_phone(value: Any) -> bool:
343
+ """Return whether a phone-like string is valid."""
344
+
345
+ if not isinstance(value, str):
346
+ return False
347
+ digits = re.sub(r"\D", "", value)
348
+ return len(digits) == 10 or (len(digits) == 11 and digits.startswith("1"))
349
+
350
+
351
+ def _slugify_text(value: str) -> str:
352
+ """Convert free text into a stable email-local-part fragment."""
353
+
354
+ lowered = re.sub(r"[^a-z0-9]+", ".", value.lower()).strip(".")
355
+ return lowered or "record"
356
+
357
+
358
+ def _infer_email(row: Mapping[str, Any]) -> str:
359
+ """Infer a safe placeholder email from row context."""
360
+
361
+ if isinstance(row.get("name"), str) and row["name"].strip():
362
+ return f"{_slugify_text(row['name'])}@example.com"
363
+ for key in IDENTIFIER_COLUMNS:
364
+ if row.get(key):
365
+ return f"{str(row[key]).lower()}@example.com"
366
+ return f"row{row.get('row_id', 'unknown')}@example.com"
367
+
368
+
369
+ def _infer_name(row: Mapping[str, Any]) -> str:
370
+ """Infer a readable name when a name field is missing."""
371
+
372
+ email = row.get("email")
373
+ if isinstance(email, str) and "@" in email:
374
+ return email.split("@", 1)[0].replace(".", " ").title()
375
+ for key in IDENTIFIER_COLUMNS:
376
+ if row.get(key):
377
+ return str(row[key]).replace("_", " ").title()
378
+ return "Unknown Record"
379
+
380
+
381
+ def _infer_city(table: Sequence[Mapping[str, Any]]) -> str:
382
+ """Infer a plausible city using the mode of visible values."""
383
+
384
+ candidates = [
385
+ str(row.get("city")).title()
386
+ for row in table
387
+ if isinstance(row.get("city"), str) and row.get("city")
388
+ ]
389
+ if not candidates:
390
+ return "Seattle"
391
+ return Counter(candidates).most_common(1)[0][0]
392
+
393
+
394
+ def _infer_fill_value(
395
+ row: Mapping[str, Any],
396
+ column: str,
397
+ table: Sequence[Mapping[str, Any]],
398
+ ) -> str:
399
+ """Infer a deterministic fill value from local table context."""
400
+
401
+ for key in IDENTIFIER_COLUMNS:
402
+ identifier = row.get(key)
403
+ if not identifier:
404
+ continue
405
+ for candidate in table:
406
+ if candidate.get("row_id") == row.get("row_id"):
407
+ continue
408
+ if candidate.get(key) == identifier and not _is_missing(candidate.get(column)):
409
+ return str(candidate[column])
410
+
411
+ if column == "email":
412
+ return _infer_email(row)
413
+ if column == "city":
414
+ return _infer_city(table)
415
+ if column == "phone":
416
+ return "555-555-0100"
417
+ if column == "status":
418
+ return "active"
419
+ if column == "name":
420
+ return _infer_name(row)
421
+ return "resolved"
422
+
423
+
424
+ def _row_signature(row: Mapping[str, Any]) -> Tuple[Tuple[str, Any], ...]:
425
+ """Create a comparable row signature excluding runtime row identifiers."""
426
+
427
+ return tuple(sorted((key, value) for key, value in row.items() if key != "row_id"))
428
+
429
+
430
+ def _build_action_string(payload: Mapping[str, Any]) -> str:
431
+ """Reconstruct a normalized action string for logging and filtering."""
432
+
433
+ action_type = str(payload["action_type"])
434
+ args: List[str] = []
435
+ for key in ("row_id", "column", "value"):
436
+ if key not in payload or payload[key] is None:
437
+ continue
438
+ value = payload[key]
439
+ if isinstance(value, str):
440
+ args.append(f"{key}='{value}'")
441
+ else:
442
+ args.append(f"{key}={value}")
443
+ return f"{action_type}({', '.join(args)})" if args else f"{action_type}()"
444
+
445
+
446
+ def build_action_string(payload: Dict[str, Any]) -> str:
447
+ """Backward-compatible public wrapper around action string generation."""
448
+
449
+ return _build_action_string(payload)
450
+
451
+
452
+ def parse_model_action(response_text: str) -> str:
453
+ """Extract a single action string from model output."""
454
+
455
+ if not response_text:
456
+ return FALLBACK_ACTION
457
+
458
+ for raw_line in response_text.splitlines():
459
+ line = ACTION_PREFIX_RE.sub("", raw_line.strip())
460
+ if "(" in line and line.endswith(")"):
461
+ return re.sub(r"\s+", " ", line)
462
+
463
+ compact = ACTION_PREFIX_RE.sub("", response_text.strip())
464
+ match = re.search(r"[a-zA-Z_]+\s*\(.*\)", compact)
465
+ if match:
466
+ return re.sub(r"\s+", " ", match.group(0))
467
+
468
+ return FALLBACK_ACTION
469
+
470
+
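A behavioral sketch of the extraction rules above: a leading `Action:` label is stripped, and the first call-shaped line wins. The model reply is invented for the example.

```python
import re

ACTION_PREFIX_RE = re.compile(r"^(action|next action)\s*[:\-]\s*", re.IGNORECASE)

reply = "Action: delete_row(row_id=9)\nBecause the row conflicts with row 4."
for raw_line in reply.splitlines():
    line = ACTION_PREFIX_RE.sub("", raw_line.strip())
    if "(" in line and line.endswith(")"):
        print(re.sub(r"\s+", " ", line))  # delete_row(row_id=9)
        break
```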
471
+ def action_string_to_payload(action_str: str, step_number: int) -> Tuple[str, Dict[str, Any]]:
472
+ """Convert a model action string into an environment action payload."""
473
+
474
+ try:
475
+ expression = ast.parse(action_str, mode="eval").body
476
+ except SyntaxError:
477
+ return FALLBACK_ACTION, {"action_id": f"step-{step_number:03d}", "action_type": "noop"}
478
+
479
+ if not isinstance(expression, ast.Call) or not isinstance(expression.func, ast.Name):
480
+ return FALLBACK_ACTION, {"action_id": f"step-{step_number:03d}", "action_type": "noop"}
481
+
482
+ allowed_actions = {
483
+ "remove_duplicate",
484
+ "fill_missing",
485
+ "normalize_column",
486
+ "delete_row",
487
+ "validate",
488
+ "noop",
489
+ }
490
+ action_type = expression.func.id
491
+ if action_type not in allowed_actions:
492
+ return FALLBACK_ACTION, {"action_id": f"step-{step_number:03d}", "action_type": "noop"}
493
+
494
+ payload: Dict[str, Any] = {
495
+ "action_id": f"step-{step_number:03d}",
496
+ "action_type": action_type,
497
+ }
498
+ try:
499
+ for keyword in expression.keywords:
500
+ if keyword.arg is None:
501
+ continue
502
+ payload[keyword.arg] = ast.literal_eval(keyword.value)
503
+ except (SyntaxError, ValueError, TypeError):
504
+ return FALLBACK_ACTION, {"action_id": f"step-{step_number:03d}", "action_type": "noop"}
505
+
506
+ return _build_action_string(payload), payload
507
+
508
+
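The converter above relies on `ast.parse` in `eval` mode plus `ast.literal_eval`, so no model-supplied code is ever executed; a minimal stand-alone version of that idea:

```python
import ast

text = "fill_missing(row_id=2, column='status', value='active')"
expression = ast.parse(text, mode="eval").body
assert isinstance(expression, ast.Call) and isinstance(expression.func, ast.Name)

payload = {"action_type": expression.func.id}
for keyword in expression.keywords:
    if keyword.arg is not None:
        # literal_eval only accepts constants, so arbitrary expressions fail.
        payload[keyword.arg] = ast.literal_eval(keyword.value)
print(payload)
# {'action_type': 'fill_missing', 'row_id': 2, 'column': 'status', 'value': 'active'}
```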
509
+ def create_client() -> Optional[OpenAI]:
510
+ """Create an OpenAI-compatible client when credentials look real."""
511
+
512
+ if HF_TOKEN in {None, "", "local-test", "test", "dummy"}:
513
+ return None
514
+ return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
515
+
516
+
517
+ def _extract_response_text(content: Any) -> str:
518
+ """Normalize OpenAI response content into plain text."""
519
+
520
+ if isinstance(content, str):
521
+ return content
522
+ if isinstance(content, list):
523
+ return "".join(
524
+ str(part.get("text", ""))
525
+ for part in content
526
+ if isinstance(part, dict)
527
+ )
528
+ return str(content or "")
529
+
530
+
531
+ def _table_preview(table: Sequence[Mapping[str, Any]], limit: int = 6) -> str:
532
+ """Render a compact table preview for prompting."""
533
+
534
+ preview_lines: List[str] = []
535
+ for row in table[:limit]:
536
+ summary = ", ".join(
537
+ f"{key}={value}"
538
+ for key, value in row.items()
539
+ if key in {"row_id", "name", "city", "email", "phone", "status", "customer_id", "vendor_id", "partner_id"}
540
+ )
541
+ preview_lines.append(f"- {summary}")
542
+ return "\n".join(preview_lines) if preview_lines else "- None"
543
+
544
+
545
+ def build_user_prompt(
546
+ step: int,
547
+ goal: str,
548
+ observation: Dict[str, Any],
549
+ history: Sequence[str],
550
+ last_error: Optional[str],
551
+ candidate_actions: Sequence[str],
552
+ blocked_actions: Sequence[str],
553
+ ) -> str:
554
+ """Construct a compact prompt that constrains the model to useful actions."""
555
+
556
+ issues = observation.get("issues", [])
557
+ hints = observation.get("hints", [])
558
+ issues_text = "\n".join(f"- {issue}" for issue in issues[:6]) if issues else "- None"
559
+ hints_text = "\n".join(f"- {hint}" for hint in hints[:3]) if hints else "- None"
560
+ candidates_text = "\n".join(f"- {action}" for action in candidate_actions)
561
+ blocked_text = "\n".join(f"- {action}" for action in blocked_actions[:5]) if blocked_actions else "- None"
562
+
563
+ return textwrap.dedent(
564
+ f"""
565
+ Step: {step}
566
+ Goal: {goal}
567
+ Steps remaining: {observation.get("steps_remaining")}
568
+ Progress: {observation.get("progress")}
569
+ Current issues:
570
+ {issues_text}
571
+ Current hints:
572
+ {hints_text}
573
+ Table preview:
574
+ {_table_preview(observation.get("table", []))}
575
+ Recent history:
576
+ {build_history_lines(history)}
577
+ Last action error: {last_error or "null"}
578
+ Blocked actions:
579
+ {blocked_text}
580
+
581
+ Choose exactly one action from this candidate list:
582
+ {candidates_text}
583
+ """
584
+ ).strip()
585
+
586
+
587
+ def _prefer_action(
588
+ candidates: Sequence[Dict[str, Any]],
589
+ blocked_actions: set[str],
590
+ ) -> Dict[str, Any]:
591
+ """Return the first candidate action that is not blocked."""
592
+
593
+ for candidate in candidates:
594
+ action_text = _build_action_string(candidate)
595
+ if action_text not in blocked_actions:
596
+ return dict(candidate)
597
+ return {"action_type": "validate"}
598
+
599
+
600
+ def _exact_duplicate_candidates(table: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
601
+ """Generate explicit remove-duplicate actions for exact duplicate rows."""
602
+
603
+ groups: Dict[Tuple[Tuple[str, Any], ...], List[int]] = defaultdict(list)
604
+ for row in table:
605
+ row_id = row.get("row_id")
606
+ if row_id is None:
607
+ continue
608
+ groups[_row_signature(row)].append(int(row_id))
609
+
610
+ actions: List[Dict[str, Any]] = []
611
+ for row_ids in groups.values():
612
+ if len(row_ids) > 1:
613
+ actions.append({"action_type": "remove_duplicate", "row_id": max(row_ids)})
614
+ return actions
615
+
616
+
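A stand-alone sketch of the signature-grouping idea above: rows that are identical except for `row_id` fall into one group, and the highest `row_id` of each multi-row group becomes the removal target. The table values are invented.

```python
from collections import defaultdict


def signature(row: dict) -> tuple:
    # Compare rows by content, ignoring the runtime row identifier.
    return tuple(sorted((key, value) for key, value in row.items() if key != "row_id"))


table = [
    {"row_id": 1, "name": "Ada", "city": "Lisbon"},
    {"row_id": 2, "name": "Ada", "city": "Lisbon"},  # exact copy of row 1
    {"row_id": 3, "name": "Grace", "city": "Porto"},
]
groups = defaultdict(list)
for row in table:
    groups[signature(row)].append(row["row_id"])

print([max(ids) for ids in groups.values() if len(ids) > 1])  # [2]
```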
617
+ def _group_by_identifier(table: Sequence[Mapping[str, Any]]) -> Dict[Tuple[str, str], List[Dict[str, Any]]]:
618
+ """Group rows by likely business identifiers."""
619
+
620
+ groups: Dict[Tuple[str, str], List[Dict[str, Any]]] = defaultdict(list)
621
+ for row in table:
622
+ for key in IDENTIFIER_COLUMNS:
623
+ value = row.get(key)
624
+ if value not in (None, ""):
625
+ groups[(key, str(value))].append(dict(row))
626
+ return groups
627
+
628
+
629
+ def _row_quality_score(row: Mapping[str, Any]) -> int:
630
+ """Score a row so lower-quality conflict rows can be removed first."""
631
+
632
+ score = 0
633
+ if _is_valid_email(row.get("email")):
634
+ score += 3
635
+ if _is_valid_phone(row.get("phone")) or row.get("phone") in (None, ""):
636
+ score += 2
637
+ if isinstance(row.get("status"), str) and row.get("status") == "active":
638
+ score += 1
639
+ if isinstance(row.get("name"), str) and row.get("name").strip():
640
+ score += 1
641
+ return score
642
+
643
+
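A toy illustration of the quality ordering: the lower-scoring conflict row is the one nominated for deletion. The validators here are simplified stand-ins for the email and phone checks defined earlier, and the two rows are invented.

```python
import re


def quality(row: dict) -> int:
    # Simplified stand-ins for the email/phone validators used above.
    score = 0
    if isinstance(row.get("email"), str) and "@" in row["email"]:
        score += 3
    digits = re.sub(r"\D", "", str(row.get("phone") or ""))
    if row.get("phone") in (None, "") or len(digits) == 10:
        score += 2
    if row.get("status") == "active":
        score += 1
    if isinstance(row.get("name"), str) and row["name"].strip():
        score += 1
    return score


keep = {"row_id": 1, "name": "Ada", "email": "ada@example.com", "phone": "5551230000", "status": "active"}
drop = {"row_id": 2, "name": "", "email": "not-an-email", "phone": "123", "status": "inactive"}
worst = min((keep, drop), key=lambda row: (quality(row), row["row_id"]))
print(worst["row_id"])  # 2: the lower-quality conflict row is deleted first
```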
644
+ def _structural_delete_candidates(table: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
645
+ """Generate delete actions for non-exact structural conflicts."""
646
+
647
+ actions: List[Dict[str, Any]] = []
648
+ for rows in _group_by_identifier(table).values():
649
+ if len(rows) < 2:
650
+ continue
651
+ signatures = {_row_signature(row) for row in rows}
652
+ if len(signatures) == 1:
653
+ continue
654
+ worst_row = sorted(
655
+ rows,
656
+ key=lambda row: (_row_quality_score(row), int(row.get("row_id", 0))),
657
+ )[0]
658
+ actions.append({"action_type": "delete_row", "row_id": int(worst_row["row_id"])})
659
+
660
+ email_groups: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
661
+ for row in table:
662
+ email = row.get("email")
663
+ if email not in (None, ""):
664
+ email_groups[str(email)].append(dict(row))
665
+ for rows in email_groups.values():
666
+ if len(rows) < 2:
667
+ continue
668
+ worst_row = sorted(
669
+ rows,
670
+ key=lambda row: (_row_quality_score(row), int(row.get("row_id", 0))),
671
+ )[0]
672
+ action = {"action_type": "delete_row", "row_id": int(worst_row["row_id"])}
673
+ if action not in actions:
674
+ actions.append(action)
675
+ return actions
676
+
677
+
678
+ def _missing_value_candidates(table: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
679
+ """Generate candidate fill actions for visible missing values."""
680
+
681
+ present_columns = {key for row in table for key in row.keys()}
682
+ priorities = [
683
+ column
684
+ for column in ["email", "city", "phone", "status", "name"]
685
+ if column in present_columns
686
+ ]
687
+ actions: List[Dict[str, Any]] = []
688
+ for column in priorities:
689
+ for row in table:
690
+ if _is_missing(row.get(column)):
691
+ actions.append(
692
+ {
693
+ "action_type": "fill_missing",
694
+ "row_id": int(row["row_id"]),
695
+ "column": column,
696
+ "value": _infer_fill_value(row, column, table),
697
+ }
698
+ )
699
+ return actions
700
+
701
+
702
+ def _normalization_candidates(table: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
703
+ """Generate candidate column-normalization actions."""
704
+
705
+ candidates: List[Dict[str, Any]] = []
706
+ if any(row.get("email") not in (None, "") and not _is_valid_email(row.get("email")) for row in table):
707
+ candidates.append({"action_type": "normalize_column", "column": "email"})
708
+ if any(row.get("phone") not in (None, "") and not _is_valid_phone(row.get("phone")) for row in table):
709
+ candidates.append({"action_type": "normalize_column", "column": "phone"})
710
+ if any(_needs_title_case(row.get("name")) for row in table):
711
+ candidates.append({"action_type": "normalize_column", "column": "name"})
712
+ if any(_needs_title_case(row.get("city")) for row in table):
713
+ candidates.append({"action_type": "normalize_column", "column": "city"})
714
+ return candidates
715
+
716
+
717
+ def propose_candidate_actions(
718
+ observation: Mapping[str, Any],
719
+ blocked_actions: set[str],
720
+ ) -> List[Dict[str, Any]]:
721
+ """Generate ranked candidate actions from visible table state."""
722
+
723
+ table = list(observation.get("table", []))
724
+ candidates = (
725
+ _exact_duplicate_candidates(table)
726
+ + _structural_delete_candidates(table)
727
+ + _missing_value_candidates(table)
728
+ + _normalization_candidates(table)
729
+ + [{"action_type": "validate"}]
730
+ + [{"action_type": "noop"}]
731
+ )
732
+
733
+ unique_candidates: List[Dict[str, Any]] = []
734
+ seen: set[str] = set()
735
+ for candidate in candidates:
736
+ action_text = _build_action_string(candidate)
737
+ if action_text in seen:
738
+ continue
739
+ seen.add(action_text)
740
+ unique_candidates.append(candidate)
741
+
742
+ preferred = _prefer_action(unique_candidates, blocked_actions)
743
+ preferred_text = _build_action_string(preferred)
744
+ ordered = [preferred] + [
745
+ candidate
746
+ for candidate in unique_candidates
747
+ if _build_action_string(candidate) != preferred_text
748
+ ]
749
+ return ordered[:8]
750
+
751
+
752
+ def _order_candidates_with_memory(
753
+ candidates: Sequence[Dict[str, Any]],
754
+ memory: PolicyMemory,
755
+ state_key: str,
756
+ pattern_key: str,
757
+ ) -> List[Dict[str, Any]]:
758
+ """Re-rank candidates using persistent cross-episode memory."""
759
+
760
+ scored = []
761
+ for index, candidate in enumerate(candidates):
762
+ action_text = _build_action_string(candidate)
763
+ scored.append(
764
+ (
765
+ -memory.score_action(state_key, pattern_key, action_text),
766
+ index,
767
+ dict(candidate),
768
+ )
769
+ )
770
+ scored.sort(key=lambda item: (item[0], item[1]))
771
+ return [item[2] for item in scored]
772
+
773
+
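The negated-score-plus-index pattern above gives a stable ranking: higher remembered scores come first, and ties fall back to the original heuristic order. A stand-alone illustration with made-up candidate strings and scores:

```python
# Candidate strings and memory scores are invented for the example.
candidates = ["delete_row(row_id=9)", "validate()", "noop()"]
scores = {"delete_row(row_id=9)": 1.5, "validate()": 1.5, "noop()": -2.0}

scored = [(-scores[text], index, text) for index, text in enumerate(candidates)]
scored.sort(key=lambda item: (item[0], item[1]))
print([text for _, _, text in scored])
# ['delete_row(row_id=9)', 'validate()', 'noop()']: ties keep heuristic order
```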
774
+ def model_action(
775
+ client: Optional[OpenAI],
776
+ model_name: str,
777
+ step: int,
778
+ goal: str,
779
+ observation: Dict[str, Any],
780
+ history: Sequence[str],
781
+ last_error: Optional[str],
782
+ candidate_actions: Sequence[str],
783
+ blocked_actions: Sequence[str],
784
+ ) -> Optional[str]:
785
+ """Ask the model to choose among pre-computed candidate actions."""
786
+
787
+ if client is None:
788
+ return None
789
+
790
+ prompt = build_user_prompt(
791
+ step=step,
792
+ goal=goal,
793
+ observation=observation,
794
+ history=history,
795
+ last_error=last_error,
796
+ candidate_actions=candidate_actions,
797
+ blocked_actions=blocked_actions,
798
+ )
799
+ current_prompt = prompt
800
+ candidate_set = set(candidate_actions)
801
+ for _ in range(MODEL_RETRIES):
802
+ try:
803
+ completion = client.chat.completions.create(
804
+ model=model_name,
805
+ messages=[
806
+ {
807
+ "role": "system",
808
+ "content": [{"type": "text", "text": SYSTEM_PROMPT}],
809
+ },
810
+ {
811
+ "role": "user",
812
+ "content": [{"type": "text", "text": current_prompt}],
813
+ },
814
+ ],
815
+ temperature=TEMPERATURE,
816
+ max_tokens=MAX_TOKENS,
817
+ stream=False,
818
+ )
819
+ response_text = _extract_response_text(completion.choices[0].message.content)
820
+ action_text = parse_model_action(response_text)
821
+ if action_text in candidate_set and action_text not in set(blocked_actions):
822
+ return action_text
823
+ current_prompt = (
824
+ prompt
825
+ + "\n\nYour previous answer was invalid or blocked. Choose exactly one action from the candidate list."
826
+ )
827
+ except Exception: # noqa: BLE001
828
+ return None
829
+ return None
830
+
831
+
832
+ def choose_action(
833
+ client: Optional[OpenAI],
834
+ memory: PolicyMemory,
835
+ task_name: str,
836
+ task_variant: str,
837
+ observation: Dict[str, Any],
838
+ goal: str,
839
+ step_number: int,
840
+ history: Sequence[str],
841
+ last_error: Optional[str],
842
+ blocked_actions: set[str],
843
+ ) -> Tuple[str, Dict[str, Any], str, str, str]:
844
+ """Choose the next action using a heuristic planner with optional model arbitration."""
845
+
846
+ state_key, pattern_key = build_memory_keys(task_name, task_variant, goal, observation)
847
+ memory_blocked = memory.blocked_actions(state_key, pattern_key)
848
+ combined_blocked = set(blocked_actions) | set(memory_blocked)
849
+ candidates = propose_candidate_actions(observation, combined_blocked)
850
+ candidates = _order_candidates_with_memory(candidates, memory, state_key, pattern_key)
851
+ heuristic_candidate = candidates[0]
852
+ heuristic_text = _build_action_string(heuristic_candidate)
853
+ candidate_texts = [_build_action_string(candidate) for candidate in candidates]
854
+
855
+ model_text = model_action(
856
+ client=client,
857
+ model_name=MODEL_NAME,
858
+ step=step_number,
859
+ goal=goal,
860
+ observation=observation,
861
+ history=history,
862
+ last_error=last_error,
863
+ candidate_actions=candidate_texts,
864
+ blocked_actions=sorted(combined_blocked),
865
+ )
866
+
867
+ chosen_text = model_text or heuristic_text
868
+ normalized_text, payload = action_string_to_payload(chosen_text, step_number)
869
+ if normalized_text in combined_blocked:
870
+ normalized_text, payload = action_string_to_payload(heuristic_text, step_number)
871
+ return normalized_text, payload, "heuristic", state_key, pattern_key
872
+ return normalized_text, payload, "model" if model_text else "heuristic", state_key, pattern_key
873
+
874
+
875
+ def run_episode(
876
+ client: Optional[OpenAI],
877
+ memory: PolicyMemory,
878
+ task_name: str,
879
+ seed: int,
880
+ ) -> float:
881
+ """Run one deterministic task episode and return its final task score."""
882
+
883
+ env = DataOpsEnv(seed=seed, task_name=task_name)
884
+ rewards: List[float] = []
885
+ history: List[str] = []
886
+ blocked_actions: set[str] = set()
887
+ steps_taken = 0
888
+ success = False
889
+ last_error: Optional[str] = None
890
+ final_score = 0.0
891
+ task_variant = "unknown"
892
+
893
+ try:
894
+ log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
895
+ observation_model = env.reset()
896
+ observation = observation_model.model_dump()
897
+ task_variant = str(env.state().get("task_variant", "unknown"))
898
+
899
+ for step_number in range(1, MAX_STEPS + 1):
900
+ action_text, action_payload, action_source, state_key, pattern_key = choose_action(
901
+ client=client,
902
+ memory=memory,
903
+ task_name=task_name,
904
+ task_variant=task_variant,
905
+ observation=observation,
906
+ goal=observation_model.goal,
907
+ step_number=step_number,
908
+ history=history,
909
+ last_error=last_error,
910
+ blocked_actions=blocked_actions,
911
+ )
912
+
913
+ try:
914
+ observation_model, reward, done, info = env.step(action_payload)
915
+ observation = observation_model.model_dump()
916
+ result = info.get("result", {})
917
+ progress_delta = float(result.get("progress_delta", 0.0))
918
+ error_value = result.get("error_type") or info.get("error") or None
919
+ final_score = float(info.get("task_score", 0.0))
920
+ if error_value == "general":
921
+ error_value = None
922
+ memory.update(
923
+ state_key=state_key,
924
+ pattern_key=pattern_key,
925
+ action_text=action_text,
926
+ reward=reward,
927
+ progress_delta=progress_delta,
928
+ error=error_value,
929
+ done=done,
930
+ task_score=final_score,
931
+ )
932
+ if error_value or progress_delta == 0.0 or reward <= 0.0:
933
+ blocked_actions.add(action_text)
934
+ except Exception as exc: # noqa: BLE001
935
+ reward = 0.0
936
+ done = True
937
+ info = {}
938
+ error_value = str(exc)
939
+ blocked_actions.add(action_text)
940
+ memory.update(
941
+ state_key=state_key,
942
+ pattern_key=pattern_key,
943
+ action_text=action_text,
944
+ reward=reward,
945
+ progress_delta=0.0,
946
+ error=error_value,
947
+ done=done,
948
+ task_score=final_score,
949
+ )
950
+
951
+ rewards.append(reward)
952
+ steps_taken = step_number
953
+ last_error = error_value
954
+ log_step(
955
+ step=step_number,
956
+ action=action_text,
957
+ reward=reward,
958
+ done=done,
959
+ error=error_value,
960
+ )
961
+
962
+ history.append(
963
+ f"step={step_number} source={action_source} action={action_text} "
964
+ f"reward={reward:.2f} done={str(done).lower()} error={error_value or 'null'}"
965
+ )
966
+
967
+ if done:
968
+ success = bool(final_score >= 0.95 and error_value is None)
969
+ break
970
+ finally:
971
+ memory.save()
972
+ close_method = getattr(env, "close", None)
973
+ if callable(close_method):
974
+ close_method()
975
+ log_end(success=success, steps=steps_taken, rewards=rewards)
976
+ return final_score
977
+
978
+
979
+ def main() -> None:
980
+ """Run all benchmark tasks with deterministic ordering and stdout formatting."""
981
+
982
+ client = create_client()
983
+ memory = PolicyMemory(POLICY_CACHE_PATH)
984
+ for task_index, task_name in enumerate(TASK_ORDER):
985
+ run_episode(client=client, memory=memory, task_name=task_name, seed=task_index)
986
+
987
+
988
+ if __name__ == "__main__":
989
+ main()
models.py ADDED
@@ -0,0 +1,130 @@
1
+ """Shared data models for ``dataops-gym``.
2
+
3
+ This module is responsible for defining typed request, response, and domain
4
+ schemas used across task execution, inference, grading, and server layers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Dict, List, Literal, Optional
10
+
11
+ from pydantic import BaseModel, Field, root_validator
12
+
13
+
14
+ class Action(BaseModel):
15
+ """Represents a single environment action issued by an agent or client."""
16
+
17
+ class Config:
18
+ arbitrary_types_allowed = True
19
+ extra = "forbid"
20
+
21
+ action_id: Optional[str] = Field(
22
+ default=None,
23
+ description="A unique identifier for the action instance, useful for tracking repeated actions and mistake patterns.",
24
+ )
25
+ action_type: Literal[
26
+ "remove_duplicate",
27
+ "fill_missing",
28
+ "normalize_column",
29
+ "delete_row",
30
+ "validate",
31
+ "noop",
32
+ ] = Field(
33
+ ...,
34
+ description="The type of data-cleaning action to apply in the environment.",
35
+ )
36
+ column: Optional[str] = Field(
37
+ default=None,
38
+ description="Optional target column name associated with the action.",
39
+ )
40
+ row_id: Optional[int] = Field(
41
+ default=None,
42
+ description="Optional target row identifier associated with the action.",
43
+ )
44
+ value: Optional[str] = Field(
45
+ default=None,
46
+ description="Optional value payload used by the action when needed.",
47
+ )
48
+
49
+ @root_validator(skip_on_failure=True)
50
+ def validate_action_requirements(cls, values: Dict[str, Any]) -> Dict[str, Any]:
51
+ """Enforce conditional field requirements for specific action types."""
52
+ action_type = values.get("action_type")
53
+ column = values.get("column")
54
+ row_id = values.get("row_id")
55
+ value = values.get("value")
56
+
57
+ if action_type == "delete_row" and row_id is None:
58
+ raise ValueError("row_id must not be None when action_type is 'delete_row'")
59
+ if action_type == "normalize_column" and column is None:
60
+ raise ValueError("column must not be None when action_type is 'normalize_column'")
61
+ if action_type == "fill_missing" and (column is None or value is None):
62
+ raise ValueError(
63
+ "column and value must not be None when action_type is 'fill_missing'"
64
+ )
65
+
66
+ return values
67
+
68
+
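A minimal sketch of the conditional-requirement pattern above, using the same v1-style `root_validator` API this module imports (it should also run under pydantic v2 through the deprecated compatibility shim, provided `skip_on_failure=True` is passed, as it is here). The `MiniAction` model is a cut-down illustration, not the real `Action` schema.

```python
from typing import Optional

from pydantic import BaseModel, ValidationError, root_validator


class MiniAction(BaseModel):
    action_type: str
    row_id: Optional[int] = None

    @root_validator(skip_on_failure=True)
    def check_requirements(cls, values):
        # delete_row is meaningless without a target row.
        if values.get("action_type") == "delete_row" and values.get("row_id") is None:
            raise ValueError("row_id must not be None when action_type is 'delete_row'")
        return values


MiniAction(action_type="delete_row", row_id=7)  # validates fine
try:
    MiniAction(action_type="delete_row")
except ValidationError as exc:
    print(exc.errors()[0]["msg"])  # surfaces the ValueError message
```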
69
+ class Observation(BaseModel):
70
+ """Represents the current observable state returned by the environment."""
71
+
72
+ class Config:
73
+ arbitrary_types_allowed = True
74
+ extra = "forbid"
75
+
76
+ goal: str = Field(
77
+ ...,
78
+ description="A natural language description of the task objective the agent must achieve.",
79
+ )
80
+ table: List[Dict[str, Any]] = Field(
81
+ ...,
82
+ description="JSON-serializable table snapshot represented as a list of row dictionaries.",
83
+ )
84
+ issues: List[str] = Field(
85
+ ...,
86
+ description="Detected data-quality issues currently present in the table.",
87
+ )
88
+ history: List[str] = Field(
89
+ ...,
90
+ description="Ordered list of previously applied actions or events.",
91
+ )
92
+ mistakes: Dict[str, int] = Field(
93
+ ...,
94
+ description="Counts of mistake categories accumulated during the episode.",
95
+ )
96
+ hints: List[str] = Field(
97
+ ...,
98
+ description="Optional guidance hints available to the agent or client.",
99
+ )
100
+ progress: float = Field(
101
+ ...,
102
+ ge=0.0,
103
+ le=1.0,
104
+ description="A normalized estimate (0.0–1.0) of how much of the task is completed.",
105
+ )
106
+ steps_remaining: int = Field(
107
+ ...,
108
+ description="Number of steps remaining before the episode terminates.",
109
+ )
110
+
111
+
112
+ class Reward(BaseModel):
113
+ """Represents the reward outcome associated with an environment step."""
114
+
115
+ class Config:
116
+ arbitrary_types_allowed = True
117
+ extra = "forbid"
118
+
119
+ reward: float = Field(
120
+ ...,
121
+ description="Numeric reward assigned to the most recent action or transition.",
122
+ )
123
+ reason: str = Field(
124
+ ...,
125
+ description="Human-readable explanation for why the reward was assigned.",
126
+ )
127
+ components: Dict[str, float] = Field(
128
+ ...,
129
+ description="Breakdown of reward contributions (e.g., duplicate_removal: 0.3, penalty: -0.1)",
130
+ )
openenv.yaml ADDED
@@ -0,0 +1,16 @@
1
+ # OpenEnv configuration for `dataops-env`.
2
+ # Responsibility: declare minimal environment metadata and task registration.
3
+
4
+ name: dataops-env
5
+ description: Multi-step enterprise data cleaning environment
6
+ version: "1.0.0"
7
+ runtime:
8
+ python: "3.10+"
9
+ entrypoint: env:DataOpsEnv
10
+ tasks:
11
+ easy:
12
+ factory: task:easy_cleaning_task
13
+ medium:
14
+ factory: task:medium_normalization_task
15
+ hard:
16
+ factory: task:hard_conflict_resolution_task
pyproject.toml ADDED
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dataops-env"
7
+ version = "1.0.0"
8
+ description = "Multi-step enterprise data cleaning OpenEnv environment."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "fastapi",
13
+ "numpy",
14
+ "openenv-core>=0.2.0",
15
+ "openai",
16
+ "pydantic",
17
+ "uvicorn",
18
+ ]
19
+
20
+ [project.scripts]
21
+ server = "server.app:main"
22
+
23
+ [tool.setuptools]
24
+ py-modules = ["env", "grader", "inference", "models", "task"]
requirements.txt ADDED
@@ -0,0 +1,6 @@
+fastapi
+numpy
+openenv-core>=0.2.0
+openai
+pydantic
+uvicorn
server/__init__.py ADDED
@@ -0,0 +1 @@
+"""Server package for the dataops OpenEnv environment."""
server/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (212 Bytes).
server/__pycache__/app.cpython-313.pyc ADDED
Binary file (7.12 kB).
server/app.py ADDED
@@ -0,0 +1,156 @@
+"""Application server bootstrap for ``dataops-env``.
+
+This module is responsible for exposing runtime APIs, health endpoints, and
+deployment-facing application setup for the environment.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+from pathlib import Path
+from threading import RLock
+from typing import Any, Dict, Optional
+
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+import uvicorn
+
+
+# Ensure the project root is importable when the app is launched as a script.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from env import DataOpsEnv
+from models import Action
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="dataops-env", version="1.0.0")
+active_env: Optional[DataOpsEnv] = None
+active_env_lock = RLock()
+
+
+class ResetRequest(BaseModel):
+    """Optional reset controls for reproducible task selection."""
+
+    seed: int = Field(default=0, description="Deterministic seed for task sampling.")
+    task_name: str | None = Field(
+        default=None,
+        description="Optional fixed task name: easy, medium, or hard.",
+    )
+
+
+@app.exception_handler(Exception)
+async def unhandled_exception_handler(
+    request: Request, exc: Exception
+) -> JSONResponse:
+    """Return a safe error payload for unexpected server failures."""
+
+    logger.exception("Unhandled server error on %s", request.url.path, exc_info=exc)
+    return JSONResponse(
+        status_code=500,
+        content={"detail": "Internal server error"},
+    )
+
+
+@app.get("/health")
+def health() -> Dict[str, str]:
+    """Return a lightweight deployment health signal."""
+
+    return {"status": "healthy"}
+
+
+@app.post("/reset")
+def reset(payload: ResetRequest | None = None) -> Dict[str, Any]:
+    """Reset the environment and return the initial observation."""
+
+    previous_env: Optional[DataOpsEnv] = None
+    try:
+        request = payload or ResetRequest()
+        env = DataOpsEnv(seed=request.seed, task_name=request.task_name)
+        observation = env.reset()
+
+        global active_env
+        with active_env_lock:
+            previous_env = active_env
+            active_env = env
+
+        return {
+            "task_name": env.state().get("task_name"),
+            "observation": observation.model_dump(),
+        }
+    except Exception as exc:
+        logger.exception("Failed to reset environment", exc_info=exc)
+        raise HTTPException(status_code=500, detail="Failed to reset environment") from exc
+    finally:
+        # Close whichever environment was swapped out, if any.
+        if previous_env is not None:
+            close_method = getattr(previous_env, "close", None)
+            if callable(close_method):
+                close_method()
+
+
+@app.post("/step")
+def step(action: Action) -> Dict[str, Any]:
+    """Apply a single action to the environment and return the step result."""
+
+    try:
+        with active_env_lock:
+            if active_env is None:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Environment not initialized. Call /reset first.",
+                )
+            observation, reward, done, info = active_env.step(action)
+
+        return {
+            "observation": observation.model_dump(),
+            "reward": reward,
+            "done": done,
+            "info": info,
+        }
+    except HTTPException:
+        raise
+    except RuntimeError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except ValueError as exc:
+        raise HTTPException(status_code=422, detail=str(exc)) from exc
+    except Exception as exc:
+        logger.exception("Failed to execute environment step", exc_info=exc)
+        raise HTTPException(status_code=500, detail="Failed to execute step") from exc
+
+
+@app.get("/state")
+def state() -> Dict[str, Any]:
+    """Return the current internal environment state as JSON."""
+
+    try:
+        with active_env_lock:
+            if active_env is None:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Environment not initialized. Call /reset first.",
+                )
+            return active_env.state()
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.exception("Failed to fetch environment state", exc_info=exc)
+        raise HTTPException(status_code=500, detail="Failed to fetch state") from exc
+
+
+def main() -> None:
+    """Run the FastAPI application with uvicorn."""
+
+    uvicorn.run(
+        "server.app:app",
+        host="0.0.0.0",
+        port=int(os.getenv("PORT", "7860")),
+    )
+
+
+if __name__ == "__main__":
+    main()
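A minimal client sketch against the endpoints above, assuming a local server on port 7860 and the `requests` package (which is not among the declared dependencies); the fill value is purely illustrative:

# Sketch: driving /reset and /step over HTTP; not part of the commit.
import requests

BASE = "http://localhost:7860"

reset_payload = requests.post(BASE + "/reset", json={"seed": 0, "task_name": "easy"}).json()
print(reset_payload["task_name"], reset_payload["observation"]["goal"])

# The /step body mirrors models.Action: fill_missing requires column and value.
step_payload = requests.post(
    BASE + "/step",
    json={"action_type": "fill_missing", "row_id": 2, "column": "city", "value": "Seattle"},
).json()
print(step_payload["reward"], step_payload["done"])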
task.py ADDED
@@ -0,0 +1,463 @@
+"""Task definitions for ``dataops-env``.
+
+This module defines the benchmark scenarios used by the OpenEnv environment.
+Each public task family keeps the hackathon-facing `easy` / `medium` / `hard`
+shape while internally supporting deterministic variants, so the benchmark is
+broader and less gameable.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, TypedDict
+
+
+TableRow = Dict[str, Any]
+
+
+class TaskDefinition(TypedDict):
+    """Typed structure returned by task factory functions."""
+
+    initial_table: List[TableRow]
+    hidden_issues: List[Dict[str, Any]]
+    constraints: List[str]
+    max_steps: int
+    goal: str
+    difficulty: str
+    required_columns: List[str]
+    expected_outcome: Dict[str, Any]
+    variant_id: str
+
+
+class HiddenIssue(TypedDict, total=False):
+    """Structured hidden issue description for a task table."""
+
+    type: str
+    rows: List[int]
+    row: int
+    column: str
+    constraint: str
+    description: str
+
+
+def _pick_variant(variant: int | None, variants: List[TaskDefinition]) -> TaskDefinition:
+    """Select a deterministic task variant with a stable default."""
+
+    # Clamp out-of-range indices instead of raising, so callers always get a
+    # valid variant even when they do not know the exact variant count.
+    index = 0 if variant is None else max(0, min(len(variants) - 1, int(variant)))
+    return variants[index]
+
+
+def easy_cleaning_task(variant: int | None = None) -> TaskDefinition:
+    """Create an easy multi-step cleaning task with duplicates and missing data."""
+
+    variants: List[TaskDefinition] = [
+        {
+            "goal": "Clean the dataset by removing duplicates and filling missing values.",
+            "difficulty": "easy",
+            "variant_id": "easy_customer_master",
+            "required_columns": ["name", "city", "email"],
+            "expected_outcome": {
+                "expected_row_count": 4,
+                "required_non_null_columns": ["name", "city", "email"],
+                "unique_by": ["customer_id"],
+                "exactly_one_of_rows": [[2, 3]],
+                "validation_rules": [
+                    "Exactly one of rows 2 or 3 should remain after deduplication.",
+                    "No remaining row should have null values in name, city, or email.",
+                    "All remaining customer_id values should be unique.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 1, "customer_id": "C001", "name": "Alice Wong", "city": "Seattle", "email": "alice@example.com"},
+                {"row_id": 2, "customer_id": "C002", "name": "Ben Ortiz", "city": None, "email": "ben@example.com"},
+                {"row_id": 3, "customer_id": "C002", "name": "Ben Ortiz", "city": None, "email": "ben@example.com"},
+                {"row_id": 4, "customer_id": "C003", "name": "Carla Singh", "city": "Austin", "email": None},
+                {"row_id": 5, "customer_id": "C004", "name": "Drew Park", "city": "Boston", "email": "drew@example.com"},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [2, 3],
+                    "description": "Rows 2 and 3 are duplicates and only one should remain.",
+                },
+                {
+                    "type": "missing_value",
+                    "row": 2,
+                    "column": "city",
+                    "description": "Row 2 is missing a required city value.",
+                },
+                {
+                    "type": "missing_value",
+                    "row": 4,
+                    "column": "email",
+                    "description": "Row 4 is missing a required email value.",
+                },
+            ],
+            "constraints": [
+                "Keep one representative row for each real customer.",
+                "Do not delete rows solely because they contain missing values.",
+                "Name, city, and email must be populated for every remaining row.",
+            ],
+            "max_steps": 7,
+        },
+        {
+            "goal": "Clean the dataset by removing duplicates and filling missing values.",
+            "difficulty": "easy",
+            "variant_id": "easy_vendor_onboarding",
+            "required_columns": ["name", "city", "email"],
+            "expected_outcome": {
+                "expected_row_count": 4,
+                "required_non_null_columns": ["name", "city", "email"],
+                "unique_by": ["vendor_id"],
+                "exactly_one_of_rows": [[32, 33]],
+                "validation_rules": [
+                    "Exactly one of rows 32 or 33 should remain after deduplication.",
+                    "No remaining row should have null values in name, city, or email.",
+                    "All remaining vendor_id values should be unique.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 31, "vendor_id": "V001", "name": "Northwind Foods", "city": "Denver", "email": "ops@northwind.example.com"},
+                {"row_id": 32, "vendor_id": "V002", "name": "Blue Harbor Ltd", "city": "Miami", "email": "contact@blueharbor.example.com"},
+                {"row_id": 33, "vendor_id": "V002", "name": "Blue Harbor Ltd", "city": "Miami", "email": "contact@blueharbor.example.com"},
+                {"row_id": 34, "vendor_id": "V003", "name": "Atlas Office Supply", "city": None, "email": "service@atlas.example.com"},
+                {"row_id": 35, "vendor_id": "V004", "name": "Peak Systems", "city": "Portland", "email": None},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [32, 33],
+                    "description": "Rows 32 and 33 are duplicates and only one should remain.",
+                },
+                {
+                    "type": "missing_value",
+                    "row": 34,
+                    "column": "city",
+                    "description": "Row 34 is missing a required city value.",
+                },
+                {
+                    "type": "missing_value",
+                    "row": 35,
+                    "column": "email",
+                    "description": "Row 35 is missing a required email value.",
+                },
+            ],
+            "constraints": [
+                "Keep one representative row for each real vendor.",
+                "Do not delete rows solely because they contain missing values.",
+                "Name, city, and email must be populated for every remaining row.",
+            ],
+            "max_steps": 7,
+        },
+    ]
+    return _pick_variant(variant, variants)
+
+
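# A concrete reading of the `expected_outcome` contract above: the checks a
# grader could run on a cleaned easy-task table. The `satisfies_outcome`
# helper is hypothetical; the commit's real checks live in grader.py (not shown).
from typing import Any, Dict, List

def satisfies_outcome(table: List[Dict[str, Any]], outcome: Dict[str, Any]) -> bool:
    # Row count must match exactly.
    if len(table) != outcome["expected_row_count"]:
        return False
    # Required columns must be populated in every remaining row.
    for column in outcome["required_non_null_columns"]:
        if any(row.get(column) is None for row in table):
            return False
    # Key columns must be unique across remaining rows.
    for key in outcome["unique_by"]:
        values = [row[key] for row in table]
        if len(values) != len(set(values)):
            return False
    # Exactly one row of each duplicate pair may survive.
    for pair in outcome["exactly_one_of_rows"]:
        if sum(1 for row in table if row["row_id"] in pair) != 1:
            return False
    return True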
+def medium_normalization_task(variant: int | None = None) -> TaskDefinition:
+    """Create a medium multi-step normalization task with several issue types."""
+
+    variants: List[TaskDefinition] = [
+        {
+            "goal": "Normalize the dataset by fixing casing, removing duplicates, and correcting invalid email formats.",
+            "difficulty": "medium",
+            "variant_id": "medium_customer_normalization",
+            "required_columns": ["name", "city", "email"],
+            "expected_outcome": {
+                "expected_row_count": 5,
+                "required_non_null_columns": ["name", "city", "email"],
+                "unique_by": ["customer_id"],
+                "normalized_columns": {"name": "title_case", "city": "title_case"},
+                "format_rules": {"email": "valid_email"},
+                "exactly_one_of_rows": [[11, 13]],
+                "validation_rules": [
+                    "Exactly one of rows 11 or 13 should remain after deduplication.",
+                    "All remaining emails should satisfy a valid email format.",
+                    "Names and cities should follow a consistent human-readable casing convention.",
+                    "All remaining customer_id values should be unique.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 10, "customer_id": "C100", "name": "jane miller", "city": "new york", "email": "jane.miller@example.com"},
+                {"row_id": 11, "customer_id": "C101", "name": "OMAR HASSAN", "city": "CHICAGO", "email": "omar.hassan[at]example.com"},
+                {"row_id": 12, "customer_id": "C102", "name": "Priya Nair", "city": "San Jose", "email": "priya.nair@example.com"},
+                {"row_id": 13, "customer_id": "C101", "name": "OMAR HASSAN", "city": "CHICAGO", "email": "omar.hassan[at]example.com"},
+                {"row_id": 14, "customer_id": "C103", "name": "li wei", "city": "seattle", "email": "li.wei.example.com"},
+                {"row_id": 15, "customer_id": "C104", "name": "Maria Gomez", "city": "Austin", "email": "maria.gomez@example.com"},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [11, 13],
+                    "description": "Rows 11 and 13 are duplicates and only one should remain.",
+                },
+                {
+                    "type": "inconsistent_casing",
+                    "rows": [10, 11, 14],
+                    "column": "name",
+                    "description": "Rows 10, 11, and 14 contain inconsistent casing in names.",
+                },
+                {
+                    "type": "inconsistent_casing",
+                    "rows": [10, 11, 14],
+                    "column": "city",
+                    "description": "Rows 10, 11, and 14 contain inconsistent casing in cities.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 11,
+                    "column": "email",
+                    "description": "Row 11 contains an invalid email format.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 14,
+                    "column": "email",
+                    "description": "Row 14 contains an invalid email format.",
+                },
+            ],
+            "constraints": [
+                "Preserve the original entity identity of each remaining row.",
+                "Normalize names and cities to a consistent human-readable casing style.",
+                "Only repair emails that are actually invalid.",
+                "Do not introduce new duplicates while normalizing values.",
+            ],
+            "max_steps": 9,
+        },
+        {
+            "goal": "Normalize the dataset by fixing casing, removing duplicates, and correcting invalid email formats.",
+            "difficulty": "medium",
+            "variant_id": "medium_partner_directory",
+            "required_columns": ["name", "city", "email"],
+            "expected_outcome": {
+                "expected_row_count": 5,
+                "required_non_null_columns": ["name", "city", "email"],
+                "unique_by": ["partner_id"],
+                "normalized_columns": {"name": "title_case", "city": "title_case"},
+                "format_rules": {"email": "valid_email"},
+                "exactly_one_of_rows": [[41, 43]],
+                "validation_rules": [
+                    "Exactly one of rows 41 or 43 should remain after deduplication.",
+                    "All remaining emails should satisfy a valid email format.",
+                    "Names and cities should use consistent title case.",
+                    "All remaining partner_id values should be unique.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 40, "partner_id": "P100", "name": "delta analytics", "city": "san francisco", "email": "hello@delta.example.com"},
+                {"row_id": 41, "partner_id": "P101", "name": "LUCIA ROMERO", "city": "MADRID", "email": "lucia.romero at example.com"},
+                {"row_id": 42, "partner_id": "P102", "name": "Ken Ito", "city": "Tokyo", "email": "ken.ito@example.com"},
+                {"row_id": 43, "partner_id": "P101", "name": "LUCIA ROMERO", "city": "MADRID", "email": "lucia.romero at example.com"},
+                {"row_id": 44, "partner_id": "P103", "name": "amina ali", "city": "dubai", "email": "amina.ali.example.com"},
+                {"row_id": 45, "partner_id": "P104", "name": "Sofia Hart", "city": "London", "email": "sofia.hart@example.com"},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [41, 43],
+                    "description": "Rows 41 and 43 are duplicates and only one should remain.",
+                },
+                {
+                    "type": "inconsistent_casing",
+                    "rows": [40, 41, 44],
+                    "column": "name",
+                    "description": "Rows 40, 41, and 44 contain inconsistent casing in names.",
+                },
+                {
+                    "type": "inconsistent_casing",
+                    "rows": [40, 41, 44],
+                    "column": "city",
+                    "description": "Rows 40, 41, and 44 contain inconsistent casing in cities.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 41,
+                    "column": "email",
+                    "description": "Row 41 contains an invalid email format.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 44,
+                    "column": "email",
+                    "description": "Row 44 contains an invalid email format.",
+                },
+            ],
+            "constraints": [
+                "Preserve the original entity identity of each remaining row.",
+                "Normalize names and cities to a consistent human-readable casing style.",
+                "Only repair emails that are actually invalid.",
+                "Do not introduce new duplicates while normalizing values.",
+            ],
+            "max_steps": 9,
+        },
+    ]
+    return _pick_variant(variant, variants)
+
+
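# The medium tasks lean on two informal rules: "title_case" in
# `normalized_columns` and "valid_email" in `format_rules`. One plausible
# reading of those rules; the regex is an assumption, not taken from the commit.
import re

EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")

def is_title_case(value: str) -> bool:
    # "Priya Nair" passes; "jane miller" and "OMAR HASSAN" do not.
    return value == value.title()

def is_valid_email(value: str) -> bool:
    # Rejects "omar.hassan[at]example.com" and "li.wei.example.com",
    # while accepting plus-addresses such as "q.xu+vip@example.com".
    return EMAIL_RE.match(value) is not None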
+def hard_conflict_resolution_task(variant: int | None = None) -> TaskDefinition:
+    """Create a hard multi-step conflict-resolution task with deceptive records."""
+
+    variants: List[TaskDefinition] = [
+        {
+            "goal": "Resolve conflicting records, enforce unique email constraints, fix invalid formats, and preserve valid but unusual data.",
+            "difficulty": "hard",
+            "variant_id": "hard_customer_conflicts",
+            "required_columns": ["name", "email", "phone", "status"],
+            "expected_outcome": {
+                "expected_row_count_range": {"min": 5, "max": 6},
+                "unique_by": ["email"],
+                "format_rules": {"email": "valid_email", "phone": "normalized_phone"},
+                "exactly_one_of_rows": [[21, 22], [23, 24], [26, 27]],
+                "must_preserve_valid_rows": [25, 28],
+                "validation_rules": [
+                    "Exactly one of rows 21 or 22 should remain after deduplication.",
+                    "Exactly one of rows 23 or 24 should remain after conflict resolution.",
+                    "Exactly one of rows 26 or 27 should remain after enforcing email uniqueness.",
+                    "No two remaining rows should share the same email address.",
+                    "All remaining emails should satisfy a valid email format.",
+                    "All remaining phone values should be normalized to a consistent valid format.",
+                    "Rows 25 and 28 should remain logically unchanged because they are valid trap rows.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 21, "customer_id": "C200", "name": "Nina Patel", "email": "nina.patel@example.com", "phone": "206-555-0101", "status": "active"},
+                {"row_id": 22, "customer_id": "C200", "name": "Nina Patel", "email": "nina.patel@example.com", "phone": "206-555-0101", "status": "active"},
+                {"row_id": 23, "customer_id": "C201", "name": "Evan Cole", "email": "evan.cole@example", "phone": "4155550102", "status": "active"},
+                {"row_id": 24, "customer_id": "C201", "name": "Evan Cole", "email": "evan.cole@example.com", "phone": "(415) 555-0102", "status": "inactive"},
+                {"row_id": 25, "customer_id": "C202", "name": "A. J. Brown", "email": "aj.brown@example.com", "phone": "+1-312-555-0103", "status": "active"},
+                {"row_id": 26, "customer_id": "C203", "name": "Marta Silva", "email": "shared@example.com", "phone": "646-555-0104", "status": "active"},
+                {"row_id": 27, "customer_id": "C204", "name": "Martin Silva", "email": "shared@example.com", "phone": "646-555-0105", "status": "active"},
+                {"row_id": 28, "customer_id": "C205", "name": "Q Xu", "email": "q.xu+vip@example.com", "phone": "917-555-0106", "status": "active"},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [21, 22],
+                    "description": "Rows 21 and 22 are exact duplicates and only one should remain.",
+                },
+                {
+                    "type": "conflict",
+                    "rows": [23, 24],
+                    "description": "Rows 23 and 24 conflict for the same customer and must be reconciled into one trustworthy record.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 23,
+                    "column": "email",
+                    "description": "Row 23 contains an invalid email format.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 23,
+                    "column": "phone",
+                    "description": "Row 23 contains an invalid phone format.",
+                },
+                {
+                    "type": "constraint_violation",
+                    "constraint": "unique_email",
+                    "rows": [26, 27],
+                    "description": "Rows 26 and 27 violate the unique email constraint.",
+                },
+                {
+                    "type": "valid_trap",
+                    "row": 28,
+                    "description": "Row 28 is valid even though the plus-address format may look suspicious.",
+                },
+                {
+                    "type": "valid_trap",
+                    "row": 25,
+                    "description": "Row 25 is valid even though the name abbreviation may look inconsistent.",
+                },
+            ],
+            "constraints": [
+                "Email values must be unique across the final table.",
+                "Every remaining row must represent a single coherent customer record.",
+                "Do not modify valid rows just because they look unusual.",
+                "Prefer correction and conflict resolution over unnecessary deletion.",
+            ],
+            "max_steps": 14,
+        },
+        {
+            "goal": "Resolve conflicting records, enforce unique email constraints, fix invalid formats, and preserve valid but unusual data.",
+            "difficulty": "hard",
+            "variant_id": "hard_account_merges",
+            "required_columns": ["name", "email", "phone", "status"],
+            "expected_outcome": {
+                "expected_row_count_range": {"min": 5, "max": 6},
+                "unique_by": ["email"],
+                "format_rules": {"email": "valid_email", "phone": "normalized_phone"},
+                "exactly_one_of_rows": [[51, 52], [53, 54], [56, 57]],
+                "must_preserve_valid_rows": [55, 58],
+                "validation_rules": [
+                    "Exactly one of rows 51 or 52 should remain after deduplication.",
+                    "Exactly one of rows 53 or 54 should remain after conflict resolution.",
+                    "Exactly one of rows 56 or 57 should remain after enforcing email uniqueness.",
+                    "No two remaining rows should share the same email address.",
+                    "All remaining emails should satisfy a valid email format.",
+                    "All remaining phone values should be normalized to a consistent valid format.",
+                    "Rows 55 and 58 should remain logically unchanged because they are valid trap rows.",
+                ],
+            },
+            "initial_table": [
+                {"row_id": 51, "customer_id": "A900", "name": "Lena Brooks", "email": "lena.brooks@example.com", "phone": "212-555-0111", "status": "active"},
+                {"row_id": 52, "customer_id": "A900", "name": "Lena Brooks", "email": "lena.brooks@example.com", "phone": "212-555-0111", "status": "active"},
+                {"row_id": 53, "customer_id": "A901", "name": "Ravi Shah", "email": "ravi.shah example.com", "phone": "6465550112", "status": "active"},
+                {"row_id": 54, "customer_id": "A901", "name": "Ravi Shah", "email": "ravi.shah@example.com", "phone": "646-555-0112", "status": "inactive"},
+                {"row_id": 55, "customer_id": "A902", "name": "M. E. Klein", "email": "mek@example.com", "phone": "+1-303-555-0113", "status": "active"},
+                {"row_id": 56, "customer_id": "A903", "name": "Sana Noor", "email": "ops@example.com", "phone": "718-555-0114", "status": "active"},
+                {"row_id": 57, "customer_id": "A904", "name": "Sana N.", "email": "ops@example.com", "phone": "718-555-0115", "status": "active"},
+                {"row_id": 58, "customer_id": "A905", "name": "Bo Li", "email": "bo.li+archive@example.com", "phone": "415-555-0116", "status": "active"},
+            ],
+            "hidden_issues": [
+                {
+                    "type": "duplicate",
+                    "rows": [51, 52],
+                    "description": "Rows 51 and 52 are exact duplicates and only one should remain.",
+                },
+                {
+                    "type": "conflict",
+                    "rows": [53, 54],
+                    "description": "Rows 53 and 54 conflict for the same customer and must be reconciled into one trustworthy record.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 53,
+                    "column": "email",
+                    "description": "Row 53 contains an invalid email format.",
+                },
+                {
+                    "type": "invalid_format",
+                    "row": 53,
+                    "column": "phone",
+                    "description": "Row 53 contains an invalid phone format.",
+                },
+                {
+                    "type": "constraint_violation",
+                    "constraint": "unique_email",
+                    "rows": [56, 57],
+                    "description": "Rows 56 and 57 violate the unique email constraint.",
+                },
+                {
+                    "type": "valid_trap",
+                    "row": 55,
+                    "description": "Row 55 is valid even though the abbreviated name may look unusual.",
+                },
+                {
+                    "type": "valid_trap",
+                    "row": 58,
+                    "description": "Row 58 is valid even though the plus-address format may look suspicious.",
+                },
+            ],
+            "constraints": [
+                "Email values must be unique across the final table.",
+                "Every remaining row must represent a single coherent customer record.",
+                "Do not modify valid rows just because they look unusual.",
+                "Prefer correction and conflict resolution over unnecessary deletion.",
+            ],
+            "max_steps": 14,
+        },
+    ]
+    return _pick_variant(variant, variants)
+
+
+easy_cleaning_task.variant_count = 2
+medium_normalization_task.variant_count = 2
+hard_conflict_resolution_task.variant_count = 2
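Because `_pick_variant` clamps out-of-range indices and each factory advertises its `variant_count`, every registered variant can be enumerated deterministically, e.g.:

# Sketch: sweep all variants of each task family (uses only the code shown above).
from task import easy_cleaning_task, hard_conflict_resolution_task, medium_normalization_task

for factory in (easy_cleaning_task, medium_normalization_task, hard_conflict_resolution_task):
    for index in range(factory.variant_count):
        definition = factory(index)
        print(definition["difficulty"], definition["variant_id"], definition["max_steps"])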
utils/__init__.py ADDED
@@ -0,0 +1 @@
+"""Utility package for the dataops OpenEnv environment."""
utils/__pycache__/helpers.cpython-313.pyc ADDED
Binary file (396 Bytes).
utils/helpers.py ADDED
@@ -0,0 +1,11 @@
+"""Utility helpers for ``dataops-env``.
+
+This module is responsible for small shared helper functions that support the
+environment without owning core business or orchestration logic.
+"""
+
+from __future__ import annotations
+
+
+# TODO: Add reusable helper utilities with clear, narrow responsibilities.
+# TODO: Avoid placing core domain logic in shared helpers.
uv.lock ADDED
The diff for this file is too large to render.