avanigupta Claude Opus 4.6 (1M context) committed on
Commit
8560706
·
1 Parent(s): 887c1aa

fix moderation issue row collisions and verify all data

Browse files

- Move out_of_range issue to row 13 (was colliding with row 29 duplicate)
- Move duplicate to row 30 (clean separation)
- Full audit: 46 issues across 5 tasks all verified correct
- All moderation label flips are deterministic
- 128 tests passing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

dataqa_env/server/gradio_ui.py CHANGED
@@ -168,16 +168,16 @@ AGENT_TRAJECTORIES = {
168
  "row:16,col:hate,issue:inconsistent_value",
169
  "row:17,col:harassment,issue:inconsistent_value",
170
  "row:20,col:violence,issue:inconsistent_value",
171
- "row:11,col:hate,issue:out_of_range",
172
  "row:15,col:text,issue:missing_value",
173
- "row:29,col:text,issue:duplicate_row",
174
  ],
175
  "fixes": [],
176
  },
177
  {
178
  "issues": [
179
  "row:5,col:hate,issue:inconsistent_value",
180
- "row:11,col:hate,issue:out_of_range",
181
  "row:15,col:text,issue:missing_value",
182
  "row:16,col:hate,issue:inconsistent_value",
183
  "row:17,col:harassment,issue:inconsistent_value",
@@ -185,7 +185,7 @@ AGENT_TRAJECTORIES = {
185
  "row:22,col:self_harm,issue:inconsistent_value",
186
  "row:24,col:hate,issue:inconsistent_value",
187
  "row:26,col:harassment,issue:inconsistent_value",
188
- "row:29,col:text,issue:duplicate_row",
189
  ],
190
  "fixes": [
191
  # All deterministic label flips:
@@ -196,7 +196,7 @@ AGENT_TRAJECTORIES = {
196
  "row:5,col:hate,fix:0", # false positive on idiom β†’ unflag
197
  "row:24,col:hate,fix:1", # subset rule: hate_threatening needs hate
198
  "row:26,col:harassment,fix:0", # false positive on moral judgment β†’ unflag
199
- "row:11,col:hate,fix:0", # out of range 2 β†’ 0
200
  ],
201
  },
202
  ],
 
168
  "row:16,col:hate,issue:inconsistent_value",
169
  "row:17,col:harassment,issue:inconsistent_value",
170
  "row:20,col:violence,issue:inconsistent_value",
171
+ "row:13,col:violence,issue:out_of_range",
172
  "row:15,col:text,issue:missing_value",
173
+ "row:30,col:text,issue:duplicate_row",
174
  ],
175
  "fixes": [],
176
  },
177
  {
178
  "issues": [
179
  "row:5,col:hate,issue:inconsistent_value",
180
+ "row:13,col:violence,issue:out_of_range",
181
  "row:15,col:text,issue:missing_value",
182
  "row:16,col:hate,issue:inconsistent_value",
183
  "row:17,col:harassment,issue:inconsistent_value",
 
185
  "row:22,col:self_harm,issue:inconsistent_value",
186
  "row:24,col:hate,issue:inconsistent_value",
187
  "row:26,col:harassment,issue:inconsistent_value",
188
+ "row:30,col:text,issue:duplicate_row",
189
  ],
190
  "fixes": [
191
  # All deterministic label flips:
 
196
  "row:5,col:hate,fix:0", # false positive on idiom β†’ unflag
197
  "row:24,col:hate,fix:1", # subset rule: hate_threatening needs hate
198
  "row:26,col:harassment,fix:0", # false positive on moral judgment β†’ unflag
199
+ "row:13,col:violence,fix:0", # out of range 3 β†’ 0
200
  ],
201
  },
202
  ],
dataqa_env/server/tasks.py CHANGED
@@ -929,10 +929,10 @@ def create_task_moderation(seed: int = 42) -> Task:
929
  difficulty=1.5))
930
 
931
  # Issue 7: Label value out of range (deterministic: must be 0 or 1)
932
- r = 10 # row 11
933
- data[r][3] = "2" # hate=2 — invalid, must be 0 or 1
934
- issues.append(PlantedIssue(row=r + 1, col="hate", issue_type="out_of_range",
935
- description="hate label is 2 but must be 0 or 1 β€” should be 0 (text is about immigration policy not hate speech)",
936
  difficulty=1.0))
937
 
938
  # Issue 8: Empty text field (data pipeline artifact)
@@ -943,16 +943,15 @@ def create_task_moderation(seed: int = 42) -> Task:
943
  difficulty=1.5))
944
 
945
  # Issue 9: Duplicate text (dedup failure)
946
- r = 28 # row 29, copy text from row 4
947
- data[r][1] = data[3][1] # copy from row 4
948
  issues.append(PlantedIssue(row=r + 1, col="text", issue_type="duplicate_row",
949
  description="Text is identical to row 4 β€” duplicate moderation example",
950
  difficulty=1.5))
951
 
952
- # Issue 10: Restaurant review falsely flagged as harassment
953
- r = 28 # already changed text above, use different row
954
- r = 25 # row 26, "ashamed of themselves" — strong language but not harassment
955
- data[r][5] = "1" # harassment=1 but it's moral judgment, not harassment
956
  issues.append(PlantedIssue(row=r + 1, col="harassment", issue_type="inconsistent_value",
957
  description="Text expresses moral judgment ('should be ashamed') but is not harassment β€” harassment should be 0",
958
  difficulty=2.5))
 
929
  difficulty=1.5))
930
 
931
  # Issue 7: Label value out of range (deterministic: must be 0 or 1)
932
+ r = 12 # row 13, neutral text
933
+ data[r][4] = "3" # violence=3 — invalid, must be 0 or 1
934
+ issues.append(PlantedIssue(row=r + 1, col="violence", issue_type="out_of_range",
935
+ description="violence label is 3 but must be 0 or 1 β€” should be 0 (text is about adjusting strategy not violence)",
936
  difficulty=1.0))
937
 
938
  # Issue 8: Empty text field (data pipeline artifact)
 
943
  difficulty=1.5))
944
 
945
  # Issue 9: Duplicate text (dedup failure)
946
+ r = 29 # row 30, copy text from row 4
947
+ data[r][1] = data[3][1] # copy from row 4 ("respectful of other people")
948
  issues.append(PlantedIssue(row=r + 1, col="text", issue_type="duplicate_row",
949
  description="Text is identical to row 4 β€” duplicate moderation example",
950
  difficulty=1.5))
951
 
952
+ # Issue 10: Moral judgment falsely flagged as harassment
953
+ r = 25 # row 26, "ashamed of themselves" — strong but not harassment
954
+ data[r][5] = "1" # harassment=1 but it's moral judgment
 
955
  issues.append(PlantedIssue(row=r + 1, col="harassment", issue_type="inconsistent_value",
956
  description="Text expresses moral judgment ('should be ashamed') but is not harassment β€” harassment should be 0",
957
  difficulty=2.5))