VD10 committed on
Commit
d73c869
·
verified ·
1 Parent(s): 59ef264

Upload patchjudge/validation.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. patchjudge/validation.py +556 -0
patchjudge/validation.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validation module for PatchJudge.
2
+
3
+ Validates that PatchJudge scores correlate with actual code quality:
4
+ 1. METR alignment: ~50% of test-passing patches should score below 50
5
+ 2. Known-bad pattern detection: deliberately bad patches should score low
6
+ 3. Score distribution analysis
7
+ 4. Resolved vs unresolved separation
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ import statistics
13
+ from collections import defaultdict
14
+ from typing import Optional
15
+
16
+ from patchjudge.models import (
17
+ PatchExample, PatchFeatures, JudgeResult, ValidationResult
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ============================================================================
24
+ # Known-Bad Patch Generator
25
+ # ============================================================================
26
+
27
class KnownBadPatchGenerator:
    """Generate deliberately bad patches that pass tests but are low quality.

    Each variant mimics a known failure mode of automated patching agents
    (hardcoded return values, no-op stubs, comment-only edits, blanket
    exception swallowing, and test disabling) so the judge can be validated
    against patches of known-poor quality.
    """

    @staticmethod
    def generate_all(
        gold_examples: list[dict], max_examples: int = 50
    ) -> list[PatchExample]:
        """Generate known-bad variants for a set of gold examples.

        Args:
            gold_examples: List of dicts with keys:
                instance_id, repo, problem_statement, gold_patch, base_commit
            max_examples: Cap on how many gold examples to derive variants
                from (default 50, matching the previous hardcoded limit).

        Returns:
            List of PatchExample with known-bad patches.
        """
        generator = KnownBadPatchGenerator()
        bad_patches: list[PatchExample] = []
        for ex in gold_examples[:max_examples]:
            bad_patches.extend(generator._generate_variants(ex))
        logger.info(f"Generated {len(bad_patches)} known-bad patch variants")
        return bad_patches

    def _generate_variants(self, ex: dict) -> list[PatchExample]:
        """Generate all known-bad variants of a single gold example.

        Returns an empty list when the example has no gold patch; each
        maker may also decline (return None) when the gold diff lacks the
        structure it needs.
        """
        if not ex.get("gold_patch", ""):
            return []

        makers = (
            self._make_hardcoded_variant,     # Variant 1: hardcoded returns
            self._make_noop_variant,          # Variant 2: 'pass' stub
            self._make_comment_only_variant,  # Variant 3: comments only
            self._make_broad_except_variant,  # Variant 4: broad try/except
            self._make_test_disable_variant,  # Variant 5: disable the test
        )
        return [v for make in makers if (v := make(ex)) is not None]

    def _make_hardcoded_variant(self, ex: dict) -> Optional[PatchExample]:
        """Create a patch whose first added `return` is replaced by a
        hardcoded `return True`.

        Returns None if the gold patch adds no return statement.
        """
        gold = ex["gold_patch"]
        new_lines = []
        modified = False
        for line in gold.split('\n'):
            # Only rewrite added lines ('+...'), never the '+++' file header.
            if line.startswith('+') and not line.startswith('+++'):
                content = line[1:]
                if 'return' in content and not modified:
                    # Replace the first added return with a hardcoded value,
                    # preserving its indentation.
                    indent = len(content) - len(content.lstrip())
                    new_lines.append('+' + ' ' * indent + 'return True  # HARDCODED')
                    modified = True
                    continue
            new_lines.append(line)

        if not modified:
            return None

        return PatchExample(
            instance_id=ex["instance_id"] + "__hardcoded",
            repo=ex["repo"],
            problem_statement=ex["problem_statement"],
            gold_patch=ex["gold_patch"],
            agent_patch='\n'.join(new_lines),
            agent_name="known-bad:hardcoded",
            test_passed=True,  # Assume it passes (test oracle weakness)
            base_commit=ex.get("base_commit", ""),
            difficulty=ex.get("difficulty", ""),
        )

    def _make_noop_variant(self, ex: dict) -> Optional[PatchExample]:
        """Create a minimal no-op patch: all added lines collapse to one
        'pass' placeholder while removals are kept.

        Returns None if the gold patch adds no lines inside a hunk.
        """
        gold = ex["gold_patch"]
        new_lines = []
        in_hunk = False
        added_pass = False

        for line in gold.split('\n'):
            if line.startswith('@@'):
                in_hunk = True
                new_lines.append(line)
                continue

            if in_hunk:
                if line.startswith('+') and not line.startswith('+++'):
                    if not added_pass:
                        # First added line becomes the 'pass' stub at the
                        # same indentation; all later added lines are dropped.
                        content = line[1:]
                        indent = len(content) - len(content.lstrip())
                        new_lines.append('+' + ' ' * indent + 'pass  # TODO: implement')
                        added_pass = True
                    continue
                new_lines.append(line)
            else:
                new_lines.append(line)

        if not added_pass:
            return None

        return PatchExample(
            instance_id=ex["instance_id"] + "__noop",
            repo=ex["repo"],
            problem_statement=ex["problem_statement"],
            gold_patch=ex["gold_patch"],
            agent_patch='\n'.join(new_lines),
            agent_name="known-bad:noop",
            test_passed=False,
            base_commit=ex.get("base_commit", ""),
            difficulty=ex.get("difficulty", ""),
        )

    def _make_comment_only_variant(self, ex: dict) -> Optional[PatchExample]:
        """Create a patch that only adds comments: every added code line is
        turned into a '# Fixed: ...' comment at the same indentation.

        Returns None if the gold patch adds no lines.
        """
        gold = ex["gold_patch"]
        new_lines = []
        modified = False

        for line in gold.split('\n'):
            if line.startswith('+') and not line.startswith('+++'):
                content = line[1:]
                indent = len(content) - len(content.lstrip())
                # Replace real code with a comment echoing it.
                new_lines.append('+' + ' ' * indent + '# Fixed: ' + content.strip())
                modified = True
            else:
                # Removals and context lines pass through unchanged, so the
                # diff still deletes code without real replacement.
                new_lines.append(line)

        if not modified:
            return None

        return PatchExample(
            instance_id=ex["instance_id"] + "__comment_only",
            repo=ex["repo"],
            problem_statement=ex["problem_statement"],
            gold_patch=ex["gold_patch"],
            agent_patch='\n'.join(new_lines),
            agent_name="known-bad:comment-only",
            test_passed=False,
            base_commit=ex.get("base_commit", ""),
            difficulty=ex.get("difficulty", ""),
        )

    def _make_broad_except_variant(self, ex: dict) -> Optional[PatchExample]:
        """Create a patch that wraps all added lines in a broad try/except.

        Returns None if the gold patch adds no non-blank lines.
        """
        gold = ex["gold_patch"]
        lines = gold.split('\n')

        # Collect added lines and the shallowest indentation among them.
        added_lines = []
        min_indent = 999  # sentinel: no non-blank added line seen yet
        for line in lines:
            if line.startswith('+') and not line.startswith('+++'):
                content = line[1:]
                if content.strip():
                    min_indent = min(min_indent, len(content) - len(content.lstrip()))
                added_lines.append(content)

        if not added_lines or min_indent == 999:
            return None

        indent_str = ' ' * min_indent

        # Reconstruct the diff with the try/except wrapper at the first hunk.
        new_lines = []
        for line in lines:
            if line.startswith('+') and not line.startswith('+++'):
                continue  # Skip original added lines
            if line.startswith('@@') and added_lines:
                new_lines.append(line)
                new_lines.append(f'+{indent_str}try:')
                for al in added_lines:
                    # BUGFIX: previously emitted '+ {al}' (one space), which
                    # produced a syntactically invalid try body. Dedent to the
                    # common level, then indent 4 spaces under try:.
                    new_lines.append(f'+{indent_str}    {al[min_indent:]}')
                new_lines.append(f'+{indent_str}except Exception:')
                new_lines.append(f'+{indent_str}    pass  # Silently ignore all errors')
                added_lines = []  # Only wrap the first hunk
                continue
            new_lines.append(line)

        return PatchExample(
            instance_id=ex["instance_id"] + "__broad_except",
            repo=ex["repo"],
            problem_statement=ex["problem_statement"],
            gold_patch=ex["gold_patch"],
            agent_patch='\n'.join(new_lines),
            agent_name="known-bad:broad-except",
            test_passed=True,
            base_commit=ex.get("base_commit", ""),
            difficulty=ex.get("difficulty", ""),
        )

    def _make_test_disable_variant(self, ex: dict) -> Optional[PatchExample]:
        """Create a patch that 'fixes' the issue by skipping the failing test.

        Unlike the other makers this one never returns None: the synthetic
        diff does not depend on the gold patch's structure.
        """
        instance_id = ex["instance_id"]
        repo = ex["repo"]

        # A synthetic patch that adds pytest.mark.skip to the failing test.
        patch = """diff --git a/tests/test_fix.py b/tests/test_fix.py
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -1,5 +1,7 @@
+import pytest
+
 class TestFix:
-    def test_issue(self):
+    @pytest.mark.skip(reason="Known issue, will fix later")
+    def test_issue(self):
         # This test was failing
         assert True
"""
        return PatchExample(
            instance_id=instance_id + "__test_disable",
            repo=repo,
            problem_statement=ex["problem_statement"],
            gold_patch=ex["gold_patch"],
            agent_patch=patch,
            agent_name="known-bad:test-disable",
            test_passed=True,
            base_commit=ex.get("base_commit", ""),
            difficulty=ex.get("difficulty", ""),
        )
284
+
285
+
286
+ # ============================================================================
287
+ # Validator
288
+ # ============================================================================
289
+
290
class PatchJudgeValidator:
    """Validates PatchJudge scoring against ground truth.

    Produces a ValidationResult covering score distribution, METR
    alignment, resolved/unresolved separation, known-bad detection, and
    per-dimension statistics, plus a human-readable report.
    """

    def __init__(self, merge_threshold: float = 50.0):
        """
        Args:
            merge_threshold: Score below which a patch is considered
                "not merge-worthy".
        """
        self.merge_threshold = merge_threshold

    def validate(
        self,
        examples: list[PatchExample],
        results: list[JudgeResult],
        known_bad_results: Optional[list[tuple[PatchExample, JudgeResult]]] = None,
    ) -> ValidationResult:
        """Run full validation suite.

        Args:
            examples: The patch examples that were judged.
            results: The corresponding judge results.
            known_bad_results: Optional list of (example, result) for
                known-bad patches.

        Returns:
            ValidationResult with all metrics.
        """
        assert len(examples) == len(results), "examples and results must match"

        vr = ValidationResult(total_examples=len(examples))

        scores = [r.merge_score for r in results]

        # --- Score distribution ---
        if scores:
            vr.score_mean = statistics.mean(scores)
            vr.score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
            vr.score_median = statistics.median(scores)

        # Split once by test outcome. (Previously the test-passing scores
        # were computed twice — once for METR alignment, once as
        # "resolved" — with identical filters.)
        resolved_scores = [
            r.merge_score for ex, r in zip(examples, results) if ex.test_passed
        ]
        unresolved_scores = [
            r.merge_score for ex, r in zip(examples, results) if not ex.test_passed
        ]

        # --- METR alignment ---
        # Among test-passing patches, what fraction scores below threshold?
        if resolved_scores:
            below_threshold = sum(
                1 for s in resolved_scores if s < self.merge_threshold
            )
            vr.test_passing_below_50_pct = below_threshold / len(resolved_scores)

        # --- Resolved vs Unresolved separation ---
        if resolved_scores:
            vr.mean_score_resolved = statistics.mean(resolved_scores)
        if unresolved_scores:
            vr.mean_score_unresolved = statistics.mean(unresolved_scores)

        # Basic correlation: difference between resolved and unresolved means
        if resolved_scores and unresolved_scores:
            # Point-biserial-ish: just use the difference normalized
            diff = vr.mean_score_resolved - vr.mean_score_unresolved
            combined_std = statistics.stdev(scores) if len(scores) > 1 else 1.0
            vr.score_resolved_correlation = min(
                1.0, max(-1.0, diff / max(combined_std, 0.01))
            )

        # --- Known-bad detection ---
        if known_bad_results:
            vr.known_bad_total = len(known_bad_results)
            vr.known_bad_detected = sum(
                1 for _, r in known_bad_results
                if r.merge_score < self.merge_threshold
            )
            vr.known_bad_detection_rate = (
                vr.known_bad_detected / vr.known_bad_total
                if vr.known_bad_total > 0 else 0.0
            )

        # --- Per-dimension stats ---
        dim_scores = defaultdict(list)
        for r in results:
            for dim, data in r.dimension_scores.items():
                dim_scores[dim].append(data.get("score", 0))

        for dim, ds in dim_scores.items():
            if ds:
                vr.dimension_stats[dim] = {
                    "mean": round(statistics.mean(ds), 2),
                    "std": round(statistics.stdev(ds) if len(ds) > 1 else 0.0, 2),
                    "median": statistics.median(ds),
                    "min": min(ds),
                    "max": max(ds),
                }

        return vr

    def print_report(
        self,
        vr: ValidationResult,
        examples: list[PatchExample],
        results: list[JudgeResult],
    ) -> str:
        """Generate a human-readable validation report."""
        lines = []
        lines.append("=" * 70)
        lines.append("  PatchJudge Validation Report")
        lines.append("=" * 70)

        lines.append(f"\n📊 Dataset: {vr.total_examples} examples")

        # Score distribution
        lines.append("\n📈 Score Distribution:")
        lines.append(f"  Mean:   {vr.score_mean:.1f}")
        lines.append(f"  Median: {vr.score_median:.1f}")
        lines.append(f"  Std:    {vr.score_std:.1f}")

        # Score histogram (10-point buckets; scores of exactly 100 — or
        # anything outside [0, 100) — fall into the top bucket via else)
        bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        scores = [r.merge_score for r in results]
        hist = defaultdict(int)
        for s in scores:
            for i in range(len(bins) - 1):
                if bins[i] <= s < bins[i + 1]:
                    hist[f"{bins[i]}-{bins[i + 1]}"] += 1
                    break
            else:
                hist["90-100"] += 1

        lines.append("\n  Score Distribution:")
        for label in [f"{bins[i]}-{bins[i + 1]}" for i in range(len(bins) - 1)]:
            count = hist.get(label, 0)
            bar = "█" * count
            lines.append(f"  {label:>7}: {bar} ({count})")

        # METR alignment
        lines.append("\n🎯 METR Alignment:")
        lines.append(
            f"  Test-passing patches below {self.merge_threshold}: "
            f"{vr.test_passing_below_50_pct:.1%}"
        )
        metr_target = 0.50
        if abs(vr.test_passing_below_50_pct - metr_target) < 0.15:
            lines.append("  ✅ ALIGNED with METR finding (~50% not merge-worthy)")
        elif vr.test_passing_below_50_pct < metr_target - 0.15:
            lines.append("  ⚠️ Too lenient — scoring too many patches as merge-worthy")
        else:
            lines.append("  ⚠️ Too harsh — scoring too many patches as not merge-worthy")

        # Resolved vs Unresolved
        lines.append("\n🔀 Resolved vs Unresolved Separation:")
        lines.append(f"  Mean score (resolved):   {vr.mean_score_resolved:.1f}")
        lines.append(f"  Mean score (unresolved): {vr.mean_score_unresolved:.1f}")
        lines.append(f"  Separation: {vr.mean_score_resolved - vr.mean_score_unresolved:+.1f}")
        lines.append(f"  Correlation: {vr.score_resolved_correlation:.3f}")

        # Known-bad detection
        if vr.known_bad_total > 0:
            lines.append("\n🚨 Known-Bad Pattern Detection:")
            lines.append(
                f"  Detected: {vr.known_bad_detected}/{vr.known_bad_total} "
                f"({vr.known_bad_detection_rate:.1%})"
            )
            if vr.known_bad_detection_rate >= 0.80:
                lines.append("  ✅ Good detection rate")
            else:
                lines.append("  ⚠️ Detection rate below 80% — judge may be too lenient")

        # Per-dimension stats
        lines.append("\n📐 Per-Dimension Scores:")
        for dim in ["correctness", "completeness", "code_quality",
                    "non_regression_risk", "merge_readiness"]:
            stats = vr.dimension_stats.get(dim, {})
            if stats:
                lines.append(
                    f"  {dim:>25}: "
                    f"mean={stats['mean']:.1f} "
                    f"std={stats['std']:.1f} "
                    f"[{stats['min']}-{stats['max']}]"
                )

        # Top flags (JUDGE_ERROR is excluded as an internal marker)
        all_flags = defaultdict(int)
        for r in results:
            for dim, data in r.dimension_scores.items():
                for flag in data.get("flags", []):
                    if flag and flag != "JUDGE_ERROR":
                        all_flags[flag] += 1

        if all_flags:
            lines.append("\n🏴 Most Common Flags:")
            for flag, count in sorted(all_flags.items(), key=lambda x: -x[1])[:10]:
                lines.append(f"  {count:>4}x {flag}")

        # Example best/worst
        scored = list(zip(examples, results))
        scored.sort(key=lambda x: x[1].merge_score, reverse=True)

        if len(scored) >= 3:
            lines.append("\n⭐ Top 3 Patches:")
            for ex, r in scored[:3]:
                lines.append(
                    f"  {r.merge_score:5.1f}  {ex.instance_id} "
                    f"({ex.agent_name}, {'PASS' if ex.test_passed else 'FAIL'})"
                )

            lines.append("\n💀 Bottom 3 Patches:")
            for ex, r in scored[-3:]:
                lines.append(
                    f"  {r.merge_score:5.1f}  {ex.instance_id} "
                    f"({ex.agent_name}, {'PASS' if ex.test_passed else 'FAIL'})"
                )

        lines.append("\n" + "=" * 70)

        report = '\n'.join(lines)
        return report
513
+
514
+
515
def run_full_validation(
    examples: list[PatchExample],
    results: list[JudgeResult],
    gold_data: Optional[list[dict]] = None,
    judge=None,
) -> tuple[ValidationResult, str]:
    """Run the complete validation pipeline.

    Args:
        examples: Judged patch examples.
        results: Judge results for those examples.
        gold_data: Gold standard data for generating known-bad patches.
        judge: PatchJudge instance (needed if judging known-bad patches).

    Returns:
        (ValidationResult, report_string)
    """
    known_bad_results = None

    # Known-bad patches are only generated and judged when both the gold
    # data and a judge instance are available.
    if gold_data and judge:
        logger.info("Generating known-bad patches...")
        bad_patches = KnownBadPatchGenerator.generate_all(gold_data)

        if bad_patches:
            logger.info(f"Judging {len(bad_patches)} known-bad patches...")
            judged_bad = judge.judge_batch(bad_patches, show_progress=True)
            known_bad_results = list(zip(bad_patches, judged_bad))

            # Log a short preview of how the known-bad patches scored.
            for bp, br in known_bad_results[:5]:
                logger.info(
                    f"  Known-bad [{bp.agent_name}] "
                    f"{bp.instance_id}: {br.merge_score:.1f}/100"
                )

    # Validate and render the report.
    validator = PatchJudgeValidator()
    vr = validator.validate(examples, results, known_bad_results)
    return vr, validator.print_report(vr, examples, results)