VD10 committed
Commit 37896ae · verified · 1 Parent(s): d73c869

Upload run_patchjudge.py with huggingface_hub

Files changed (1)
  1. run_patchjudge.py +302 -0
run_patchjudge.py ADDED
@@ -0,0 +1,302 @@
+ #!/usr/bin/env python3
+ """PatchJudge — Main runner script.
+
+ Runs the full PatchJudge pipeline:
+ 1. Load SWE-bench Verified + agent patches
+ 2. Extract features
+ 3. Judge patches with LLM
+ 4. Validate results
+ 5. Save everything
+ """
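+
+ # Example invocations (a sketch; all flags are defined in main() below):
+ #   python run_patchjudge.py                       # full pipeline, defaults
+ #   python run_patchjudge.py --sources coderforge --judge-count 20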
+
+ import argparse
+ import json
+ import logging
+ import os
+ import statistics
+ import sys
+ import time
+ from pathlib import Path
+ from collections import defaultdict
+
+ # Setup
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ )
+ logger = logging.getLogger("patchjudge")
+
+
+ def run_data_loading(args):
+     """Task 1: Load and prepare the dataset."""
+     from patchjudge.data_loader import SWEBenchLoader, get_diff_stats
+
+     print("\n" + "=" * 70)
+     print(" Task 1: Data Loading & SWE-bench Setup")
+     print("=" * 70)
+
+     loader = SWEBenchLoader(cache_dir=args.data_dir)
+
+     # Load gold data
+     gold = loader.load_gold_data()
+     print(f"\n✅ Loaded {len(gold)} SWE-bench Verified instances")
+
+     # Load agent patches from HF datasets
+     sources = args.sources.split(",") if args.sources else ["coderforge", "o1"]
+     examples = loader.build_dataset(sources=sources)
+
+     # Print stats
+     passed = sum(1 for e in examples if e.test_passed)
+     failed = len(examples) - passed
+     repos = set(e.repo for e in examples)
+     agents = set(e.agent_name for e in examples)
+     instances = set(e.instance_id for e in examples)
+
+     print("\n📊 Dataset Summary:")
+     print(f"  Total examples: {len(examples)}")
+     print(f"  Test passed: {passed} ({passed/len(examples):.1%})")
+     print(f"  Test failed: {failed} ({failed/len(examples):.1%})")
+     print(f"  Unique instances: {len(instances)}")
+     print(f"  Unique repos: {len(repos)}")
+     print(f"  Agent sources: {agents}")
+
+     # Difficulty distribution
+     diff_counts = defaultdict(int)
+     for e in examples:
+         diff_counts[e.difficulty or "unknown"] += 1
+     print("\n  Difficulty:")
+     for d, c in sorted(diff_counts.items()):
+         print(f"    {d}: {c}")
+
+     # Repo distribution (top 10)
+     repo_counts = defaultdict(int)
+     for e in examples:
+         repo_counts[e.repo] += 1
+     print("\n  Top repos:")
+     for repo, c in sorted(repo_counts.items(), key=lambda x: -x[1])[:10]:
+         print(f"    {repo}: {c}")
+
+     # Diff stats summary
+     print("\n  Patch size stats (agent patches):")
+     all_stats = [get_diff_stats(e.agent_patch) for e in examples]
+     for key in ["lines_added", "lines_removed", "files_changed", "hunks"]:
+         values = [s[key] for s in all_stats]
+         if values:
+             print(f"    {key}: mean={statistics.mean(values):.1f}, "
+                   f"median={statistics.median(values):.0f}, "
+                   f"max={max(values)}")
+
+     # Save
+     path = loader.save_dataset(examples)
+     print(f"\n💾 Saved to: {path}")
+
+     return examples, gold
+
+
+ def run_feature_extraction(examples, args):
+     """Task 2: Extract features from all patches."""
+     from patchjudge.feature_extractor import FeatureExtractor, extract_features_batch
+
+     print("\n" + "=" * 70)
+     print(" Task 2: Feature Extraction")
+     print("=" * 70)
+
+     results = extract_features_batch(examples, show_progress=True)
+     features_list = [f for _, f in results]
+
+     # Aggregate feature stats
+     print(f"\n📐 Feature Summary ({len(features_list)} patches):")
+
+     bool_features = [
+         'has_error_handling', 'has_edge_case_handling', 'has_todos',
+         'has_hardcoded_values', 'has_debug_statements', 'modifies_core_files',
+         'has_imports_added', 'touches_tests',
+     ]
+
+     for feat in bool_features:
+         count = sum(1 for f in features_list if getattr(f, feat))
+         print(f"  {feat:>30}: {count}/{len(features_list)} ({count/len(features_list):.1%})")
+
+     # Scope distribution
+     scope_counts = defaultdict(int)
+     for f in features_list:
+         scope_counts[f.change_scope] += 1
+     print("\n  Change scope:")
+     for scope, c in sorted(scope_counts.items()):
+         print(f"    {scope}: {c}")
+
+     # Keyword coverage
+     coverages = [f.keyword_coverage_ratio for f in features_list]
+     if coverages:
+         print(f"\n  Keyword coverage: "
+               f"mean={statistics.mean(coverages):.2f}, "
+               f"median={statistics.median(coverages):.2f}")
+
+     # Save features
+     features_path = Path(args.data_dir) / "features.jsonl"
+     with open(features_path, 'w') as f:
+         for ex, feat in results:
+             f.write(json.dumps({
+                 "instance_id": ex.instance_id,
+                 "agent_name": ex.agent_name,
+                 "features": feat.to_dict(),
+             }) + "\n")
+     print(f"\n💾 Features saved to: {features_path}")
+
+     return features_list
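+
+ # Sketch of one features.jsonl line written above (keys match the json.dumps
+ # call; the values shown here are hypothetical):
+ #   {"instance_id": "<repo>__<id>", "agent_name": "coderforge",
+ #    "features": {"has_error_handling": true, "change_scope": "...", ...}}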
+
+
+ def run_judging(examples, features_list, args):
+     """Task 3: LLM Judge evaluation."""
+     from patchjudge.judge import PatchJudge
+
+     print("\n" + "=" * 70)
+     print(" Task 3: LLM Judge Evaluation")
+     print("=" * 70)
+
+     # Select subset for judging
+     n = min(args.judge_count, len(examples))
+
+     # Ensure a mix of passed and failed patches
+     passed = [i for i, e in enumerate(examples) if e.test_passed]
+     failed = [i for i, e in enumerate(examples) if not e.test_passed]
+
+     # Take a roughly 60/40 passed/failed split
+     n_passed = min(len(passed), int(n * 0.6))
+     n_failed = min(len(failed), n - n_passed)
+     n_passed = n - n_failed  # Adjust if not enough failed
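+     # Worked example (hypothetical counts): with n=50, the target split is
+     # n_passed=30, n_failed=20; if only 12 failed examples exist, n_failed=12
+     # and n_passed grows to 38. Because n <= len(passed) + len(failed), the
+     # adjusted n_passed never exceeds len(passed).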
+
+     selected_idx = passed[:n_passed] + failed[:n_failed]
+     selected_examples = [examples[i] for i in selected_idx]
+     selected_features = [features_list[i] for i in selected_idx] if features_list else None
+
+     print(f"\n🔍 Judging {len(selected_examples)} patches "
+           f"({n_passed} passed, {n_failed} failed)")
+     print(f"  Model: {args.model_id}")
+
+     judge = PatchJudge(
+         model_id=args.model_id,
+         temperature=0.1,
+         max_tokens=2000,
+     )
+
+     start = time.time()
+     results = judge.judge_batch(
+         selected_examples,
+         selected_features,
+         show_progress=True,
+     )
+     elapsed = time.time() - start
+
+     print(f"\n⏱️ Judging complete in {elapsed:.1f}s "
+           f"({elapsed/len(selected_examples):.1f}s per patch)")
+
+     # Save results
+     results_path = Path(args.data_dir) / "judge_results.jsonl"
+     with open(results_path, 'w') as f:
+         for ex, r in zip(selected_examples, results):
+             f.write(json.dumps({
+                 "instance_id": ex.instance_id,
+                 "agent_name": ex.agent_name,
+                 "test_passed": ex.test_passed,
+                 "merge_score": r.merge_score,
+                 "dimension_scores": r.dimension_scores,
+                 "model_used": r.model_used,
+             }) + "\n")
+     print(f"💾 Results saved to: {results_path}")
+
+     return selected_examples, results, judge
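+
+ # Assumption worth noting: judge.judge_batch is expected to return one result
+ # per selected example, in the same order; the zip() above that writes
+ # judge_results.jsonl relies on that pairing.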
+
+
+ def run_validation(examples, results, gold_data, judge, args):
+     """Task 4: Validate PatchJudge against ground truth."""
+     from patchjudge.validation import run_full_validation
+
+     print("\n" + "=" * 70)
+     print(" Task 4: Validation")
+     print("=" * 70)
+
+     gold_list = list(gold_data.values())[:50] if gold_data else None
+
+     vr, report = run_full_validation(
+         examples=examples,
+         results=results,
+         gold_data=gold_list,
+         judge=judge if args.validate_known_bad else None,
+     )
+
+     print(report)
+
+     # Save validation results
+     val_path = Path(args.data_dir) / "validation_results.json"
+     with open(val_path, 'w') as f:
+         json.dump(vr.to_dict(), f, indent=2)
+     print(f"\n💾 Validation results saved to: {val_path}")
+
+     # Save full report
+     report_path = Path(args.data_dir) / "validation_report.txt"
+     with open(report_path, 'w') as f:
+         f.write(report)
+     print(f"💾 Report saved to: {report_path}")
+
+     return vr
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="PatchJudge - Post-Test Code Quality Scorer")
+     parser.add_argument("--data-dir", default="data", help="Data directory")
+     parser.add_argument("--sources", default="coderforge,o1",
+                         help="Comma-separated data sources: coderforge,o1,s3")
+     parser.add_argument("--model-id", default="Qwen/Qwen2.5-Coder-32B-Instruct",
+                         help="LLM model for judging")
+     parser.add_argument("--judge-count", type=int, default=50,
+                         help="Number of patches to judge")
+     parser.add_argument("--validate-known-bad", action="store_true",
+                         help="Also generate and judge known-bad patches for validation")
+     parser.add_argument("--tasks", default="1,2,3,4",
+                         help="Comma-separated task numbers to run (1=load, 2=features, 3=judge, 4=validate)")
+     parser.add_argument("--load-cached", action="store_true",
+                         help="Load previously saved dataset instead of re-downloading")
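+
+     # Note: tasks 2-4 only run when examples have been loaded, so keep task 1
+     # in --tasks (with --load-cached to reuse a saved dataset), e.g.:
+     #   python run_patchjudge.py --load-cached --tasks 1,3,4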
+
+     args = parser.parse_args()
+     tasks = [int(t) for t in args.tasks.split(",")]
+
+     os.makedirs(args.data_dir, exist_ok=True)
+
+     examples = None
+     features_list = None
+     results = None
+     gold_data = None
+     judge = None
+
+     # Task 1: Data Loading
+     if 1 in tasks:
+         if args.load_cached:
+             from patchjudge.data_loader import SWEBenchLoader
+             loader = SWEBenchLoader(cache_dir=args.data_dir)
+             examples = loader.load_saved_dataset()
+             gold_data = loader.load_gold_data()
+         else:
+             examples, gold_data = run_data_loading(args)
+
+     # Task 2: Feature Extraction
+     if 2 in tasks and examples:
+         features_list = run_feature_extraction(examples, args)
+
+     # Task 3: LLM Judging
+     if 3 in tasks and examples:
+         if features_list is None:
+             # Extract features first
+             features_list = run_feature_extraction(examples, args)
+         examples, results, judge = run_judging(examples, features_list, args)
+
+     # Task 4: Validation
+     if 4 in tasks and results:
+         run_validation(examples, results, gold_data, judge, args)
+
+     print("\n✅ PatchJudge pipeline complete!")
+
+
+ if __name__ == "__main__":
+     main()