YUNTA88 committed on
Commit 269b7e3 · verified · 1 Parent(s): 2312636

Upload root_scripts/eval_phyx_50000_final.py with huggingface_hub

Files changed (1)
  1. root_scripts/eval_phyx_50000_final.py +258 -0
root_scripts/eval_phyx_50000_final.py ADDED
@@ -0,0 +1,258 @@
+ #!/usr/bin/env python3
+ """
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
+
+ Runs the Base and SFT models on the 1533-sample open-ended physics test set
+ and saves raw model outputs for later judging. (In this final version the
+ base-model pass is skipped in main(); see the SKIP BASE note there.)
+
+ Usage (inside Docker container):
+     cd /tmp && python3 /path/to/eval_openended_inference.py
+
+ Output:
+     sft_eval_footprint/inference_results_base.jsonl
+     sft_eval_footprint/inference_results_phyx_50000.jsonl
+ """
+ import os
+ import json
+ import time
+ import torch
+ import multiprocessing as mp
+ from collections import Counter
+
+ # ============ CONFIG ============
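+ # Force fully offline loading: with these flags set, transformers/huggingface_hub
+ # never touch the network, so all model files must already exist on local disk.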
+ os.environ["HF_HUB_OFFLINE"] = "1"
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
+
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_phyx_50000/final"
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
+
+ # Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
+ BASE_GPUS = [0, 1, 2, 3]
+ SFT_GPUS = [4, 5, 6, 7]
+ MAX_NEW_TOKENS = 2048
+ # ================================
+
+
+ def load_test_data():
+     """Load test samples from JSONL."""
+     samples = []
+     with open(TEST_FILE, 'r', encoding='utf-8') as f:
+         for line in f:
+             if line.strip():
+                 samples.append(json.loads(line))
+     return samples
+
+
+ def build_open_ended_prompt(sample):
+     """Build an open-ended prompt (no MCQ options)."""
+     desc = sample.get('description', '')
+     question = sample.get('question', '')
+
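+     # NOTE: in this f-string, {{}} renders as a literal {} and \\ as a single
+     # backslash, so the model actually sees: "within \boxed{}."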
+     prompt = f"""Look at the image and answer the physics question.
+
+ {desc}
+
+ {question}
+
+ Please reason step by step, and put your final answer within \\boxed{{}}.
+ """
+     return prompt.strip()
+
+
+ def worker_inference(gpu_id, model_path, samples, output_file, model_name):
+     """Worker: load model on specific GPU and run inference on assigned samples."""
+     import torch
+     from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+     from qwen_vl_utils import process_vision_info
+
+     device = f"cuda:{gpu_id}"
+     print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)
+
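+     # min/max_pixels bound Qwen2.5-VL's dynamic image resolution:
+     # 3136 = 56*56 and 200704 = 448*448 pixels, i.e. roughly 4-256 visual tokens.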
+     processor = AutoProcessor.from_pretrained(
+         model_path,
+         min_pixels=3136,
+         max_pixels=200704,
+         local_files_only=True,
+         trust_remote_code=True,
+     )
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         model_path,
+         torch_dtype=torch.bfloat16,
+         device_map=device,
+         local_files_only=True,
+         trust_remote_code=True,
+     )
+     model.eval()
+     print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)
+
+     results = []
+     for i, sample in enumerate(samples):
+         idx = sample['index']
+         prompt_text = build_open_ended_prompt(sample)
+         image_path = os.path.join(IMAGE_DIR, sample['image'])
+
+         # Build chat messages; qwen_vl_utils resolves the file:// image URI
+         messages = [{
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": f"file://{image_path}"},
+                 {"type": "text", "text": prompt_text},
+             ],
+         }]
+
+         try:
+             text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             image_inputs, video_inputs = process_vision_info(messages)
+             inputs = processor(
+                 text=[text],
+                 images=image_inputs,
+                 videos=video_inputs,
+                 padding=True,
+                 return_tensors="pt",
+             ).to(device)
+
+             with torch.no_grad():
+                 output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
+
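+             # Slice off the prompt tokens so only the newly generated text is decoded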
+             generated = output_ids[0][inputs.input_ids.shape[1]:]
+             response = processor.decode(generated, skip_special_tokens=True)
+         except Exception as e:
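+             # Record the failure inline so one bad sample doesn't kill the whole worker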
+             response = f"ERROR: {str(e)}"
+
+         result = {
+             "index": idx,
+             "category": sample['category'],
+             "subfield": sample.get('subfield', ''),
+             "question": sample['question'],
+             "ground_truth_value": sample['ground_truth_value'],
+             "ground_truth_letter": sample.get('ground_truth_letter', ''),
+             "model_output": response,
+             "model_name": model_name,
+             "gpu_id": gpu_id,
+         }
+         results.append(result)
+
+         if (i + 1) % 20 == 0 or (i + 1) == len(samples):
+             print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
+
+     # Write results
+     with open(output_file, 'w', encoding='utf-8') as f:
+         for r in results:
+             f.write(json.dumps(r, ensure_ascii=False) + '\n')
+
+     print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
+     return len(results)
+
+
+ def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
+     """Split samples across GPUs and run in parallel."""
+     n = len(samples)
+     k = len(gpu_ids)
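+     # Ceil division: k chunks of up to chunk_size samples cover all n samples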
+     chunk_size = (n + k - 1) // k
+
+     processes = []
+     output_files = []
+     for i, gpu_id in enumerate(gpu_ids):
+         chunk = samples[i * chunk_size: (i + 1) * chunk_size]
+         if not chunk:
+             continue
+         out_file = f"{output_base}_gpu{gpu_id}.jsonl"
+         output_files.append(out_file)
+         p = mp.Process(
+             target=worker_inference,
+             args=(gpu_id, model_path, chunk, out_file, model_name)
+         )
+         processes.append(p)
+
+     for p in processes:
+         p.start()
+     for p in processes:
+         p.join()
+
+     return output_files
+
+
+ def merge_results(output_files, final_output):
+     """Merge per-GPU result files into one."""
+     all_results = []
+     for f in output_files:
+         if os.path.exists(f):
+             with open(f, 'r', encoding='utf-8') as fh:
+                 for line in fh:
+                     if line.strip():
+                         all_results.append(json.loads(line))
+
+     # Sort by index for consistency
+     all_results.sort(key=lambda x: x['index'])
+
+     with open(final_output, 'w', encoding='utf-8') as f:
+         for r in all_results:
+             f.write(json.dumps(r, ensure_ascii=False) + '\n')
+
+     # Cleanup per-GPU files
+     for f in output_files:
+         if os.path.exists(f):
+             os.remove(f)
+
+     return all_results
+
+
+ def main():
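+     # 'spawn' is required here: each worker must initialize its own CUDA
+     # context, and forking a process after CUDA init is unsafe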
+     mp.set_start_method('spawn', force=True)
+     os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+     print("=" * 60)
+     print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
+     print(f" Base model: {BASE_MODEL}")
+     print(f" SFT model: {SFT_MODEL}")
+     print(f" Base GPUs: {BASE_GPUS}")
+     print(f" SFT GPUs: {SFT_GPUS}")
+     print("=" * 60)
+
+     # Load test data
+     samples = load_test_data()
+     print(f"\nLoaded {len(samples)} test samples")
+
+     cats = Counter(s['category'] for s in samples)
+     for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
+         print(f" {cat}: {cnt}")
+
+     # Run both models (each uses 4 GPUs internally for parallel inference)
+     t0 = time.time()
+
+     base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
+     sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000")
+
+     # Base-model inference on GPUs 0-3 is intentionally skipped in this final run
+     pass  # SKIP BASE
+
+     # Run SFT model on GPUs 4-7 (4 workers in parallel)
+     print("\n>>> Starting SFT model inference...", flush=True)
+     run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
+
+     # Merge per-GPU results (base_files stays unused while the base pass is skipped)
+     base_files = [f"{base_output}_gpu{g}.jsonl" for g in BASE_GPUS]
+     sft_files = [f"{sft_output}_gpu{g}.jsonl" for g in SFT_GPUS]
+
+     base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
+     sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000.jsonl")
+
+     base_results = []
+     sft_results = merge_results(sft_files, sft_final)
+
+     elapsed = time.time() - t0
+     print(f"\n{'=' * 60}")
+     print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
+     print(f" Base results: {len(base_results)} → {base_final}")
+     print(f" SFT results: {len(sft_results)} → {sft_final}")
+     print(f"{'=' * 60}")
+
+
+ if __name__ == '__main__':
+     main()
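
For the later judging phase (not part of this commit), the merged JSONL can be scanned for the final \boxed{...} answer in each model_output, since the prompt asks for exactly that format. Below is a minimal, hypothetical sketch of that parsing step; the helper name last_boxed is illustrative, while the file path and the index / model_output / ground_truth_value fields come from the script above.

# judge_sketch.py -- hypothetical helper, not part of this commit
import json

def last_boxed(text):
    """Return the content of the last \\boxed{...} in text, handling nested braces."""
    start = text.rfind("\\boxed{")
    if start == -1:
        return None
    i, depth = start + len("\\boxed{"), 1
    out = []
    while i < len(text) and depth:
        c = text[i]
        depth += (c == "{") - (c == "}")
        if depth:  # the final closing brace drops depth to 0 and is excluded
            out.append(c)
        i += 1
    return "".join(out)

with open("sft_eval_footprint/inference_results_phyx_50000.jsonl", encoding="utf-8") as f:
    for line in f:
        r = json.loads(line)
        pred = last_boxed(r["model_output"])
        print(r["index"], pred, "| gt:", r["ground_truth_value"])

Taking the last \boxed{} rather than the first matters for chain-of-thought outputs, where intermediate boxed expressions may appear before the final answer.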