# rl4phyx-backup / root_scripts / make_val_parquet.py
# Uploaded by YUNTA88: "Upload root_scripts/make_val_parquet.py with huggingface_hub"
# Commit 7735006 (verified)
import json, pandas as pd, os
# Input: one JSON object per line, as produced by the base-model inference run.
src = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/inference_results_base.jsonl"
# Use absolute path for test images
IMAGE_BASE = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
out_path = "/workspace/rl4phyx/RL4Phyx/oneshot/validation_data/metaphyx_oe_1533.parquet"
# Row positions whose image paths are spot-checked after writing. Positions
# beyond the end of the written table are skipped rather than raising.
SPOT_CHECK_INDICES = (0, 100, 500, 1000, 1532)


def load_records(path):
    """Return the JSON objects stored one per line in *path*.

    Blank (whitespace-only) lines are skipped. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform default encoding.

    Raises:
        OSError: if *path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_row(record, image_base=IMAGE_BASE):
    """Convert one inference-result record into a validation-set row.

    Args:
        record: dict read from the JSONL file. The keys ``index``,
            ``category``, ``subfield``, ``ground_truth_value`` and
            ``question`` are read; each falls back to a default when absent.
        image_base: directory joined with ``"<index>.png"`` to form the
            image path stored in ``extra_info``.

    Returns:
        A dict with the keys ``data_source``, ``prompt``, ``ability``,
        ``reward_model`` and ``extra_info``.
    """
    # NOTE(review): a record without "index" falls back to 0 and therefore
    # points at "0.png" - confirm every input record carries an index.
    idx = record.get("index", 0)
    cat = record.get("category", "unknown")
    subfield = record.get("subfield", "")
    gt_value = str(record.get("ground_truth_value", "")).strip()
    question = record.get("question", "")
    prompt_text = f"Look at the image and answer the physics question.\n\n{question}\n\nPlease reason step by step and put your final answer (with units if applicable) in \\boxed{{}}."
    # Use absolute path for image
    abs_image_path = os.path.join(image_base, f"{idx}.png")
    return {
        "data_source": "metaphyx_physics",
        "prompt": [{"content": prompt_text, "role": "user"}],
        "ability": "physics",
        "reward_model": {"ground_truth": gt_value, "style": "rule"},
        "extra_info": {
            "category": cat,
            "subfield": subfield,
            "index": idx,
            "image_path": abs_image_path,
            "split": "test",
        },
    }


def spot_check_positions(n_rows, candidates=SPOT_CHECK_INDICES):
    """Return the entries of *candidates* that are valid positions in a table of *n_rows* rows."""
    return [i for i in candidates if 0 <= i < n_rows]


def main():
    """Build the validation parquet from the JSONL file and print a verification summary."""
    rows = [build_row(r) for r in load_records(src)]
    df = pd.DataFrame(rows)

    # Create the destination directory so the write does not fail when the
    # directory has not been created yet.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(out_path, index=False)

    # Verify: read the file back rather than trusting the in-memory frame.
    df2 = pd.read_parquet(out_path)
    print(f"Saved: {out_path}")
    print(f"Shape: {df2.shape}")
    if len(df2) == 0:
        # Nothing to index into; iloc[0] would raise IndexError.
        print("No rows written; skipping image checks.")
        return

    # Check first image exists
    img0 = df2.iloc[0]["extra_info"]["image_path"]
    print(f"First image: {img0}")
    print(f"Exists: {os.path.exists(img0)}")
    # Check a few more
    for i in spot_check_positions(len(df2)):
        ip = df2.iloc[i]["extra_info"]["image_path"]
        print(f" [{i}] {os.path.basename(ip)}: exists={os.path.exists(ip)}")


if __name__ == "__main__":
    main()