# temp / read.py
# jiyatai's picture
# Upload ./read.py with huggingface_hub
# 3db4484 verified
import json
import os

from datasets import load_dataset
from tqdm import tqdm
# Export script: dumps every image of the "test" split to <save_dir>/<index>.png
# and collects the three reference captions per sample into annotations.json
# (written to the current working directory).

# capture = load_dataset("/group/40005/public_datasets/DetailCaps-4870")
# Read the parquet file directly and expose it as a single "test" split.
capture = load_dataset("parquet", data_files={"test": "/group/40005/public_datasets/DetailCaps-4870/DetailCaps-4870.parquet"})['test']
print(len(capture))

save_dir = "/group/40005/auroraji/CAPTURE/samples"
# Create the output directory up front; otherwise the first image write
# fails with FileNotFoundError when the directory does not exist yet.
os.makedirs(save_dir, exist_ok=True)

# Sample index -> [GPT-4V caption, GPT-4O caption, Gemini-1.5-Pro caption],
# in that fixed order.
anno = {}

# tqdm wraps the dataset itself (not the enumerate object) so it can read
# len(capture) and display a total and ETA.
for i, instance in enumerate(tqdm(capture)):
    # Raw image bytes, written verbatim below.
    # NOTE(review): assumes the bytes are already PNG-encoded - confirm.
    img_binary = instance['binary']
    anno[i] = [instance['GT_Caption_GPT4V'], instance['GT_Caption_GPT4O'], instance['GT_Caption_Gemini15Pro']]
    with open(f"{save_dir}/{i}.png", "wb") as f:
        f.write(img_binary)

# json.dump serializes the integer keys as strings ("0", "1", ...).
with open("annotations.json", "w") as f:
    json.dump(anno, f, indent=4)