File size: 703 Bytes
3db4484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import json
import os

from datasets import load_dataset
from tqdm import tqdm

# Export script: dump every sample of the DetailCaps-4870 parquet file to disk
# as an image file, and collect the three GT caption fields of each sample
# into a single JSON annotation file keyed by sample index.

# Alternative loader kept for reference (loads from the dataset directory):
# capture = load_dataset("/group/40005/public_datasets/DetailCaps-4870")
capture = load_dataset(
    "parquet",
    data_files={"test": "/group/40005/public_datasets/DetailCaps-4870/DetailCaps-4870.parquet"},
)['test']
print(len(capture))
save_dir = "/group/40005/auroraji/CAPTURE/samples"
# Create the output directory up front; without it the first image write
# fails with FileNotFoundError when the directory does not exist yet.
os.makedirs(save_dir, exist_ok=True)
anno = {}

# enumerate() yields a generator with no length, so tqdm needs an explicit
# total to display a percentage and ETA instead of a bare counter.
for i, instance in tqdm(enumerate(capture), total=len(capture)):
    img_binary = instance['binary']
    # Caption order is fixed: GPT-4V, GPT-4o, Gemini 1.5 Pro.
    anno[i] = [instance['GT_Caption_GPT4V'], instance['GT_Caption_GPT4O'], instance['GT_Caption_Gemini15Pro']]

    # The raw bytes are written verbatim under a .png name.
    # NOTE(review): assumes the stored binary is PNG-encoded -- confirm.
    with open(f"{save_dir}/{i}.png", "wb") as f:
        f.write(img_binary)

# Written to the current working directory (not save_dir). json.dump turns the
# integer sample indices into string keys ("0", "1", ...).
with open("annotations.json", "w") as f:
    json.dump(anno, f, indent=4)