File size: 753 Bytes
c1b68f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import os

import pandas as pd
import torch
from datasets import load_dataset, get_dataset_split_names
from PIL import Image
# Export the first `index_range` samples of the VQAv2 validation split:
# each sample's image is written to `images/<image_id>.jpg` and the
# per-question metadata is collected into `vqa_samples.json`.
dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation", cache_dir="cache", streaming=False)

index_range = 1000
image_dir = 'images'

# `img.save` does not create missing parent directories, so make sure the
# output folder exists before the first write.
os.makedirs(image_dir, exist_ok=True)

# Never index past the end of the split if it holds fewer rows than requested.
num_samples = min(index_range, len(dataset))

# Output columns. NOTE(review): 'img_path' actually stores the image id (the
# file on disk is `images/<image_id>.jpg`); the column name is kept so the
# JSON layout stays compatible with existing consumers.
columns = ['ques', 'label', 'q_id', 'img_path', 'question_type']

rows = []
saved_image_ids = set()  # image ids already written to disk during this run

for idx in range(num_samples):
    sample = dataset[idx]
    img_id = sample['image_id']

    # Several questions may refer to the same image; encode and write each
    # image only once per run.
    if img_id not in saved_image_ids:
        sample['image'].save(f'{image_dir}/{img_id}.jpg')
        saved_image_ids.add(img_id)

    rows.append([
        sample['question'],
        sample['multiple_choice_answer'],
        sample['question_id'],
        img_id,
        sample['question_type'],
    ])

# Build the frame in one go instead of growing it row by row with `df.loc`,
# which re-allocates on every insertion.
df = pd.DataFrame(rows, columns=columns)

df.to_json('vqa_samples.json', orient='columns')