File size: 753 Bytes
c1b68f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
import os

import pandas as pd
import torch
from datasets import load_dataset, get_dataset_split_names
from PIL import Image
# Export the first `index_range` samples of the VQAv2 validation split:
# each sample's image is written to images/<image_id>.jpg and the question
# metadata of all exported samples is written to vqa_samples.json.
dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation", cache_dir="cache", streaming=False)
index_range = 1000  # number of leading samples to export

# Image.save() does not create missing directories, so make sure it exists.
image_dir = 'images'
os.makedirs(image_dir, exist_ok=True)

columns = ['ques', 'label', 'q_id', 'img_path', 'question_type']
rows = []
saved_image_ids = set()

# Clamp to the split size so an oversized index_range cannot raise IndexError.
for idx in range(min(index_range, len(dataset))):
    sample = dataset[idx]
    img_id = sample['image_id']

    # Image ids may repeat across samples (presumably several questions per
    # image -- TODO confirm); write each image file only once per run.
    if img_id not in saved_image_ids:
        sample['image'].save(f'{image_dir}/{img_id}.jpg')
        saved_image_ids.add(img_id)

    # NOTE(review): the 'img_path' column stores the image id, not a file
    # path; kept as-is so the JSON schema is unchanged -- confirm with
    # whatever consumes vqa_samples.json.
    rows.append([
        sample['question'],
        sample['multiple_choice_answer'],
        sample['question_id'],
        img_id,
        sample['question_type'],
    ])

# Build the frame in one step; appending row-by-row with df.loc reallocates
# the frame on every iteration.
df = pd.DataFrame(rows, columns=columns)
df.to_json('vqa_samples.json', orient='columns')
|