import os

import pandas as pd
import torch
from datasets import load_dataset, get_dataset_split_names
from PIL import Image
# Export the first `index_range` samples of the VQAv2 validation split:
# each sample's image is written to images/<image_id>.jpg and the question
# metadata is collected into vqa_samples.json.

# Non-streaming load so samples can be accessed by integer index; downloaded
# files are cached under ./cache.
dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation", cache_dir="cache", streaming=False)

# Number of leading samples to export.
index_range = 1000

# Image.save() does not create missing directories, so make sure the output
# folder exists before the loop starts.
os.makedirs('images', exist_ok=True)

# NOTE(review): the 'img_path' column actually stores the image id, not a
# path; the file itself lives at images/<img_id>.jpg. Kept as-is so existing
# consumers of vqa_samples.json keep working.
columns = ['ques', 'label', 'q_id', 'img_path', 'question_type']

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row with .loc reallocates on every insert.
rows = []

# VQAv2 asks several questions about the same picture, so an image_id can
# recur; remember which ones were already written during this run.
saved_image_ids = set()

# Clamp to the dataset size so a large index_range cannot raise IndexError.
for idx in range(min(index_range, len(dataset))):
    sample = dataset[idx]
    img_id = sample['image_id']

    if img_id not in saved_image_ids:
        sample['image'].save(f'images/{img_id}.jpg')
        saved_image_ids.add(img_id)

    rows.append([
        sample['question'],
        sample['multiple_choice_answer'],
        sample['question_id'],
        img_id,
        sample['question_type'],
    ])

# Passing `columns` explicitly keeps the schema even when no rows were collected.
df = pd.DataFrame(rows, columns=columns)
df.to_json('vqa_samples.json', orient='columns')