"""Export a slice of the VQAv2 validation split to local files.

For each of the first ``index_range`` samples the script saves the image as
``images/<image_id>.jpg`` and records the question, answer and ids in a table
that is finally written to ``vqa_samples.json``.
"""
import os

from PIL import Image
import torch
from datasets import load_dataset, get_dataset_split_names
import pandas as pd

dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation", cache_dir="cache", streaming=False)

# Number of leading samples to export; adjust to export more or fewer rows.
index_range = 1000

# Image.save() does not create missing parent directories, so make sure the
# output folder exists before the first write.
os.makedirs('images', exist_ok=True)

# Never index past the end of the split if it holds fewer rows than requested.
num_samples = min(index_range, len(dataset))

# Collect rows in a plain list and build the DataFrame once at the end;
# appending through df.loc inside the loop grows the frame row by row.
rows = []
for idx in range(num_samples):
    sample = dataset[idx]
    ques = sample['question']
    img = sample['image']
    img_id = sample['image_id']
    img.save(f'images/{img_id}.jpg')
    label = sample['multiple_choice_answer']
    q_id = sample['question_id']
    q_type = sample['question_type']
    # NOTE(review): the 'img_path' column stores the bare image id, not a
    # filesystem path; the file itself lives at images/<img_id>.jpg. Kept
    # as-is so existing consumers of vqa_samples.json keep working.
    rows.append([ques, label, q_id, img_id, q_type])

df = pd.DataFrame(rows, columns=['ques', 'label', 'q_id', 'img_path', 'question_type'])
df.to_json('vqa_samples.json', orient='columns')