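"""Hugging Face `datasets` loading script for AI2D (https://allenai.org/data/diagrams).

Builds a multiple-choice diagram QA dataset (question, options, answer,
image) restricted to the Qwen-VL test split, then pushes it to the Hub.
The /path/to/... locations below are placeholders for local copies of the
data and must be edited before running.
"""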
import json
import os

import datasets
from PIL import Image

_CITATION = """https://allenai.org/data/diagrams"""

_DESCRIPTION = "AI2D is a dataset of illustrative diagrams for research on diagram understanding and associated question answering."


def get_builder_config(version):
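    """Return the single "ai2d" BuilderConfig used by this script."""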
    return [
        datasets.BuilderConfig(
            name="ai2d",
            version=version,
            description="ai2d",
        )
    ]


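# Schema of one example: the question text, its candidate answer texts, the
# correct answer (as a string), and the diagram image.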
dataset_features = {
    "question": datasets.Value("string"),
    "options": datasets.features.Sequence(datasets.Value("string")),
    "answer": datasets.Value("string"),
    "image": datasets.Image(),
}


class AI2D(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = get_builder_config(VERSION)

    def _info(self):
        features = datasets.Features(dataset_features)
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
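        # Hard-coded local paths: the AI2D images, the per-image question
        # JSON files, and the Qwen-VL manifest that lists the question ids
        # of the test split.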
        image_path = "/path/to/ai2d/images"
        annotation_path = "/path/to/ai2d/questions"
        test_annotation_path = "/path/to/Qwen-VL/data/ai2diagram/test.jsonl"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"annotation": annotation_path, "images": image_path, "test_annotation": test_annotation_path},
            ),
        ]

    def _generate_examples(self, annotation, images, test_annotation):
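        # The Qwen-VL manifest is JSON Lines; collect the ids of the
        # questions that make up the test split.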
        with open(test_annotation, encoding="utf-8") as f:
            test_annotation = [json.loads(line) for line in f]
        test_qn_ids = {x["question_id"] for x in test_annotation}
        index = -1

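        # Each annotation file describes one image. Expected layout (a
        # sketch inferred from the fields read below):
        #   {"imageName": "...png",
        #    "questions": {"<question text>": {"questionId": ...,
        #                                      "answerTexts": [...],
        #                                      "correctAnswer": ...}}}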
        for sub_annotation in os.listdir(annotation):
            sub_annotation = os.path.join(annotation, sub_annotation)
            with open(sub_annotation, encoding="utf-8") as f:
                data = json.load(f)
            image = data["imageName"]
            image_path = os.path.join(images, image)
            for question in data["questions"]:
                # "questions" maps question text to its record, so the
                # record is looked up with the `question` key.
                if data["questions"][question]["questionId"] in test_qn_ids:
                    index += 1
                    options = data["questions"][question]["answerTexts"]
                    # correctAnswer is an index into answerTexts; cast it to
                    # str to match the declared "answer" string feature.
                    answer = str(data["questions"][question]["correctAnswer"])

                    now_data = {
                        "image": Image.open(image_path),
                        "question": question,
                        "answer": answer,
                        "options": options,
                    }
                    yield index, now_data


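# Build the dataset by pointing load_dataset at this script, then upload it
# to the Hugging Face Hub.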
if __name__ == "__main__":
    from datasets import load_dataset

    data = load_dataset(
        "/path/to/lmms-eval/lmms_eval/tasks/ai2d/upload_ai2d.py",
    )
    data.push_to_hub("lmms-lab/ai2d", private=True)