Spaces:
Runtime error
Runtime error
# Copyright (c) OpenMMLab. All rights reserved. | |
from typing import List | |
import mmengine | |
from mmengine.dataset import BaseDataset | |
from mmengine.fileio import get_file_backend | |
from mmpretrain.registry import DATASETS | |
class MiniGPT4Dataset(BaseDataset): | |
"""Dataset for training MiniGPT4. | |
MiniGPT4 dataset directory: | |
minigpt4_dataset | |
โโโ image | |
โ โโโ id0.jpg | |
โ โโโ id1.jpg | |
โ โโโ id2.jpg | |
โ โโโ ... | |
โโโ conversation_data.json | |
The structure of conversation_data.json: | |
[ | |
// English data | |
{ | |
"id": str(id0), | |
"conversation": "###Ask: <Img><ImageHere></Img> [Ask content] | |
###Answer: [Answer content]" | |
}, | |
// Chinese data | |
{ | |
"id": str(id1), | |
"conversation": "###้ฎ๏ผ<Img><ImageHere></Img> [Ask content] | |
###็ญ๏ผ[Answer content]" | |
}, | |
... | |
] | |
Args: | |
data_root (str): The root directory for ``ann_file`` and ``image``. | |
ann_file (str): Conversation file path. | |
**kwargs: Other keyword arguments in :class:`BaseDataset`. | |
""" | |
def load_data_list(self) -> List[dict]: | |
file_backend = get_file_backend(self.data_root) | |
conversation_path = file_backend.join_path(self.data_root, | |
self.ann_file) | |
conversation = mmengine.load(conversation_path) | |
img_ids = {} | |
n = 0 | |
for conv in conversation: | |
img_id = conv['id'] | |
if img_id not in img_ids.keys(): | |
img_ids[img_id] = n | |
n += 1 | |
img_root = file_backend.join_path(self.data_root, 'image') | |
data_list = [] | |
for conv in conversation: | |
img_file = '{}.jpg'.format(conv['id']) | |
chat_content = conv['conversation'] | |
lang = 'en' if chat_content.startswith('###Ask: ') else 'zh' | |
data_info = { | |
'image_id': img_ids[conv['id']], | |
'img_path': file_backend.join_path(img_root, img_file), | |
'chat_content': chat_content, | |
'lang': lang, | |
} | |
data_list.append(data_info) | |
return data_list | |