class DatasetCatalog:
    """Registry of dataset configurations for NExT-GPT training.

    Each attribute maps a dataset name to a config dict: "target" holds the
    import path of the dataset class, and "params" holds the keyword
    arguments passed to its constructor.
    """

    def __init__(self):
        # The following datasets are used for encoding-side alignment learning.
        self.audiocap_enc = {
            "target": "dataset.audiocap_dataset.AudioCapDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/audiocap/audiocap.json",
                mm_root_path="../data/T-X_pair_data/audiocap/audios",
                embed_path="../data/embed/",
                dataset_type="AudioToText",
            ),
        }
        self.webvid_enc = {
            "target": "dataset.webvid_dataset.WebvidDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/webvid/webvid.json",
                mm_root_path="../data/T-X_pair_data/webvid/videos",
                embed_path="../data/embed/",
                dataset_type="VideoToText",
            ),
        }
        self.cc3m_enc = {
            "target": "dataset.cc3m_dataset.CC3MDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/cc3m/cc3m.json",
                mm_root_path="../data/T-X_pair_data/cc3m/images",
                embed_path="../data/embed/",
                dataset_type="ImageToText",
            ),
        }
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following datasets are used for decoding-side alignment learning.
        self.audiocap_dec = {
            "target": "dataset.audiocap_dataset.AudioCapDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/audiocap/audiocap.json",
                mm_root_path="../data/T-X_pair_data/audiocap/audios",
                embed_path="../data/embed/",
                dataset_type="TextToAudio",
            ),
        }
        self.webvid_dec = {
            "target": "dataset.webvid_dataset.WebvidDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/webvid/webvid.json",
                mm_root_path="../data/T-X_pair_data/webvid/videos",
                embed_path="../data/embed/",
                dataset_type="TextToVideo",
            ),
        }
        self.cc3m_dec = {
            "target": "dataset.cc3m_dataset.CC3MDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/cc3m/cc3m.json",
                mm_root_path="../data/T-X_pair_data/cc3m/images",
                embed_path="../data/embed/",
                dataset_type="TextToImage",
            ),
        }
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following datasets are used for instruction tuning.
        self.audio_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/audio_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToAudio",
            ),
        }
        self.video_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/video_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToVideo",
            ),
        }
        self.image_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/image_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToImage",
            ),
        }
        self.llava_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/llava/llava.json",
                mm_root_path="../data/IT_data/T+X-T_data/llava/images",
                dataset_type="ImageToText",
            ),
        }
        self.alpaca_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json",
                dataset_type="TextToText",
            ),
        }
        self.videochat_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/videochat/videochat.json",
                dataset_type="VideoToText",
            ),
        }
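

# A minimal sketch of how a catalog entry might be consumed, assuming the
# training code resolves the "target" import path with importlib and passes
# "params" to the class constructor. `instantiate_from_target` is a
# hypothetical helper for illustration, not the repository's actual loader.
import importlib


def instantiate_from_target(config: dict):
    """Import the class named by config["target"] and build it with config["params"]."""
    module_path, class_name = config["target"].rsplit(".", 1)
    dataset_cls = getattr(importlib.import_module(module_path), class_name)
    return dataset_cls(**config.get("params", {}))


# Usage (hypothetical): build the CC3M encoding-side dataset from the catalog.
#   catalog = DatasetCatalog()
#   cc3m = instantiate_from_target(catalog.cc3m_enc)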