class DatasetCatalog:
    """Registry of dataset configs. Each attribute maps a dataset name to a
    dict holding a dotted "target" class path and the "params" passed to
    that class's constructor."""

    def __init__(self):
        # The following datasets are used for encoding-side alignment learning.
        self.audiocap_enc = {
            "target": "dataset.audiocap_dataset.AudioCapDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/audiocap/audiocap.json",
                mm_root_path="../data/T-X_pair_data/audiocap/audios",
                embed_path="../data/embed/",
                dataset_type="AudioToText",
            ),
        }

        self.webvid_enc = {
            "target": "dataset.webvid_dataset.WebvidDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/webvid/webvid.json",
                mm_root_path="../data/T-X_pair_data/webvid/videos",
                embed_path="../data/embed/",
                dataset_type="VideoToText",
            ),
        }

        self.cc3m_enc = {
            "target": "dataset.cc3m_dataset.CC3MDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/cc3m/cc3m.json",
                mm_root_path="../data/T-X_pair_data/cc3m/images",
                embed_path="../data/embed/",
                dataset_type="ImageToText",
            ),
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        # The following datasets are used for decoding-side alignment learning.

        self.audiocap_dec = {
            "target": "dataset.audiocap_dataset.AudioCapDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/audiocap/audiocap.json",
                mm_root_path="../data/T-X_pair_data/audiocap/audios",
                embed_path="../data/embed/",
                dataset_type="TextToAudio",
            ),
        }

        self.webvid_dec = {
            "target": "dataset.webvid_dataset.WebvidDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/webvid/webvid.json",
                mm_root_path="../data/T-X_pair_data/webvid/videos",
                embed_path="../data/embed/",
                dataset_type="TextToVideo",
            ),
        }

        self.cc3m_dec = {
            "target": "dataset.cc3m_dataset.CC3MDataset",
            "params": dict(
                data_path="../data/T-X_pair_data/cc3m/cc3m.json",
                mm_root_path="../data/T-X_pair_data/cc3m/images",
                embed_path="../data/embed/",
                dataset_type="TextToImage",
            ),
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        # The following datasets are used for instruction tuning.
        self.audio_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/audio_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToAudio",
            ),
        }

        self.video_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/video_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToVideo",
            ),
        }

        self.image_instruction = {
            "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T-T+X_data/image_t2x.json",
                embed_path="./embed/",
                dataset_type="TextToImage",

            ),
        }

        self.llava_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/llava/llava.json",
                mm_root_path="../data/IT_data/T+X-T_data/llava/images",
                dataset_type="ImageToText",
            ),
        }

        self.alpaca_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json",
                dataset_type="TextToText",
            ),
        }

        self.videochat_instruction = {
            "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
            "params": dict(
                data_path="../data/IT_data/T+X-T_data/videochat/videochat.json",
                dataset_type="VideoToText",
            ),
        }
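

# --------------------------------------------------------------------------- #
# Illustrative usage, not part of the original catalog: a minimal sketch of
# how an entry might be consumed, resolving the dotted "target" path with
# importlib and instantiating the class with "params". The helper name
# `build_dataset` is an assumption, not the repo's actual loader.

import importlib


def build_dataset(entry):
    """Resolve entry["target"] to a class and construct it with entry["params"]."""
    module_name, cls_name = entry["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**entry["params"])


if __name__ == "__main__":
    catalog = DatasetCatalog()
    # e.g., build the AudioCaps dataset used for encoding-side alignment
    audiocap_train = build_dataset(catalog.audiocap_enc)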