# File size: 7,080 Bytes
# cb2428f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS


def download_single_example(file_id: str | None = None):
    """
    Fetch one interaction example from S3 (~50-100MB).

    A lightweight way to look at the data layout. When no ``file_id`` is
    given, a single ID is sampled at random with ``preferred_vendors_only``
    enabled (the vendors with smaller files, V00 and V01).

    :param file_id: File ID to download; None samples one automatically
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="improvised",
            split="dev",
            preferred_vendors_only=True,
        )
    )

    if file_id is None:
        # Nothing requested explicitly: draw one random ID and use it.
        sampled = fs.sample_random_file_ids(num_samples=1)
        file_id = sampled[0]
        print(f"🎲 Auto-sampled file ID: {file_id}")

    # Fetch the data for the chosen file ID.
    fs.gather_file_id_data_from_s3(file_id)
    print(f"✅ Downloaded single example: {file_id}")


def download_interaction_pair(
    interaction_key: str | None = None,
    num_pairs: int = 4,
    local_dir: str = "/root/autodl-tmp/seamless",
):
    """
    Download both participants of one or more interactions.

    Ideal for studying conversational dynamics between participants.
    With an ``interaction_key``, the pair for that interaction is looked up
    and downloaded. Without one, ``num_pairs`` pairs are auto-sampled and all
    of their file IDs are downloaded in a single batch. The original size
    estimate was ~100-200MB for one pair, so expect proportionally more when
    several pairs are sampled.

    :param interaction_key: Interaction key (V00_S0809_I00000582) or None to auto-sample
    :param num_pairs: Number of pairs to request when auto-sampling; ignored
        when ``interaction_key`` is given
    :param local_dir: Value passed to ``DatasetConfig`` as ``local_dir``.
        NOTE(review): the default is a machine-specific path kept only for
        backward compatibility — callers on other machines should override it.
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    if interaction_key is None:
        # Auto-sample interaction pairs from preferred vendors
        # (presumably distinct pairs, per the original comment — TODO confirm).
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        print(f"✅ 获取了{len(pairs)}个pairs (共{len(pairs)*2}个file IDs)")

        # Flatten the pairs into one list of file IDs for the batch download.
        # No "download complete" message here: nothing has been downloaded
        # yet, and the list may still turn out to be empty.
        file_ids = [fid for pair in pairs for fid in pair]
    else:
        # Use specific interaction key
        pairs = fs.get_interaction_pairs(interaction_keys=[interaction_key])
        file_ids = pairs[0] if pairs else []
        print(f"📍 Using interaction key: {interaction_key} -> {file_ids}")

    if not file_ids:
        print("❌ No interaction pairs found")
        return

    # Download every participant file of the selected interaction(s)
    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded interaction pair: {file_ids}")


def download_samples_1gb(file_ids: list[str] | None = None, num_samples: int = 10):
    """
    Download roughly 1GB worth of samples (about 10 interactions).

    Suited to first-pass exploration and prototyping. If ``file_ids`` is not
    supplied, ``num_samples`` IDs are sampled automatically and a short
    preview of them is printed before downloading.

    :param file_ids: Specific file IDs to download, or None to auto-sample
    :param num_samples: How many IDs to sample (only used when auto-sampling)
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="improvised",
            split="test",
            preferred_vendors_only=True,
            seed=42,  # fixed seed so the sampling is reproducible
            num_workers=4,
        )
    )

    if file_ids is None:
        # Nothing requested explicitly: sample IDs from preferred vendors.
        file_ids = fs.sample_random_file_ids(num_samples=num_samples)
        print(f"🎲 Auto-sampled {len(file_ids)} file IDs from preferred vendors")

        # Show at most the first three IDs; mark a truncated list with "...".
        if len(file_ids) > 3:
            print(f"Sample IDs: {file_ids[:3]}...")
        else:
            print(f"Sample IDs: {file_ids}")

    fs.download_batch_from_s3(file_ids)
    sample_count = len(file_ids)
    # The size figure is a rough estimate of 100MB per sample.
    print(f"✅ Downloaded {sample_count} samples (~{sample_count * 100}MB)")


def download_session_exploration(
    session_key: str | None = None, interactions_per_session: int = 4
):
    """
    Download one session group for deeper exploration (~400MB per session).

    Useful for studying conversational context and session dynamics. The
    first group returned by ``get_session_groups`` is downloaded: either the
    group for ``session_key`` or, when no key is given, one auto-sampled
    session.

    :param session_key: Session key (V00_S0809) or None to auto-sample
    :param interactions_per_session: Target interactions per session
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="naturalistic",
            split="dev",
            preferred_vendors_only=True,
            num_workers=4,
        )
    )

    auto_sample = session_key is None
    # Ask for one sampled session, or for the explicitly named session.
    selector = {"num_sessions": 1} if auto_sample else {"session_keys": [session_key]}
    groups = fs.get_session_groups(
        interactions_per_session=interactions_per_session, **selector
    )

    # Only the first returned group is used; no groups means nothing to fetch.
    all_file_ids = groups[0] if groups else []

    if auto_sample:
        print(f"🎲 Auto-sampled session: {len(all_file_ids)} interactions")
    else:
        print(
            f"📍 Using session key: {session_key} -> {len(all_file_ids)} interactions"
        )

    if not all_file_ids:
        print("❌ No session interactions found")
        return

    fs.download_batch_from_s3(all_file_ids)
    print(f"✅ Downloaded session with {len(all_file_ids)} interactions")


def main():
    """
    Print the available S3 download scenarios, then run one of them.

    Every download function accepts either an explicit key or no argument,
    in which case it auto-samples. The scenario that actually runs is the
    uncommented call at the end of this function.
    """
    menu_lines = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        "   Preferred: V00, V01 (smaller files)",
        "   Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        "   Interaction key: V00_S0809_I00000582",
        "   Session key: V00_S0809",
    )
    for line in menu_lines:
        print(line)

    # Pick a scenario by uncommenting exactly the call you want:
    # download_single_example()  # Auto-samples if no file_id provided
    # download_single_example("V01_S0223_I00000127_P1505")  # Specific file
    download_interaction_pair()  # Auto-samples interaction pairs
    # download_interaction_pair("V00_S0809_I00000582")  # Specific interaction
    # download_samples_1gb()  # Auto-samples 10 diverse files
    # download_samples_1gb(num_samples=20)  # Auto-samples 20 files (~2GB)
    # download_session_exploration()  # Auto-samples 1 rich session
    # download_session_exploration("V00_S0809")  # Specific session


# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()