File size: 7,080 Bytes
cb2428f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS
def download_single_example(file_id: str | None = None) -> None:
    """
    Download a single interaction example (~50-100MB).

    Perfect for quick exploration and understanding data structure.
    When no file_id is given, one is drawn with
    ``fs.sample_random_file_ids`` from a dataset configured with
    ``preferred_vendors_only=True``.

    :param file_id: Specific file ID to download, or None to auto-sample
    """
    config = DatasetConfig(label="improvised", split="dev", preferred_vendors_only=True)
    fs = SeamlessInteractionFS(config=config)

    if file_id is None:
        # Auto-sample a random file ID from preferred vendors
        file_ids = fs.sample_random_file_ids(num_samples=1)
        if not file_ids:
            # An empty sample would make file_ids[0] raise IndexError;
            # report it and stop instead of crashing.
            print("❌ No file IDs found")
            return
        file_id = file_ids[0]
        print(f"🎲 Auto-sampled file ID: {file_id}")

    # Download single interaction
    fs.gather_file_id_data_from_s3(file_id)
    print(f"✅ Downloaded single example: {file_id}")
def download_interaction_pair(
    interaction_key: str | None = None,
    *,
    num_pairs: int = 4,
    local_dir: str = "/root/autodl-tmp/seamless",
) -> None:
    """
    Download interaction pairs (both participants of an interaction).

    Ideal for studying conversational dynamics between participants.
    With an explicit interaction_key, only the first pair returned for that
    key is downloaded. Without one, ``num_pairs`` pairs are requested from
    ``fs.get_interaction_pairs`` and all of their file IDs are downloaded in
    one batch.

    :param interaction_key: Interaction key (V00_S0809_I00000582) or None to auto-sample
    :param num_pairs: Number of pairs to request when auto-sampling
    :param local_dir: Value passed to ``DatasetConfig(local_dir=...)``
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    if interaction_key is None:
        # Auto-sample interaction pairs from preferred vendors
        # (the original note expects the returned pairs to be distinct).
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        print(f"✅ 获取了{len(pairs)}个pairs (共{len(pairs)*2}个file IDs)")
        # Flatten the list of pairs into a single list of file IDs
        file_ids = [fid for pair in pairs for fid in pair]
        # NOTE: no "download complete" message here -- nothing has been
        # downloaded yet; completion is reported after the batch call below.
    else:
        # Use specific interaction key
        pairs = fs.get_interaction_pairs(interaction_keys=[interaction_key])
        file_ids = pairs[0] if pairs else []
        print(f"📍 Using interaction key: {interaction_key} -> {file_ids}")

    if not file_ids:
        print("❌ No interaction pairs found")
        return

    # Download both participants from same interaction
    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded interaction pair: {file_ids}")
def download_samples_1gb(file_ids: list[str] | None = None, num_samples: int = 10):
    """
    Fetch roughly 1GB worth of samples (about 10 interactions).

    Handy for first-pass exploration and prototyping. If the caller does not
    supply file_ids, ``num_samples`` IDs are drawn with
    ``fs.sample_random_file_ids`` and a short preview of them is printed.

    :param file_ids: Specific file IDs to download, or None to auto-sample
    :param num_samples: Number of samples to download (if auto-sampling)
    """
    settings = {
        "label": "improvised",
        "split": "test",
        "preferred_vendors_only": True,
        "seed": 42,  # fixed seed keeps the sampling reproducible
        "num_workers": 4,
    }
    fs = SeamlessInteractionFS(config=DatasetConfig(**settings))

    if file_ids is None:
        # Nothing requested explicitly: draw IDs from the preferred vendors.
        file_ids = fs.sample_random_file_ids(num_samples=num_samples)
        print(f"🎲 Auto-sampled {len(file_ids)} file IDs from preferred vendors")

        # Show at most the first three IDs; mark truncation with an ellipsis.
        if len(file_ids) > 3:
            print(f"Sample IDs: {file_ids[:3]}...")
        else:
            print(f"Sample IDs: {file_ids}")

    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded {len(file_ids)} samples (~{len(file_ids) * 100}MB)")
def download_session_exploration(
    session_key: str | None = None, interactions_per_session: int = 4
):
    """
    Fetch one session group for deeper exploration (~400MB per session).

    Useful for studying conversational context and session dynamics. Without
    a session_key, one session is requested via ``num_sessions=1``; with a
    key, that session is looked up via ``session_keys``. Only the first
    returned group is downloaded.

    :param session_key: Session key (V00_S0809) or None to auto-sample
    :param interactions_per_session: Target interactions per session
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="naturalistic",
            split="dev",
            preferred_vendors_only=True,
            num_workers=4,
        )
    )

    auto_sample = session_key is None

    # Pick the selector argument: sample one session, or look up the given key.
    if auto_sample:
        selector = {"num_sessions": 1}
    else:
        selector = {"session_keys": [session_key]}

    groups = fs.get_session_groups(
        **selector, interactions_per_session=interactions_per_session
    )
    all_file_ids = groups[0] if groups else []

    if auto_sample:
        print(f"🎲 Auto-sampled session: {len(all_file_ids)} interactions")
    else:
        print(
            f"📍 Using session key: {session_key} -> {len(all_file_ids)} interactions"
        )

    # Guard: nothing to download.
    if not all_file_ids:
        print("❌ No session interactions found")
        return

    fs.download_batch_from_s3(all_file_ids)
    print(f"✅ Downloaded session with {len(all_file_ids)} interactions")
def main():
    """
    Print the available S3 download scenarios and run the selected one.

    Every download helper accepts either an explicit key or no argument, in
    which case it auto-samples. Exactly one scenario call is active below;
    the others are kept as commented-out alternatives to switch to.
    """
    options = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
    )
    vendor_notes = (
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
    )
    key_examples = (
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )

    # Emit the three sections separated by a blank line (none after the last).
    for position, section in enumerate((options, vendor_notes, key_examples)):
        if position:
            print()
        for line in section:
            print(line)

    # Active scenario -- swap with one of the alternatives below as needed:
    # download_single_example()  # Auto-samples if no file_id provided
    # download_single_example("V01_S0223_I00000127_P1505")  # Specific file
    download_interaction_pair()  # Auto-samples interaction pairs
    # download_interaction_pair("V00_S0809_I00000582")  # Specific interaction
    # download_samples_1gb()  # Auto-samples 10 diverse files
    # download_samples_1gb(num_samples=20)  # Auto-samples 20 files (~2GB)
    # download_session_exploration()  # Auto-samples 1 rich session
    # download_session_exploration("V00_S0809")  # Specific session


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|