Student0809's picture
Add files using upload-large-folder tool
cb2428f verified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS
def download_single_example(file_id: str | None = None):
    """
    Download a single interaction example (~50-100MB).

    Perfect for quick exploration and understanding data structure.
    When no file_id is given, one file ID is sampled through
    ``fs.sample_random_file_ids`` using a config restricted to preferred
    vendors. If sampling yields nothing, a message is printed and the
    function returns without downloading.

    :param file_id: Specific file ID to download, or None to auto-sample
    """
    config = DatasetConfig(label="improvised", split="dev", preferred_vendors_only=True)
    fs = SeamlessInteractionFS(config=config)

    if file_id is None:
        # Auto-sample a random file ID from preferred vendors
        file_ids = fs.sample_random_file_ids(num_samples=1)
        if not file_ids:
            # Mirror the sibling download helpers: report and bail out instead
            # of failing with an IndexError on an empty sample.
            print("❌ No file IDs could be sampled")
            return
        file_id = file_ids[0]
        print(f"🎲 Auto-sampled file ID: {file_id}")

    # Download single interaction
    fs.gather_file_id_data_from_s3(file_id)
    print(f"✅ Downloaded single example: {file_id}")
def download_interaction_pair(
    interaction_key: str | None = None,
    *,
    num_pairs: int = 4,
    local_dir: str = "/root/autodl-tmp/seamless",
):
    """
    Download interaction pairs from the same session (~100-200MB per pair).

    Ideal for studying conversational dynamics between participants.
    When no interaction_key is given, ``num_pairs`` pairs are auto-sampled and
    all of their file IDs are downloaded in one batch. When a key is given,
    only the first pair returned for that key is downloaded.

    :param interaction_key: Interaction key (V00_S0809_I00000582) or None to auto-sample
    :param num_pairs: Number of pairs to request when auto-sampling
    :param local_dir: Value forwarded to ``DatasetConfig(local_dir=...)``; the
        default is the previously hard-coded, machine-specific path and is kept
        only for backward compatibility
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    if interaction_key is None:
        # Auto-sample interaction pairs from preferred vendors
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        print(f"✅ 获取了{len(pairs)}个pairs (共{len(pairs)*2}个file IDs)")
        # Flatten the sampled pairs into a single list of file IDs.
        # NOTE: no "download complete" message here; nothing has been
        # downloaded yet at this point. Completion is reported after the batch.
        file_ids = [fid for pair in pairs for fid in pair]
    else:
        # Use specific interaction key
        pairs = fs.get_interaction_pairs(interaction_keys=[interaction_key])
        file_ids = pairs[0] if pairs else []
        print(f"📍 Using interaction key: {interaction_key} -> {file_ids}")

    if not file_ids:
        print("❌ No interaction pairs found")
        return

    # Download all participants of the selected interaction(s)
    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded interaction pair: {file_ids}")
def download_samples_1gb(file_ids: list[str] | None = None, num_samples: int = 10):
    """
    Fetch roughly 1GB worth of samples (~10 interactions) in one batch.

    Useful for early exploration and prototyping. If the caller supplies no
    file IDs, ``num_samples`` IDs are drawn via ``fs.sample_random_file_ids``
    and a short preview of them is printed before downloading.

    :param file_ids: Specific file IDs to download, or None to auto-sample
    :param num_samples: Number of samples to download (if auto-sampling)
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="improvised",
            split="test",
            preferred_vendors_only=True,
            seed=42,  # fixed seed keeps the sampling reproducible
            num_workers=4,
        )
    )

    if file_ids is None:
        # No explicit IDs: sample from preferred vendors and show a preview.
        file_ids = fs.sample_random_file_ids(num_samples=num_samples)
        sampled_count = len(file_ids)
        print(f"🎲 Auto-sampled {sampled_count} file IDs from preferred vendors")

        # Show at most three IDs; append an ellipsis only when truncated.
        truncated = sampled_count > 3
        shown = file_ids[:3] if truncated else file_ids
        ellipsis = "..." if truncated else ""
        print(f"Sample IDs: {shown}{ellipsis}")

    fs.download_batch_from_s3(file_ids)
    downloaded = len(file_ids)
    print(f"✅ Downloaded {downloaded} samples (~{downloaded * 100}MB)")
def download_session_exploration(
    session_key: str | None = None, interactions_per_session: int = 4
):
    """
    Fetch one complete session group for deeper exploration (~400MB per session).

    Suited to studying conversational context and session dynamics. Without a
    session_key a single session is auto-sampled; with one, that session is
    looked up explicitly. Only the first returned group is downloaded, and the
    function returns early with a message when no interactions are found.

    :param session_key: Session key (V00_S0809) or None to auto-sample
    :param interactions_per_session: Target interactions per session
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="naturalistic",
            split="dev",
            preferred_vendors_only=True,
            num_workers=4,
        )
    )

    auto_sample = session_key is None
    if auto_sample:
        # Let the filesystem pick one session from preferred vendors.
        query = {
            "num_sessions": 1,
            "interactions_per_session": interactions_per_session,
        }
    else:
        # Look up the caller-specified session.
        query = {
            "session_keys": [session_key],
            "interactions_per_session": interactions_per_session,
        }

    groups = fs.get_session_groups(**query)
    all_file_ids = groups[0] if groups else []

    if auto_sample:
        print(f"🎲 Auto-sampled session: {len(all_file_ids)} interactions")
    else:
        print(
            f"📍 Using session key: {session_key} -> {len(all_file_ids)} interactions"
        )

    if not all_file_ids:
        print("❌ No session interactions found")
        return

    fs.download_batch_from_s3(all_file_ids)
    print(f"✅ Downloaded session with {len(all_file_ids)} interactions")
def main():
    """
    Print the available S3 download scenarios, then run the selected one.

    Every scenario function accepts either an explicit key or no argument,
    in which case it auto-samples. Switch scenarios by changing which call
    at the bottom of this function is active.
    """
    menu_lines = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )
    for line in menu_lines:
        print(line)

    # Pick the download scenario to run (alternatives kept for reference):
    #   download_single_example()                             - auto-samples if no file_id provided
    #   download_single_example("V01_S0223_I00000127_P1505")  - specific file
    #   download_interaction_pair("V00_S0809_I00000582")      - specific interaction
    #   download_samples_1gb()                                - auto-samples 10 diverse files
    #   download_samples_1gb(num_samples=20)                  - auto-samples 20 files (~2GB)
    #   download_session_exploration()                        - auto-samples 1 rich session
    #   download_session_exploration("V00_S0809")             - specific session
    download_interaction_pair()  # auto-samples interaction pairs
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()