|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS |
|
|
|
|
|
|
|
|
def download_single_example(file_id: str | None = None):
    """
    Download a single interaction example (~50-100MB).

    Perfect for quick exploration and understanding data structure.
    When no ``file_id`` is given, one ID is drawn with
    ``fs.sample_random_file_ids``; the config is built with
    ``preferred_vendors_only=True`` (V00, V01 per this script's help text).

    If auto-sampling yields no file IDs, a message is printed and nothing
    is downloaded.

    :param file_id: Specific file ID to download, or None to auto-sample
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
    )
    fs = SeamlessInteractionFS(config=config)

    if file_id is None:
        file_ids = fs.sample_random_file_ids(num_samples=1)
        if not file_ids:
            # An empty sample would otherwise fail with IndexError on
            # file_ids[0]; report it and stop instead.
            print("❌ No file IDs found")
            return
        file_id = file_ids[0]
        print(f"🎲 Auto-sampled file ID: {file_id}")

    fs.gather_file_id_data_from_s3(file_id)
    print(f"✅ Downloaded single example: {file_id}")
|
|
|
|
|
|
|
|
def download_interaction_pair(
    interaction_key: str | None = None,
    num_pairs: int = 4,
    local_dir: str = "/root/autodl-tmp/seamless",
):
    """
    Download interaction pairs from the same session (~100-200MB per pair).

    Ideal for studying conversational dynamics between participants.
    Auto-samples ``num_pairs`` interaction pairs if no interaction_key is
    provided and downloads every file ID in them; otherwise only the first
    pair returned for ``interaction_key`` is downloaded.

    If no file IDs are found, a message is printed and nothing is downloaded.

    :param interaction_key: Interaction key (V00_S0809_I00000582) or None to auto-sample
    :param num_pairs: Number of pairs to request when auto-sampling
        (unused when ``interaction_key`` is given)
    :param local_dir: Value passed to ``DatasetConfig(local_dir=...)``;
        presumably the download destination -- confirm against DatasetConfig
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    if interaction_key is None:
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        # Flatten the sampled pairs into one list of file IDs.
        file_ids = [fid for pair in pairs for fid in pair]
        # Report the real number of IDs rather than assuming two per pair.
        print(f"✅ 获取了{len(pairs)}个pairs (共{len(file_ids)}个file IDs)")
    else:
        pairs = fs.get_interaction_pairs(interaction_keys=[interaction_key])
        file_ids = pairs[0] if pairs else []
        print(f"📍 Using interaction key: {interaction_key} -> {file_ids}")

    if not file_ids:
        print("❌ No interaction pairs found")
        return

    # Success is reported only after the batch download call has returned.
    fs.download_batch_from_s3(file_ids)
    print(f"✅ Downloaded interaction pair: {file_ids}")
|
|
|
|
|
|
|
|
def download_samples_1gb(file_ids: list[str] | None = None, num_samples: int = 10):
    """
    Fetch roughly 1GB of samples (~10 interactions) in one batch.

    Intended for initial exploration and prototyping. When ``file_ids`` is
    None, ``num_samples`` IDs are drawn with ``fs.sample_random_file_ids``
    and a short preview (at most three IDs) is printed before downloading.

    :param file_ids: Specific file IDs to download, or None to auto-sample
    :param num_samples: Number of samples to download (if auto-sampling)
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="improvised",
            split="test",
            preferred_vendors_only=True,
            seed=42,
            num_workers=4,
        )
    )

    if file_ids is None:
        file_ids = fs.sample_random_file_ids(num_samples=num_samples)
        print(f"🎲 Auto-sampled {len(file_ids)} file IDs from preferred vendors")

        # Show only the first three IDs, with an ellipsis when more exist.
        truncated = len(file_ids) > 3
        ids_preview = file_ids[:3] if truncated else file_ids
        if truncated:
            print(f"Sample IDs: {ids_preview}...")
        else:
            print(f"Sample IDs: {ids_preview}")

    fs.download_batch_from_s3(file_ids)
    # The size figure is a rough estimate of 100MB per sample.
    print(f"✅ Downloaded {len(file_ids)} samples (~{len(file_ids) * 100}MB)")
|
|
|
|
|
|
|
|
def download_session_exploration(
    session_key: str | None = None, interactions_per_session: int = 4
):
    """
    Fetch one session's group of interactions (~400MB per session).

    Useful for studying conversational context and session dynamics.
    With no ``session_key`` a single session is requested via
    ``fs.get_session_groups(num_sessions=1, ...)``; otherwise the group for
    the given key is requested. Only the first returned group is downloaded.
    If that group is missing or empty, a message is printed and nothing is
    downloaded.

    :param session_key: Session key (V00_S0809) or None to auto-sample
    :param interactions_per_session: Target interactions per session
    """
    fs = SeamlessInteractionFS(
        config=DatasetConfig(
            label="naturalistic",
            split="dev",
            preferred_vendors_only=True,
            num_workers=4,
        )
    )

    auto_sample = session_key is None
    if auto_sample:
        groups = fs.get_session_groups(
            num_sessions=1,
            interactions_per_session=interactions_per_session,
        )
    else:
        groups = fs.get_session_groups(
            session_keys=[session_key],
            interactions_per_session=interactions_per_session,
        )

    # Keep only the first session group; fall back to an empty list.
    all_file_ids = []
    if groups:
        all_file_ids = groups[0]

    if auto_sample:
        print(f"🎲 Auto-sampled session: {len(all_file_ids)} interactions")
    else:
        print(
            f"📍 Using session key: {session_key} -> {len(all_file_ids)} interactions"
        )

    if not all_file_ids:
        print("❌ No session interactions found")
        return

    fs.download_batch_from_s3(all_file_ids)
    print(f"✅ Downloaded session with {len(all_file_ids)} interactions")
|
|
|
|
|
|
|
|
def main():
    """
    Print an overview of the S3 download options, then run the pair download.

    The overview lists the four download helpers and the key formats they
    accept. After printing it, ``download_interaction_pair()`` is called
    with its default arguments.
    """
    # Empty strings produce the same blank lines as a bare print().
    overview = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )
    for line in overview:
        print(line)

    download_interaction_pair()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|