|
|
from pathlib import Path |
|
|
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS |
|
|
|
|
|
def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the text file at *path*."""
    with path.open("r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def _write_lines(path, lines):
    """Overwrite the text file at *path* with *lines*, one entry per line."""
    with path.open("w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def download_interaction_pair(local_dir="/root/autodl-tmp/seamless", num_pairs=200):
    """Download sampled interaction pairs into *local_dir*, resuming across runs.

    State is kept in three text files inside *local_dir*, one entry per line:

    - ``interaction_keys.txt``: keys derived from the pairs sampled on the
      first run. Later runs reuse these keys instead of sampling again.
    - ``all_file_ids.txt``: the file IDs resolved from those keys. Written
      once if absent; this function never reads it back.
    - ``downloaded_file_ids.txt``: appended to after each successful
      download. IDs listed here are skipped.

    A failure on one file is printed and does not stop the remaining
    downloads.

    Args:
        local_dir: Directory that receives the data and the state files.
            Created (with parents) if it does not exist.
        num_pairs: Number of pairs requested from
            ``fs.get_interaction_pairs`` on the first run. Ignored once
            ``interaction_keys.txt`` exists.
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    root = Path(local_dir)
    keys_file = root / "interaction_keys.txt"
    all_file_ids_file = root / "all_file_ids.txt"
    downloaded_ids_file = root / "downloaded_file_ids.txt"

    root.mkdir(parents=True, exist_ok=True)

    # Step 1: sample once and persist the keys, or reload the persisted keys.
    if not keys_file.exists():
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        # The key is the first file ID of the pair with its last "_"-separated
        # component removed (presumably the participant suffix; verify against
        # the dataset's naming scheme).
        interaction_keys = [pair[0].rsplit("_", 1)[0] for pair in pairs]
        _write_lines(keys_file, interaction_keys)
        print(f"✅ 初次采样 {len(interaction_keys)} 个interaction_keys 已保存: {keys_file}")
    else:
        # Blank lines are dropped so a stray newline cannot become an empty key.
        interaction_keys = _read_nonblank_lines(keys_file)
        print(f"✅ 已加载 {len(interaction_keys)} 个interaction_keys: {keys_file}")

    # Step 2: resolve every key to the file IDs of its first matching pair.
    file_ids = []
    for key in interaction_keys:
        pairs = fs.get_interaction_pairs(interaction_keys=[key])
        if pairs:
            file_ids.extend(pairs[0])

    # Step 3: record the full ID list once; an existing file is left untouched.
    if not all_file_ids_file.exists():
        _write_lines(all_file_ids_file, file_ids)
        print(f"✅ 已保存所有file IDs: {all_file_ids_file}")

    # Step 4: load the IDs that earlier runs already finished.
    downloaded = set()
    if downloaded_ids_file.exists():
        downloaded = set(_read_nonblank_lines(downloaded_ids_file))

    # Step 5: download whatever is still missing.
    for fid in file_ids:
        if fid in downloaded:
            print(f"⏭️ 已下载,跳过: {fid}")
            continue
        # Deliberately broad catch: one bad file must not abort the batch.
        try:
            fs.gather_file_id_data_from_s3(fid, local_dir=local_dir)
            print(f"✅ 下载完成: {fid}")
            # Append right away so an interrupted run keeps its progress.
            with downloaded_ids_file.open("a", encoding="utf-8") as f:
                f.write(fid + "\n")
            # Keep the in-memory set current so an ID that occurs twice in
            # file_ids is not fetched again within this run.
            downloaded.add(fid)
        except Exception as e:
            print(f"❌ 下载失败 {fid}: {e}")

    print("🎉 所有任务完成")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Print an overview of the S3 download options, then start a download.

    The overview is informational only. After printing it, this calls
    ``download_interaction_pair()`` with its default arguments.
    """
    # An empty entry produces a blank separator line in the output.
    overview = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )
    for text in overview:
        print(text)

    download_interaction_pair()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the download only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
|
|
|