Student0809's picture
Add files using upload-large-folder tool
cb2428f verified
from pathlib import Path
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS
def download_interaction_pair(local_dir: str = "/root/autodl-tmp/seamless", num_pairs: int = 200) -> None:
    """
    Download multiple interaction pairs with resume support.

    Progress is tracked in three plain-text files inside ``local_dir``
    (one entry per line):

    - ``interaction_keys.txt``: interaction keys sampled on the first run.
    - ``all_file_ids.txt``: file IDs resolved from those keys (written only
      if the file does not exist yet).
    - ``downloaded_file_ids.txt``: file IDs whose download has completed.

    - First run: auto-samples pairs, saves their keys.
    - Subsequent runs: reloads the saved keys and only downloads remaining files.

    Args:
        local_dir: Directory that receives the downloaded data and the
            bookkeeping files. Created if missing.
        num_pairs: Number of pairs requested when sampling on the first run.
            Not used once ``interaction_keys.txt`` exists.

    A failure while downloading (or recording) a single file ID is printed
    and the loop moves on to the next ID; it does not abort the whole run.
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    # Bookkeeping file paths.
    base_dir = Path(local_dir)
    keys_file = base_dir / "interaction_keys.txt"
    all_file_ids_file = base_dir / "all_file_ids.txt"
    downloaded_ids_file = base_dir / "downloaded_file_ids.txt"
    base_dir.mkdir(parents=True, exist_ok=True)

    if not keys_file.exists():
        # First run: sample pairs and persist their interaction keys.
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        # The key is the first file ID of each pair with its last
        # "_"-separated component removed.
        interaction_keys = [pair[0].rsplit("_", 1)[0] for pair in pairs]
        with keys_file.open("w", encoding="utf-8") as f:
            f.writelines(f"{key}\n" for key in interaction_keys)
        print(f"✅ 初次采样 {len(interaction_keys)} 个interaction_keys 已保存: {keys_file}")
    else:
        # Later runs: reload the saved keys. Blank lines are ignored so a
        # stray empty line never turns into an empty-string key.
        with keys_file.open("r", encoding="utf-8") as f:
            interaction_keys = [key for key in map(str.strip, f) if key]
        print(f"✅ 已加载 {len(interaction_keys)} 个interaction_keys: {keys_file}")

    # Resolve every interaction key into its file IDs.
    file_ids = []
    for key in interaction_keys:
        pairs = fs.get_interaction_pairs(interaction_keys=[key])
        if pairs:
            file_ids.extend(pairs[0])

    # Save the resolved file IDs once; an existing list is never overwritten.
    if not all_file_ids_file.exists():
        with all_file_ids_file.open("w", encoding="utf-8") as f:
            f.writelines(f"{fid}\n" for fid in file_ids)
        print(f"✅ 已保存所有file IDs: {all_file_ids_file}")

    # Load the IDs that earlier runs already finished.
    downloaded = set()
    if downloaded_ids_file.exists():
        with downloaded_ids_file.open("r", encoding="utf-8") as f:
            downloaded = {fid for fid in map(str.strip, f) if fid}

    # Download whatever is still missing.
    for fid in file_ids:
        if fid in downloaded:
            print(f"⏭️ 已下载,跳过: {fid}")
            continue
        try:
            fs.gather_file_id_data_from_s3(fid, local_dir=local_dir)
            print(f"✅ 下载完成: {fid}")
            # Record success immediately so an interrupted run can resume.
            with downloaded_ids_file.open("a", encoding="utf-8") as f:
                f.write(fid + "\n")
            # Keep the in-memory view in sync so a repeated ID in file_ids
            # is skipped instead of downloaded twice in the same run.
            downloaded.add(fid)
        except Exception as e:
            # Best effort: report the failure and continue with the next ID.
            print(f"❌ 下载失败 {fid}: {e}")

    print("🎉 所有任务完成")
def main():
    """
    Print an overview of the S3 download options, then run one scenario.

    The overview is informational only. The scenario actually executed is
    ``download_interaction_pair()`` with its default arguments.
    """
    overview = (
        "🔍 S3 Download Options with Intelligent Sampling:",
        "1. Single example (~100MB) - Quick exploration",
        "2. Interaction pair (~200MB) - Conversational dynamics",
        "3. Sample set (~1GB) - Initial prototyping",
        "4. Session exploration (~400MB/session) - Deep context study",
        "",
        "💡 All options auto-sample from preferred vendors if no keys provided",
        " Preferred: V00, V01 (smaller files)",
        " Avoided: V03 (larger 100MB-800MB videos)",
        "",
        "📍 You can also specify exact keys:",
        " Interaction key: V00_S0809_I00000582",
        " Session key: V00_S0809",
    )
    for text in overview:
        print(text)

    # Active scenario: auto-sample interaction pairs and download them.
    download_interaction_pair()

    # Other scenarios listed in the overview, kept disabled.
    # NOTE(review): these helpers are not defined in the visible part of this
    # file -- confirm they exist before enabling any of them.
    # download_single_example()  # Auto-samples if no file_id provided
    # download_single_example("V01_S0223_I00000127_P1505")  # Specific file
    # download_samples_1gb()  # Auto-samples 10 diverse files
    # download_samples_1gb(num_samples=20)  # Auto-samples 20 files (~2GB)
    # download_session_exploration()  # Auto-samples 1 rich session
    # download_session_exploration("V00_S0809")  # Specific session
    #
    # NOTE(review): download_interaction_pair("V00_S0809_I00000582") would pass
    # the string as `local_dir` (its first parameter), not as an interaction key.
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()