interactSpeech / seamless_interaction /scripts /.ipynb_checkpoints /download_bach-checkpoint.py

Add files using upload-large-folder tool

cb2428f verified 3 months ago

4.15 kB

	from pathlib import Path
	from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS

	def _read_nonblank_lines(path):
	    """Return the stripped, non-empty lines of the text file at *path*."""
	    # The context manager closes the handle deterministically instead of
	    # leaving it open until garbage collection.
	    with path.open("r", encoding="utf-8") as f:
	        stripped = (line.strip() for line in f)
	        # Blank lines (e.g. a trailing newline added by hand) must not turn
	        # into empty-string keys or file ids.
	        return [entry for entry in stripped if entry]


	def download_interaction_pair(local_dir="/root/autodl-tmp/seamless", num_pairs=200):
	    """
	    Download multiple interaction pairs with resume support.

	    - First run: auto-samples pairs and saves their interaction keys.
	    - Subsequent runs: reloads the saved keys and only downloads files that
	      are not yet recorded as downloaded.

	    State is kept in three text files inside ``local_dir`` (one entry per line):

	    - ``interaction_keys.txt``: the sampled interaction keys.
	    - ``all_file_ids.txt``: every file id resolved from those keys; written
	      only if the file does not exist yet.
	    - ``downloaded_file_ids.txt``: file ids whose download completed; appended
	      to after each successful download.

	    Args:
	        local_dir: Directory that receives the downloaded data and the state
	            files. Created (with parents) if missing.
	        num_pairs: Number of interaction pairs to sample on the first run.
	            Ignored once ``interaction_keys.txt`` exists.

	    Returns:
	        None. Progress and per-file failures are reported via ``print``; a
	        failed download does not abort the remaining downloads.
	    """
	    config = DatasetConfig(label="improvised", split="dev", preferred_vendors_only=True, local_dir=local_dir)
	    fs = SeamlessInteractionFS(config=config)

	    # State-file paths.
	    root = Path(local_dir)
	    keys_file = root / "interaction_keys.txt"
	    all_file_ids_file = root / "all_file_ids.txt"
	    downloaded_ids_file = root / "downloaded_file_ids.txt"

	    root.mkdir(parents=True, exist_ok=True)

	    if not keys_file.exists():
	        # First run: sample pairs and persist their interaction keys.
	        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
	        # Dropping the last "_"-separated component of the first file id in a
	        # pair yields the key (presumably the participant suffix -- confirm
	        # against the dataset's id format).
	        interaction_keys = [pair[0].rsplit("_", 1)[0] for pair in pairs]
	        with keys_file.open("w", encoding="utf-8") as f:
	            f.writelines(f"{key}\n" for key in interaction_keys)
	        print(f"✅ 初次采样 {len(interaction_keys)} 个interaction_keys 已保存: {keys_file}")
	    else:
	        # Later runs: reuse the saved keys so the same pairs are resumed.
	        interaction_keys = _read_nonblank_lines(keys_file)
	        print(f"✅ 已加载 {len(interaction_keys)} 个interaction_keys: {keys_file}")

	    # Resolve every interaction key into its file ids.
	    file_ids = []
	    for key in interaction_keys:
	        pairs = fs.get_interaction_pairs(interaction_keys=[key])
	        if pairs:
	            file_ids.extend(pairs[0])

	    # Record the full list of file ids once; an existing list is left untouched.
	    if not all_file_ids_file.exists():
	        with all_file_ids_file.open("w", encoding="utf-8") as f:
	            f.writelines(f"{fid}\n" for fid in file_ids)
	        print(f"✅ 已保存所有file IDs: {all_file_ids_file}")

	    # Load the ids already downloaded by earlier runs.
	    downloaded = set()
	    if downloaded_ids_file.exists():
	        downloaded = set(_read_nonblank_lines(downloaded_ids_file))

	    # Download whatever is still missing.
	    for fid in file_ids:
	        if fid in downloaded:
	            print(f"⏭️ 已下载，跳过: {fid}")
	            continue
	        try:
	            fs.gather_file_id_data_from_s3(fid, local_dir=local_dir)
	            print(f"✅ 下载完成: {fid}")
	            # Append right away so an interrupted run can resume from here.
	            with downloaded_ids_file.open("a", encoding="utf-8") as f:
	                f.write(fid + "\n")
	        except Exception as e:
	            # Best effort: report the failure and continue with the next file.
	            print(f"❌ 下载失败 {fid}: {e}")

	    print("🎉 所有任务完成")




	def main():
	    """
	    Print an overview of the S3 download scenarios, then start a download.

	    The overview text describes four scenarios and the vendor preference used
	    for auto-sampling (V00/V01 preferred, V03 avoided). Only the
	    interaction-pair download is actually run, with its default arguments.
	    """
	    overview = (
	        "🔍 S3 Download Options with Intelligent Sampling:",
	        "1. Single example (~100MB) - Quick exploration",
	        "2. Interaction pair (~200MB) - Conversational dynamics",
	        "3. Sample set (~1GB) - Initial prototyping",
	        "4. Session exploration (~400MB/session) - Deep context study",
	        "",
	        "💡 All options auto-sample from preferred vendors if no keys provided",
	        " Preferred: V00, V01 (smaller files)",
	        " Avoided: V03 (larger 100MB-800MB videos)",
	        "",
	        "📍 You can also specify exact keys:",
	        " Interaction key: V00_S0809_I00000582",
	        " Session key: V00_S0809",
	    )
	    for text in overview:
	        print(text)

	    # Only the interaction-pair scenario is wired up here. The other scenarios
	    # named in the overview (download_single_example, download_samples_1gb,
	    # download_session_exploration) are not defined in the visible part of
	    # this file, so they cannot simply be swapped in.
	    download_interaction_pair()


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()