"""Download the YouTube videos whose ids are listed in ``data/ids/*.txt``.

Each ``.txt`` file in ``VIDEO_ID_FOLDER`` is read as one video id per line.
Every id is fetched with the external ``yt-dlp`` command into ``VIDEO_DIR``
as ``<id>.mp4``; files that already exist are skipped.
"""

import re
import subprocess
import sys
from pathlib import Path
from typing import Iterable, List, Set

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; fall back to plain iteration

    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is not installed: return *iterable* as-is."""
        return iterable


# Make the path absolute *before* indexing ``parents`` so a bare relative
# ``__file__`` (e.g. "script.py") cannot raise IndexError.
REPO_ROOT = Path(__file__).absolute().parents[1].resolve()
DATA_DIR = REPO_ROOT / "data"
VIDEO_DIR = DATA_DIR / "videos"
VIDEO_ID_FOLDER = DATA_DIR / "ids"

# Text following "v=" up to the next query separator ("&") or fragment ("#").
_VIDEO_ID_RE = re.compile(r"(?<=v=)[^&#]+")


def get_id(url: str) -> str:
    """Return the value of the ``v=`` query parameter of *url*.

    Args:
        url: A watch URL such as ``https://www.youtube.com/watch?v=<id>``.

    Returns:
        The text after the first ``v=`` up to (not including) the next
        ``&`` or ``#``.

    Raises:
        ValueError: If *url* contains no non-empty ``v=`` value.
    """
    match = _VIDEO_ID_RE.search(url)
    if match is None:
        raise ValueError(f"No 'v=' video id found in URL: {url!r}")
    return match.group(0)


def download_videos(video_ids: Iterable[str], video_format: str = "135") -> None:
    """Download each video in *video_ids* into ``VIDEO_DIR`` via ``yt-dlp``.

    ``VIDEO_DIR`` is created if needed and given a ``.gitignore`` containing
    ``*``. A video whose ``<id>.mp4`` already exists is skipped. A failing
    ``yt-dlp`` invocation is reported on stderr and the loop continues with
    the next id (best effort).

    Args:
        video_ids: YouTube video ids to download.
        video_format: Value passed to ``yt-dlp -f``. The default ``"135"``
            is the format code the script has always used
            (NOTE(review): presumably a 480p video-only mp4 stream - confirm).
    """
    VIDEO_DIR.mkdir(exist_ok=True, parents=True)
    (VIDEO_DIR / ".gitignore").write_text("*")

    for video_id in tqdm(video_ids):
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        video_path = VIDEO_DIR / f"{video_id}.mp4"
        if video_path.exists():
            print(f"Skipping {video_path} because it already exists")
            continue

        result = subprocess.run(
            [
                "yt-dlp",
                "--quiet",
                "-f",
                video_format,
                "-o",
                str(video_path),
                video_url,
            ]
        )
        # Surface failures instead of silently dropping them, but keep going
        # so one bad video does not abort the whole batch.
        if result.returncode != 0:
            print(
                f"Failed to download {video_url} "
                f"(yt-dlp exit code {result.returncode})",
                file=sys.stderr,
            )


def _read_video_ids(folder: Path) -> Set[str]:
    """Collect the unique, non-blank lines of every ``*.txt`` file in *folder*."""
    ids: Set[str] = set()
    for id_file in folder.glob("*.txt"):
        lines = id_file.read_text(encoding="utf-8").splitlines()
        # Strip each line so trailing whitespace never becomes part of an id.
        ids.update(line.strip() for line in lines if line.strip())
    return ids


def main() -> None:
    """Read all ids from ``VIDEO_ID_FOLDER`` and download the videos."""
    print("Downloading videos...")
    # Sorted so the download order is reproducible between runs.
    download_videos(sorted(_read_video_ids(VIDEO_ID_FOLDER)))


if __name__ == "__main__":
    main()