Spaces:
Runtime error
Runtime error
sradc
added gitignores when downloading videos/video-ids, and added download_videos.py to run_pipeline.sh
04848c9
import hashlib | |
import logging | |
import os | |
from pathlib import Path | |
from typing import Final, Optional | |
import youtube_dl | |
from pipeline.download_videos import DATA_DIR | |
# Configure the root logger at import time: INFO and above, each record
# prefixed with a second-resolution timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
# Playlists whose video IDs are harvested by this script. The comment above
# each URL is the author's annotation in the form "<number>, <playlist title>"
# (the number is presumably the playlist's video count when it was added).
PLAYLIST_URLS = [
    # 570, Alternative rock of the 2000s (2000-2009)
    "https://www.youtube.com/playlist?list=PL6Lt9p1lIRZ311J9ZHuzkR5A3xesae2pk",
    # 250, Best Pop Music Videos - Top Pop Hits Playlist
    "https://www.youtube.com/playlist?list=PLMC9KNkIncKtGvr2kFRuXBVmBev6cAJ2u",
    # 184, 80s Music Hits | Best 80s Music Playlist
    "https://www.youtube.com/playlist?list=PLmXxqSJJq-yXrCPGIT2gn8b34JjOrl4Xf",
    # 150, 90's Hits - Greatest 1990's Music Hits (Best 90’s Songs Playlist)
    "https://www.youtube.com/playlist?list=PL7DA3D097D6FDBC02",
    # 100, Best Music Videos of All Time
    "https://www.youtube.com/playlist?list=PLeDakahyfrO-4kuBioL5ZAoy4j6aCnzWy",
    # 200, Pop Music Playlist - Timeless Pop Songs (Updated Weekly 2023)
    "https://www.youtube.com/playlist?list=PLMC9KNkIncKtPzgY-5rmhvj7fax8fdxoj",
    # 82, Top POP Hits 2022 – Biggest Pop Music Videos - Vevo
    "https://www.youtube.com/playlist?list=PLkqz3S84Tw-RfPS9HHi3MRmrinOBKxIr8",
    # 100, Top 100 Music Videos 2023 - Best Music Videos 2023
    "https://www.youtube.com/playlist?list=PLyORnIW1xT6wqvszJbCdLdSjylYMf3sNZ",
    # 119, 90s Music Videos
    "https://www.youtube.com/playlist?list=PL1Mmsa-U48mea1oIN-Eus78giJANx4D9W",
    # 100, 100 Best Music Videos Of The Decade: 2010 - 2019
    "https://www.youtube.com/playlist?list=PLurPBtLcqJqcg3r-HOhR3LZ0aDxpI15Fa",
    # 495, MTV Classic 2000's music videos (US Version)
    "https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj",
]
# Optional value read from the URL_FILE environment variable (None when the
# variable is unset). NOTE(review): nothing in the visible part of this module
# reads URL_FILE -- confirm whether it is still needed.
URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")

# Directory the ID lists are written to. It is built with the `/` operator and
# later used through `.mkdir()`, `.write_text()` and `.open()`, so it is a
# path object rather than a `str` (assumes DATA_DIR is a `pathlib.Path`).
OUTPUT_DIR: Final[Path] = DATA_DIR / "ids"
def get_all_video_ids(channel_url: str) -> list[str]:
    """Get all video IDs from a YouTube channel or playlist URL.

    Args:
        channel_url (str): URL of the YouTube channel or playlist.

    Returns:
        list[str]: List of video IDs. Empty when no information could be
            extracted from the URL.

    Notes:
        If you want the videos from a channel, make sure to pass the `/videos` endpoint of the channel.
    """
    ydl_opts = {
        # Do not abort on extraction errors; failures surface as `None`
        # results instead of exceptions, which is handled below.
        "ignoreerrors": True,
        # Return the playlist's immediate entries without resolving each one.
        "extract_flat": "in_playlist",
        "dump_single_json": True,
        "quiet": True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        playlist_info = ydl.extract_info(channel_url, download=False)

    # With "ignoreerrors" enabled a failed extraction yields None rather than
    # raising, so subscripting it would crash with a TypeError.
    if playlist_info is None:
        logging.warning(f"No information could be extracted from {channel_url}")
        return []

    # Individual entries may also be None when they could not be processed;
    # skip those as well as entries that carry no "id".
    return [
        video["id"]
        for video in playlist_info["entries"]
        if video is not None and "id" in video
    ]
def process_youtube_url(url: str):
    """Collect the video IDs behind ``url`` and save them under ``OUTPUT_DIR``.

    The IDs are written one per line to a ``.txt`` file whose name is the MD5
    hex digest of the file's own content. A ``.gitignore`` containing ``*`` is
    (re)written into ``OUTPUT_DIR`` on every call.

    Args:
        url (str): URL handed to ``get_all_video_ids``.
    """
    logging.info(f"Processing {url}")
    video_ids = get_all_video_ids(url)

    # Make sure the output directory exists and carries its .gitignore.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    gitignore_path = OUTPUT_DIR / ".gitignore"
    gitignore_path.write_text("*")

    # The file name is derived from the content itself.
    content = "\n".join(video_ids)
    digest = hashlib.md5(content.encode()).hexdigest()
    target = OUTPUT_DIR / f"{digest}.txt"

    logging.info(f"Writing {len(video_ids)} video IDs to {target}")
    target.write_text(content)
def main():
    """Process every URL in ``PLAYLIST_URLS``, one after another."""
    url_count = len(PLAYLIST_URLS)
    logging.info(f"Processing {url_count} URLs")
    for playlist_url in PLAYLIST_URLS:
        process_youtube_url(playlist_url)


# Run the pipeline step only when executed as a script, not on import.
if __name__ == "__main__":
    main()