sradc committed on
Commit
04848c9
1 Parent(s): 3a8e829

added gitignores when downloading videos/video-ids, and added get_video_ids.py to run_pipeline.sh

Browse files
pipeline/download_videos.py CHANGED
@@ -17,6 +17,7 @@ def get_id(url: str) -> str:
17
 
18
  def download_videos(video_ids: List[str]) -> None:
19
  VIDEO_DIR.mkdir(exist_ok=True, parents=True)
 
20
  for video_id in tqdm(video_ids):
21
  video_url = f"https://www.youtube.com/watch?v={video_id}"
22
  video_path = VIDEO_DIR / f"{video_id}.mp4"
 
17
 
18
  def download_videos(video_ids: List[str]) -> None:
19
  VIDEO_DIR.mkdir(exist_ok=True, parents=True)
20
+ (VIDEO_DIR / "gitignore").write_text("*")
21
  for video_id in tqdm(video_ids):
22
  video_url = f"https://www.youtube.com/watch?v={video_id}"
23
  video_path = VIDEO_DIR / f"{video_id}.mp4"
pipeline/get_video_ids.py CHANGED
@@ -6,6 +6,8 @@ from typing import Final, Optional
6
 
7
  import youtube_dl
8
 
 
 
9
  logging.basicConfig(
10
  level=logging.INFO,
11
  format="%(asctime)s - %(levelname)s - %(message)s",
@@ -26,7 +28,7 @@ PLAYLIST_URLS = [
26
  "https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj", # 495, MTV Classic 2000's music videos (US Version)
27
  ]
28
  URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")
29
- OUTPUT_DIR: Final[str] = os.environ.get("OUTPUT_DIR", "data/ids")
30
 
31
 
32
  def get_all_video_ids(channel_url: str) -> list[str]:
@@ -58,12 +60,10 @@ def get_all_video_ids(channel_url: str) -> list[str]:
58
  def process_youtube_url(url: str):
59
  logging.info(f"Processing {url}")
60
  ids = get_all_video_ids(url)
61
-
62
- output_dir = Path(OUTPUT_DIR)
63
- output_dir.mkdir(parents=True, exist_ok=True)
64
-
65
  output = "\n".join(ids)
66
- output_path = output_dir / f"{hashlib.md5(output.encode()).hexdigest()}.txt"
67
  logging.info(f"Writing {len(ids)} video IDs to {output_path}")
68
  with output_path.open(mode="w") as f:
69
  f.write(output)
 
6
 
7
  import youtube_dl
8
 
9
+ from pipeline.download_videos import DATA_DIR
10
+
11
  logging.basicConfig(
12
  level=logging.INFO,
13
  format="%(asctime)s - %(levelname)s - %(message)s",
 
28
  "https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj", # 495, MTV Classic 2000's music videos (US Version)
29
  ]
30
  URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")
31
+ OUTPUT_DIR: Final[str] = DATA_DIR / "ids"
32
 
33
 
34
  def get_all_video_ids(channel_url: str) -> list[str]:
 
60
  def process_youtube_url(url: str):
61
  logging.info(f"Processing {url}")
62
  ids = get_all_video_ids(url)
63
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
64
+ (OUTPUT_DIR / ".gitignore").write_text("*")
 
 
65
  output = "\n".join(ids)
66
+ output_path = OUTPUT_DIR / f"{hashlib.md5(output.encode()).hexdigest()}.txt"
67
  logging.info(f"Writing {len(ids)} video IDs to {output_path}")
68
  with output_path.open(mode="w") as f:
69
  f.write(output)
run_pipeline.sh CHANGED
@@ -1,5 +1,6 @@
1
  #!/usr/bin/env bash
2
  set -e
3
 
 
4
  poetry run python pipeline/download_videos.py
5
  poetry run python pipeline/process_videos.py
 
1
  #!/usr/bin/env bash
2
  set -e
3
 
4
+ poetry run python pipeline/get_video_ids.py
5
  poetry run python pipeline/download_videos.py
6
  poetry run python pipeline/process_videos.py