Gladiator committed on
Commit
dd20405
1 Parent(s): 36ed070

minor changes

Browse files
Files changed (2) hide show
  1. src/config.py +4 -5
  2. src/podcast_data.py +6 -4
src/config.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
 
3
 
4
  @dataclass
@@ -6,11 +7,9 @@ class Config:
6
  playlist_url: str = "https://www.youtube.com/playlist?list=PLD80i8An1OEEb1jP0sjEyiLG8ULRXFob_"
7
 
8
  # paths
9
- root_data_dir: str = "../data"
10
- yt_scraped_data_path: str = "../data/yt_data.csv"
11
- chromadb_dir: str = "../data/chromadb"
12
-
13
- # artifacts
14
  yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
15
  summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
16
  summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
 
1
  from dataclasses import dataclass
2
+ from pathlib import Path
3
 
4
 
5
  @dataclass
 
7
  playlist_url: str = "https://www.youtube.com/playlist?list=PLD80i8An1OEEb1jP0sjEyiLG8ULRXFob_"
8
 
9
  # paths
10
+ root_data_dir: Path = Path("data")
11
+ # wandb
12
+ project_name: str = "gradient_dissent_qabot"
 
 
13
  yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
14
  summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
15
  summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
src/podcast_data.py CHANGED
@@ -2,11 +2,11 @@ import time
2
  from dataclasses import asdict
3
 
4
  import pandas as pd
 
5
  from langchain.document_loaders import YoutubeLoader
6
  from pytube import Playlist, YouTube
7
  from tqdm import tqdm
8
 
9
- import wandb
10
  from config import config
11
 
12
 
@@ -35,7 +35,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
35
 
36
 
37
  if __name__ == "__main__":
38
- run = wandb.init(project="gradient_dissent_bot", job_type="dataset", config=asdict(config))
39
 
40
  playlist = Playlist(config.playlist_url)
41
  playlist_video_urls = playlist.video_urls
@@ -62,10 +62,12 @@ if __name__ == "__main__":
62
 
63
  print(f"Total podcast episodes scraped: {len(video_data)}")
64
 
 
65
  df = pd.DataFrame(video_data)
66
- df.to_csv(config.yt_scraped_data_path, index=False)
67
 
68
- artifact = wandb.Artifact("yt_podcast_data", type="dataset")
 
69
  artifact.add_file(config.yt_scraped_data_path)
70
  run.log_artifact(artifact)
71
 
 
2
  from dataclasses import asdict
3
 
4
  import pandas as pd
5
+ import wandb
6
  from langchain.document_loaders import YoutubeLoader
7
  from pytube import Playlist, YouTube
8
  from tqdm import tqdm
9
 
 
10
  from config import config
11
 
12
 
 
35
 
36
 
37
  if __name__ == "__main__":
38
+ run = wandb.init(project=config.project_name, job_type="dataset", config=asdict(config))
39
 
40
  playlist = Playlist(config.playlist_url)
41
  playlist_video_urls = playlist.video_urls
 
62
 
63
  print(f"Total podcast episodes scraped: {len(video_data)}")
64
 
65
+ # save the scraped data to a csv file
66
  df = pd.DataFrame(video_data)
67
+ df.to_csv(config.root_data_dir / "yt_podcast_transcript.csv", index=False)
68
 
69
+ # upload the scraped data to wandb
70
+ artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
71
  artifact.add_file(config.yt_scraped_data_path)
72
  run.log_artifact(artifact)
73