Gladiator committed on
Commit
bc273b1
1 Parent(s): dd20405

minor changes

Browse files
data/yt_podcast_transcript.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/podcast_data.py CHANGED
@@ -46,10 +46,10 @@ if __name__ == "__main__":
46
  for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
47
  try:
48
  curr_video_data = {}
49
- yt = retry_access_yt_object(video, max_retries=20, interval_secs=2)
50
  curr_video_data["title"] = yt.title
51
  curr_video_data["url"] = video
52
- curr_video_data["length"] = yt.length
53
  curr_video_data["publish_date"] = yt.publish_date.strftime("%Y-%m-%d")
54
  loader = YoutubeLoader.from_youtube_url(video)
55
  transcript = loader.load()[0].page_content
@@ -64,11 +64,15 @@ if __name__ == "__main__":
64
 
65
  # save the scraped data to a csv file
66
  df = pd.DataFrame(video_data)
67
- df.to_csv(config.root_data_dir / "yt_podcast_transcript.csv", index=False)
 
68
 
69
  # upload the scraped data to wandb
70
  artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
71
- artifact.add_file(config.yt_scraped_data_path)
72
  run.log_artifact(artifact)
73
 
 
 
 
74
  run.finish()
 
46
  for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
47
  try:
48
  curr_video_data = {}
49
+ yt = retry_access_yt_object(video, max_retries=25, interval_secs=2)
50
  curr_video_data["title"] = yt.title
51
  curr_video_data["url"] = video
52
+ curr_video_data["duration"] = yt.length
53
  curr_video_data["publish_date"] = yt.publish_date.strftime("%Y-%m-%d")
54
  loader = YoutubeLoader.from_youtube_url(video)
55
  transcript = loader.load()[0].page_content
 
64
 
65
  # save the scraped data to a csv file
66
  df = pd.DataFrame(video_data)
67
+ data_path = config.root_data_dir / "yt_podcast_transcript.csv"
68
+ df.to_csv(data_path, index=False)
69
 
70
  # upload the scraped data to wandb
71
  artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
72
+ artifact.add_file(data_path)
73
  run.log_artifact(artifact)
74
 
75
+ # create wandb table
76
+ table = wandb.Table(dataframe=df)
77
+ run.log({"yt_podcast_transcript": table})
78
  run.finish()