Spaces:
Runtime error
Runtime error
minor changes
Browse files
- data/yt_podcast_transcript.csv +0 -0
- src/podcast_data.py +8 -4
data/yt_podcast_transcript.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/podcast_data.py
CHANGED
@@ -46,10 +46,10 @@ if __name__ == "__main__":
|
|
46 |
for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
|
47 |
try:
|
48 |
curr_video_data = {}
|
49 |
-
yt = retry_access_yt_object(video, max_retries=
|
50 |
curr_video_data["title"] = yt.title
|
51 |
curr_video_data["url"] = video
|
52 |
-
curr_video_data["
|
53 |
curr_video_data["publish_date"] = yt.publish_date.strftime("%Y-%m-%d")
|
54 |
loader = YoutubeLoader.from_youtube_url(video)
|
55 |
transcript = loader.load()[0].page_content
|
@@ -64,11 +64,15 @@ if __name__ == "__main__":
|
|
64 |
|
65 |
# save the scraped data to a csv file
|
66 |
df = pd.DataFrame(video_data)
|
67 |
-
|
|
|
68 |
|
69 |
# upload the scraped data to wandb
|
70 |
artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
|
71 |
-
artifact.add_file(
|
72 |
run.log_artifact(artifact)
|
73 |
|
|
|
|
|
|
|
74 |
run.finish()
|
|
|
46 |
for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
|
47 |
try:
|
48 |
curr_video_data = {}
|
49 |
+
yt = retry_access_yt_object(video, max_retries=25, interval_secs=2)
|
50 |
curr_video_data["title"] = yt.title
|
51 |
curr_video_data["url"] = video
|
52 |
+
curr_video_data["duration"] = yt.length
|
53 |
curr_video_data["publish_date"] = yt.publish_date.strftime("%Y-%m-%d")
|
54 |
loader = YoutubeLoader.from_youtube_url(video)
|
55 |
transcript = loader.load()[0].page_content
|
|
|
64 |
|
65 |
# save the scraped data to a csv file
|
66 |
df = pd.DataFrame(video_data)
|
67 |
+
data_path = config.root_data_dir / "yt_podcast_transcript.csv"
|
68 |
+
df.to_csv(data_path, index=False)
|
69 |
|
70 |
# upload the scraped data to wandb
|
71 |
artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
|
72 |
+
artifact.add_file(data_path)
|
73 |
run.log_artifact(artifact)
|
74 |
|
75 |
+
# create wandb table
|
76 |
+
table = wandb.Table(dataframe=df)
|
77 |
+
run.log({"yt_podcast_transcript": table})
|
78 |
run.finish()
|