Spaces:
Runtime error
Runtime error
import time | |
from dataclasses import asdict | |
import pandas as pd | |
import wandb | |
from langchain.document_loaders import YoutubeLoader | |
from pytube import Playlist, YouTube | |
from tqdm import tqdm | |
from config import config | |
def retry_access_yt_object(url, max_retries=5, interval_secs=5):
    """Create a ``YouTube`` object for *url*, retrying until its title can be read.

    Each attempt constructs ``YouTube(url)`` and reads its ``title``; the object
    is returned as soon as both steps succeed. A failed attempt prints a progress
    message and, unless it was the final attempt, waits ``interval_secs`` seconds
    before trying again.

    Args:
        url: URL passed to the ``YouTube`` constructor.
        max_retries: Maximum number of attempts; must be at least 1.
        interval_secs: Seconds to sleep between two consecutive attempts.

    Returns:
        The ``YouTube`` object whose title was accessed successfully.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The exception raised by the final attempt when every
            attempt fails.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # end in ``raise None``, which surfaces as an unrelated TypeError.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    last_exception = None
    for attempt in range(1, max_retries + 1):
        try:
            yt = YouTube(url)
            # Accessing the title is part of the success check; the value
            # itself is not needed here.
            _ = yt.title
            return yt
        except Exception as err:
            # Remember the most recent failure so it can be re-raised if all
            # attempts are exhausted.
            last_exception = err
            print(
                f"Failed to create YouTube object or access title. Retrying... ({attempt}/{max_retries})"
            )
            # Only wait when another attempt will follow; sleeping after the
            # final failure would just delay the error.
            if attempt < max_retries:
                time.sleep(interval_secs)

    # Every attempt failed: surface the last error to the caller.
    raise last_exception
def _scrape_video(video_url):
    """Collect metadata and the whitespace-normalised transcript of one video.

    Args:
        video_url: URL of the video, as yielded by the playlist.

    Returns:
        A dict with the keys ``title``, ``url``, ``duration``, ``publish_date``
        (formatted ``YYYY-MM-DD``), ``transcript`` and ``total_words``.

    Raises:
        Exception: Any error from loading the video metadata or transcript is
            propagated to the caller, which decides whether to skip the video.
    """
    yt = retry_access_yt_object(video_url, max_retries=25, interval_secs=2)
    record = {
        "title": yt.title,
        "url": video_url,
        "duration": yt.length,
        "publish_date": yt.publish_date.strftime("%Y-%m-%d"),
    }

    loader = YoutubeLoader.from_youtube_url(video_url)
    # Splitting on whitespace and re-joining with single spaces collapses
    # newlines and repeated blanks; the same word list provides the count.
    words = loader.load()[0].page_content.split()
    record["transcript"] = " ".join(words)
    record["total_words"] = len(words)
    return record


def main():
    """Scrape every video of the configured playlist and log the result to W&B.

    The scraped rows are written to ``yt_podcast_transcript.csv`` under
    ``config.root_data_dir``, uploaded as a ``dataset`` artifact and logged as
    a table on the same run. Videos that fail to scrape are reported and
    skipped.
    """
    # Using the run as a context manager ensures it is finished even when a
    # later step (saving, uploading) raises.
    with wandb.init(
        project=config.project_name, job_type="dataset", config=asdict(config)
    ) as run:
        playlist = Playlist(config.playlist_url)
        playlist_video_urls = playlist.video_urls
        print(f"There are total {len(playlist_video_urls)} videos in the playlist.")

        video_data = []
        for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
            try:
                video_data.append(_scrape_video(video))
            except Exception as err:
                # Best-effort scraping: skip this video but report why.
                # ``Exception`` (rather than a bare ``except``) lets Ctrl-C and
                # SystemExit still stop the script.
                print(f"Failed to scrape {video}: {err!r}")

        print(f"Total podcast episodes scraped: {len(video_data)}")

        # Save the scraped data to a CSV file.
        df = pd.DataFrame(video_data)
        data_path = config.root_data_dir / "yt_podcast_transcript.csv"
        df.to_csv(data_path, index=False)

        # Upload the scraped data to W&B as a dataset artifact.
        artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
        artifact.add_file(data_path)
        run.log_artifact(artifact)

        # Log the same data as a W&B table.
        table = wandb.Table(dataframe=df)
        run.log({"yt_podcast_transcript": table})


if __name__ == "__main__":
    main()