import csv import re import os import yt_dlp import whisper from tqdm import tqdm import ssl ssl._create_default_https_context = ssl._create_unverified_context # Folder to save audio and transcripts audio_folder = 'audio_files' transcripts_folder = 'transcripts' def sanitize_filename(filename): """ Removes invalid characters from filename and truncates it if it's too long. """ s = re.sub(r'[\\/*?:"<>|]', '', filename) if len(s) > 200: s = s[:200] return s def download_audio_with_yt_dlp(video_id): ydl_opts = { 'format': 'bestaudio/best', 'postprocessors': [{ 'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3' }], 'outtmpl': f'{audio_folder}/{video_id}.%(ext)s' } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([f'https://www.youtube.com/watch?v={video_id}']) def transcribe_audio_with_whisper(audio_file): model = whisper.load_model("base") result = model.transcribe(audio_file) return result["text"] def process_video(video_id): audio_file = f"{audio_folder}/{video_id}.mp3" # Download audio download_audio_with_yt_dlp(video_id) # Transcribe audio transcript = transcribe_audio_with_whisper(audio_file) # Save the transcript with open(f"{transcripts_folder}/{video_id}.txt", 'w', encoding="utf-8") as outfile: outfile.write(transcript) def download_audio_from_playlist(playlist_url): ydl_opts = { 'format': 'bestaudio/best', 'postprocessors': [{ 'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192', }], 'outtmpl': f'{audio_folder}/%(id)s.%(ext)s', 'ignoreerrors': True, # Continue on download errors 'extract_flat': True, # Just get video IDs from the playlist } with yt_dlp.YoutubeDL(ydl_opts) as ydl: info_dict = ydl.extract_info(playlist_url, download=False) for video in info_dict['entries']: if video: # Video is not None or deleted process_video(video['id']) def download_transcripts_from_csv(file_path): with open(file_path, 'r') as csvfile: reader = csv.reader(csvfile) for row in tqdm(reader): url = row[0] if "list=" in url: # It's a playlist, process each video in the playlist download_audio_from_playlist(url) else: # It's a single video video_id = url.split('v=')[-1] process_video(video_id) if __name__ == "__main__": # Ensure folders exist os.makedirs(audio_folder, exist_ok=True) os.makedirs(transcripts_folder, exist_ok=True) csv_path = 'urls.csv' download_transcripts_from_csv(csv_path)