import logging
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor

from deep_translator import GoogleTranslator
from langdetect import detect
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)

# GoogleTranslator rejects a single request of roughly 5000 characters or
# more, so longer texts are translated in pieces that stay below that limit.
_MAX_TRANSLATE_CHARS = 4500

# Language codes tried, in order, when looking for an English transcript.
_ENGLISH_LANGUAGE_CODES = ('en', 'en-US', 'en-GB')


def extract_video_id(url):
    """
    Extract the 11-character video ID from a YouTube URL.

    The patterns are tried in order and the first match wins: an ID that
    directly follows ``v=`` or a ``/``, then an ID that follows ``youtu.be/``.

    Args:
        url: The URL string to search.

    Returns:
        The captured 11-character ID.

    Raises:
        ValueError: If neither pattern matches ``url``.
    """
    # NOTE(review): the first pattern accepts any "/" followed by 11 ID
    # characters, so URLs that are not video links can also yield an "ID".
    patterns = [
        r"(?:v=|\/)([0-9A-Za-z_-]{11}).*",
        r"youtu\.be\/([0-9A-Za-z_-]{11})"
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    raise ValueError("Invalid YouTube URL format. Please provide a valid YouTube URL.")


def format_transcript(transcript_list):
    """
    Join the text of every transcript entry into one space-separated string.

    Args:
        transcript_list: Either a ``list`` of mappings with a ``'text'`` key,
            or any other iterable whose items expose a ``.text`` attribute
            (e.g. a FetchedTranscript object).

    Returns:
        The entry texts joined with single spaces.
    """
    if isinstance(transcript_list, list):
        return " ".join(item['text'] for item in transcript_list)
    # Non-list input: iterate it and read each snippet's ``.text`` attribute.
    return " ".join(snippet.text for snippet in transcript_list)


def _split_into_chunks(text, max_chars):
    """
    Split ``text`` into pieces of at most ``max_chars`` characters.

    Pieces break at spaces where possible; a single space-free run longer
    than ``max_chars`` is cut hard.
    """
    chunks = []
    current = ""
    for word in text.split(" "):
        # A run with no spaces that is itself too long has to be cut hard.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]

        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate

    if current:
        chunks.append(current)
    return chunks


def translate_text_sync(text, dest='en'):
    """
    Translate ``text`` into ``dest`` synchronously.

    The source language is detected with ``langdetect``; if it already equals
    ``dest`` the text is returned untouched. Text longer than the translator's
    per-request limit is translated chunk by chunk and re-joined with spaces.

    Args:
        text: The text to translate.
        dest: Target language code (default ``'en'``).

    Returns:
        The translated text, or the original ``text`` if detection or
        translation raises (the error is logged, never propagated).
    """
    try:
        source_lang = detect(text)
        if source_lang == dest:
            return text

        translator = GoogleTranslator(source=source_lang, target=dest)
        if len(text) <= _MAX_TRANSLATE_CHARS:
            return translator.translate(text)

        # Sending an over-long text in one request fails, which previously
        # meant long transcripts were silently returned untranslated.
        pieces = [
            translator.translate(chunk)
            for chunk in _split_into_chunks(text, _MAX_TRANSLATE_CHARS)
        ]
        return " ".join(piece for piece in pieces if piece)
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text


def _fetch_transcript_text(video_id):
    """
    Make one attempt to obtain transcript text for ``video_id``.

    Order of attempts: the default transcript, then an explicit English
    transcript, then any available transcript (translated to English when its
    language code does not start with ``'en'``).

    Returns:
        The transcript text, or an ``"Error: ..."`` message when the
        fallbacks are exhausted or the transcripts cannot be listed.

    Raises:
        Exception: Anything raised by the default-transcript call other than
            TranscriptsDisabled, NoTranscriptFound or VideoUnavailable; the
            caller decides whether to retry.
    """
    # NOTE(review): get_transcript/list_transcripts are called on the class
    # itself; confirm the installed youtube-transcript-api still offers them.
    try:
        # Try the simplest approach first.
        return format_transcript(YouTubeTranscriptApi.get_transcript(video_id))
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable):
        # Fall through to the more elaborate lookups below.
        pass

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Try to get an English transcript first (manual or auto-generated).
        for lang_code in _ENGLISH_LANGUAGE_CODES:
            try:
                transcript = transcript_list.find_transcript([lang_code])
                return format_transcript(transcript.fetch())
            except Exception:
                # Best effort: move on to the next code. Unlike a bare
                # ``except:``, this lets KeyboardInterrupt/SystemExit through.
                continue

        # Try any available language and translate it.
        for transcript in transcript_list:
            try:
                raw_transcript = format_transcript(transcript.fetch())

                # If it's already English, return it as-is.
                if transcript.language_code.startswith('en'):
                    return raw_transcript

                return translate_text_sync(raw_transcript, 'en')
            except Exception as inner_e:
                logging.warning(f"Failed with transcript {transcript.language_code}: {str(inner_e)}")
                continue

        # If we get here, no transcript worked.
        return "Error: No available transcripts found for this video. Please try a different video or provide example posts instead."
    except Exception as e:
        # Any failure here is reported to the user as disabled subtitles;
        # log the real cause so it is not lost.
        logging.warning("Could not list transcripts for %s: %s", video_id, e)
        return "Error: Subtitles are disabled for this video. Please try a different video or provide example posts instead."


def get_youtube_transcript(video_url, max_retries=3):
    """
    Fetch the transcript text for a YouTube video URL.

    Tries the default transcript, then English variants, then any language
    with translation to English. Unexpected errors are retried up to
    ``max_retries`` times with a random 1-3 second pause between attempts.

    Args:
        video_url: The YouTube video URL (non-empty string).
        max_retries: Maximum number of attempts (default 3).

    Returns:
        The transcript text on success; otherwise a human-readable message
        starting with ``"Error: "``. This function reports failures through
        its return value rather than by raising.
    """
    if not video_url or not isinstance(video_url, str):
        return "Error: Please provide a valid YouTube URL"

    # The ID does not change between attempts, so resolve it once up front.
    try:
        video_id = extract_video_id(video_url)
    except ValueError as e:
        # URL parsing error.
        return f"Error: {str(e)}"
    logging.info(f"Processing video ID: {video_id}")

    for retry in range(max_retries):
        # Add a small delay between retries to avoid rate limiting.
        if retry > 0:
            time.sleep(random.uniform(1, 3))

        try:
            return _fetch_transcript_text(video_id)
        except ValueError as e:
            # A ValueError is not retried; its message goes to the caller.
            return f"Error: {str(e)}"
        except Exception as e:
            # General error: retry unless this was the last attempt.
            if retry < max_retries - 1:
                logging.warning(f"Retry {retry+1}/{max_retries} due to: {str(e)}")
                continue
            return f"Error: Failed to fetch transcript after {max_retries} attempts. Please try a different video or provide example posts."

    # Only reached when the loop body never ran (max_retries <= 0).
    return "Error: Unable to process this YouTube video. Please try a different video or provide example posts instead."