| import re |
| from typing import Optional, Dict, Any, List |
| from youtube_transcript_api import YouTubeTranscriptApi |
| from smolagents import Tool |
|
|
|
|
class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    The tool resolves a video ID from a URL (or accepts a bare ID), picks a
    transcript language, downloads the transcript through
    ``YouTubeTranscriptApi`` and returns it as text prefixed with a short
    metadata header. Failures are reported as ``"Error: ..."`` strings
    rather than raised, so the calling agent always receives a string.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """

    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }

    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a YouTube URL or bare ID.

        Args:
            url: A bare video ID, or a ``watch``, ``youtu.be``, ``embed``,
                ``shorts`` or ``live`` style URL. Surrounding whitespace is
                ignored.

        Returns:
            The video ID, or ``None`` when no ID can be recognised
            (including when ``url`` is empty or not a string).
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()
        if not candidate:
            return None

        # A bare ID may contain '-' and '_', which str.isalnum() rejects,
        # so validate it against the same character class the URL patterns use.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        patterns = (
            # watch URLs, with ``v`` as the first or a later query parameter
            r'youtube\.com/watch\?(?:.*&)?v=([a-zA-Z0-9_-]{11})',
            r'youtu\.be/([a-zA-Z0-9_-]{11})',
            r'youtube\.com/(?:embed|shorts|live)/([a-zA-Z0-9_-]{11})',
        )
        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)

        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned-up string.

        For every entry the whitespace is collapsed and ``[...]`` / ``(...)``
        annotations are removed; entries that end up empty are dropped.
        The joined text then has whitespace before punctuation removed and a
        single space enforced between sentence punctuation and a following
        lowercase letter.

        Args:
            transcript: Sequence of entries, each exposing a ``'text'`` key.

        Returns:
            The cleaned transcript as a single line of text.
        """
        text_parts = []

        for entry in transcript:
            # Collapse first so annotations that spanned a line break are
            # matched by the non-DOTALL patterns below.
            text = re.sub(r'\s+', ' ', entry['text'].strip())
            text = re.sub(r'\[.*?\]', '', text)
            text = re.sub(r'\(.*?\)', '', text)
            # Removing an annotation can leave doubled or edge spaces behind.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """Format the transcript as one ``[MM:SS] text`` line per entry.

        Minutes are not wrapped at 60, so an entry starting at 3725 seconds
        is rendered as ``[62:05]``. Entries whose text is blank are skipped.

        Args:
            transcript: Sequence of entries exposing ``'start'`` (seconds)
                and ``'text'`` keys.

        Returns:
            The formatted lines joined with newlines.
        """
        formatted_parts = []

        for entry in transcript:
            minutes, seconds = divmod(int(entry['start']), 60)
            text = entry['text'].strip()
            if text:
                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Return the language codes of all transcripts listed for a video.

        This is a best-effort lookup: any failure while listing transcripts
        yields an empty list instead of an exception.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    @staticmethod
    def _as_raw_entries(transcript):
        """Return the transcript as a plain sequence of dict entries.

        NOTE(review): newer ``youtube_transcript_api`` releases appear to
        return an object exposing ``to_raw_data()`` from ``fetch()`` instead
        of a list of dicts; confirm against the installed version.
        """
        to_raw_data = getattr(transcript, "to_raw_data", None)
        return to_raw_data() if callable(to_raw_data) else transcript

    def _fetch_english_fallback(self, video_id: str) -> Optional[tuple]:
        """Try the generated, then the manually created, English transcript.

        Returns:
            ``(entries, language_code)`` on success, otherwise ``None``.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for finder in finders:
            try:
                found = finder(['en'])
                return self._as_raw_entries(found.fetch()), found.language_code
            except Exception:
                # Best-effort: fall through to the next kind of transcript.
                continue
        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect,
                preferring 'en' and otherwise the first listed language)
            include_timestamps: Whether to include timestamps; when set,
                ``clean_text`` is not applied
            clean_text: Whether to clean and format the text

        Returns:
            Formatted transcript text preceded by a metadata header, or a
            string starting with "Error" when the transcript cannot be
            produced.
        """
        # The inputs are declared nullable, so None means "use the default".
        if not language:
            language = "auto"
        if include_timestamps is None:
            include_timestamps = False
        if clean_text is None:
            clean_text = True

        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                transcript = self._as_raw_entries(
                    YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
                )
            except Exception as primary_error:
                fallback = self._fetch_english_fallback(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(primary_error)}"
                # Report the language that was actually fetched, not the
                # one whose download failed.
                transcript, language = fallback

            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join(entry['text'] for entry in transcript)

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"

            return metadata + result

        except Exception as e:
            return f"Error fetching transcript: {str(e)}"
|
|
|
|
| |
if __name__ == "__main__":
    # Manual smoke test: fetch one well-known video twice (plain text, then
    # with timestamps) and print at most the first 500 characters of each.
    transcript_tool = YouTubeTranscriptTool()
    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    separator = "=" * 50

    print("Testing YouTube Transcript Tool...")
    print(separator)

    # Default options: auto language, cleaned text, no timestamps.
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    if len(result) > 500:
        print(result[:500] + "...")
    else:
        print(result)
    print("\n" + separator + "\n")

    # Same video, one "[MM:SS] text" line per entry.
    result_with_timestamps = transcript_tool.forward(test_url, include_timestamps=True)
    print("With timestamps:")
    if len(result_with_timestamps) > 500:
        print(result_with_timestamps[:500] + "...")
    else:
        print(result_with_timestamps)
|
|
| |
| |