Agents_Course_Final_Assignment

Sleeping

App Files Files Community

Agents_Course_Final_Assignment / youtube_utils.py

vlapparov

Update youtube_utils.py

9ffb7e0 verified 10 months ago

raw

history blame contribute delete

7.9 kB

	import re
	from typing import Optional, Dict, Any, List
	from youtube_transcript_api import YouTubeTranscriptApi
	from smolagents import Tool


	class YouTubeTranscriptTool(Tool):
	    """
	    A tool to fetch transcripts from YouTube videos.

	    Given a YouTube URL or a bare 11-character video ID, the tool picks a
	    transcript language, downloads the captions through
	    ``youtube_transcript_api`` and returns them as plain text (optionally
	    cleaned, or prefixed with timestamps) below a short metadata header.
	    ``forward`` reports failures as strings starting with "Error" instead
	    of raising.
	    """

	    name = "youtube_transcript"
	    description = """
	    Fetches the transcript/captions from a YouTube video.

	    Input: YouTube URL or video ID
	    Output: Clean transcript text with optional timestamps

	    Supports:
	    - Auto-generated and manual captions
	    - Multiple languages
	    - Timestamp formatting options
	    - Text cleaning and formatting
	    """

	    inputs = {
	        "video_url": {
	            "type": "string",
	            "description": "YouTube video URL or video ID"
	        },
	        "language": {
	            "type": "string",
	            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
	            "default": "auto",
	            "nullable": True,
	        },
	        "include_timestamps": {
	            "type": "boolean",
	            "description": "Whether to include timestamps in the output",
	            "default": False,
	            "nullable": True,
	        },
	        "clean_text": {
	            "type": "boolean",
	            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
	            "default": True,
	            "nullable": True,
	        }
	    }

	    output_type = "string"

	    def __init__(self):
	        super().__init__()

	    def extract_video_id(self, url: str) -> Optional[str]:
	        """
	        Extract the 11-character video ID from a YouTube URL or bare ID.

	        Args:
	            url: A bare video ID, or a URL in watch, youtu.be, embed,
	                shorts, live or /v/ form. Surrounding whitespace is ignored.

	        Returns:
	            The video ID, or None when the input is not a string or no ID
	            can be found in it.
	        """
	        if not isinstance(url, str):
	            return None
	        url = url.strip()

	        id_chars = r'[a-zA-Z0-9_-]{11}'

	        # A bare ID may contain '_' and '-', which str.isalnum() would reject.
	        if re.fullmatch(id_chars, url):
	            return url

	        patterns = (
	            # Path-style links: youtu.be/<id>, /embed/<id>, /shorts/<id>, ...
	            r'(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/)'
	            r'(' + id_chars + r')',
	            # Watch links. The lazy prefix lets "v" follow other query
	            # parameters while still preferring the first "v=" in the query.
	            r'youtube\.com/watch\?(?:[^#\s]*?[&;])?v=(' + id_chars + r')',
	        )

	        for pattern in patterns:
	            match = re.search(pattern, url)
	            if match:
	                return match.group(1)

	        return None

	    def clean_transcript_text(self, transcript: List[Dict]) -> str:
	        """
	        Merge transcript entries into one cleaned block of text.

	        Bracketed and parenthesised spans (e.g. "[Music]", "(inaudible)") are
	        removed, runs of whitespace become a single space, spaces before
	        punctuation are dropped and a space is ensured after ".", "!" or "?"
	        when a lowercase letter follows.

	        Args:
	            transcript: Entries as dicts with a 'text' key.

	        Returns:
	            The cleaned text with no leading or trailing whitespace.
	        """
	        text_parts = []

	        for entry in transcript:
	            text = entry['text']
	            # Remove caption artifacts first (replacing them with a space so
	            # neighbouring words are not fused); the whitespace pass below
	            # then closes whatever gaps the removal leaves behind.
	            text = re.sub(r'\[.*?\]', ' ', text, flags=re.DOTALL)
	            text = re.sub(r'\(.*?\)', ' ', text, flags=re.DOTALL)
	            text = re.sub(r'\s+', ' ', text).strip()
	            if text:
	                text_parts.append(text)

	        full_text = ' '.join(text_parts)
	        # Fix punctuation spacing.
	        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
	        # Caveat: this also splits lowercase tokens such as "example.com".
	        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

	        return full_text.strip()

	    def format_with_timestamps(self, transcript: List[Dict]) -> str:
	        """
	        Render the transcript as one "[MM:SS] text" line per entry.

	        Minutes are not wrapped at 60, so an entry 75 minutes in is shown as
	        "[75:00]". Whitespace inside an entry (including line breaks) is
	        collapsed so every caption stays on its own single line.

	        Args:
	            transcript: Entries as dicts with 'start' (seconds) and 'text'.

	        Returns:
	            The formatted lines joined with newlines; entries whose text is
	            empty are skipped.
	        """
	        formatted_parts = []

	        for entry in transcript:
	            minutes, seconds = divmod(int(entry['start']), 60)
	            text = ' '.join(entry['text'].split())
	            if text:
	                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")

	        return '\n'.join(formatted_parts)

	    def _list_transcripts(self, video_id: str):
	        """Return the library's listing of transcripts for ``video_id``."""
	        legacy_list = getattr(YouTubeTranscriptApi, "list_transcripts", None)
	        if legacy_list is not None:
	            return legacy_list(video_id)
	        # NOTE(review): releases of youtube-transcript-api that no longer have
	        # the static ``list_transcripts`` are expected to offer an instance
	        # method ``list`` instead - confirm against the installed version.
	        return YouTubeTranscriptApi().list(video_id)

	    def _as_entries(self, fetched) -> List[Dict[str, Any]]:
	        """Normalise a fetched transcript to a list of entry dicts."""
	        # NOTE(review): some library versions appear to return an object with
	        # ``to_raw_data()`` rather than a plain list of dicts - verify.
	        to_raw_data = getattr(fetched, "to_raw_data", None)
	        if callable(to_raw_data):
	            return to_raw_data()
	        return list(fetched)

	    def _fetch_with_fallback(self, transcript_list, language: str):
	        """
	        Fetch ``language``, falling back to English captions.

	        The attempts are, in order: the requested language, auto-generated
	        English, manually created English.

	        Returns:
	            An ``(entries, language_code)`` pair for the first attempt that
	            succeeds.

	        Raises:
	            Exception: The error raised for the requested language, when
	                every attempt fails.
	        """
	        attempts = (
	            ("find_transcript", [language]),
	            ("find_generated_transcript", ['en']),
	            ("find_manually_created_transcript", ['en']),
	        )
	        first_error = None
	        for finder_name, codes in attempts:
	            try:
	                chosen = getattr(transcript_list, finder_name)(codes)
	                return self._as_entries(chosen.fetch()), chosen.language_code
	            except Exception as e:
	                # Keep the first failure: it describes the language the
	                # caller actually asked for.
	                if first_error is None:
	                    first_error = e
	        raise first_error

	    def get_available_languages(self, video_id: str) -> List[str]:
	        """
	        Get list of available transcript languages for a video.

	        Args:
	            video_id: The 11-character YouTube video ID.

	        Returns:
	            The language codes of the listed transcripts, or an empty list
	            when the listing cannot be retrieved for any reason.
	        """
	        try:
	            return [
	                transcript.language_code
	                for transcript in self._list_transcripts(video_id)
	            ]
	        except Exception:
	            # Best effort by design: callers treat "no languages" and
	            # "lookup failed" the same way.
	            return []

	    def forward(self, video_url: str, language: str = "auto",
	                include_timestamps: bool = False, clean_text: bool = True) -> str:
	        """
	        Fetch and format YouTube video transcript.

	        Args:
	            video_url: YouTube URL or video ID
	            language: Language code for transcript (default auto-detect, also used when None or empty)
	            include_timestamps: Whether to include timestamps (None counts as False)
	            clean_text: Whether to clean and format the text (None counts as True)

	        Returns:
	            Metadata header followed by the formatted transcript text, or a message starting with Error on failure
	        """
	        try:
	            # The inputs are declared nullable, so map None back onto the
	            # documented defaults before using them.
	            language = language.strip() if isinstance(language, str) else ""
	            if not language or language.lower() == "auto":
	                language = "auto"
	            include_timestamps = bool(include_timestamps)
	            clean_text = True if clean_text is None else bool(clean_text)

	            # Extract video ID
	            video_id = self.extract_video_id(video_url)
	            if not video_id:
	                return "Error: Invalid YouTube URL or video ID provided."

	            # List the transcripts once and reuse the listing for language
	            # detection, the fetch and the fallbacks.
	            try:
	                transcript_list = self._list_transcripts(video_id)
	            except Exception as e:
	                if language == "auto":
	                    return "Error: No transcripts available for this video."
	                return f"Error: Could not fetch transcript. {str(e)}"

	            if language == "auto":
	                available_languages = [
	                    transcript.language_code for transcript in transcript_list
	                ]
	                if not available_languages:
	                    return "Error: No transcripts available for this video."

	                # Prefer English, then first available
	                language = 'en' if 'en' in available_languages else available_languages[0]

	            # Fetch transcript
	            try:
	                transcript, used_language = self._fetch_with_fallback(
	                    transcript_list, language)
	            except Exception as e:
	                return f"Error: Could not fetch transcript. {str(e)}"

	            if not transcript:
	                return "Error: No transcript content found."

	            # Format output based on options
	            if include_timestamps:
	                result = self.format_with_timestamps(transcript)
	            elif clean_text:
	                result = self.clean_transcript_text(transcript)
	            else:
	                result = ' '.join(entry['text'] for entry in transcript)

	            # Add metadata; report the language actually fetched, which
	            # differs from the requested one when a fallback was used.
	            metadata = f"YouTube Video ID: {video_id}\n"
	            metadata += f"Language: {used_language}\n"
	            metadata += f"Transcript Length: {len(result)} characters\n"
	            metadata += "-" * 50 + "\n\n"

	            return metadata + result

	        except Exception as e:
	            return f"Error fetching transcript: {str(e)}"


	# Manual smoke test: runs only when this file is executed as a script.
	if __name__ == "__main__":
	    separator = "=" * 50
	    preview_limit = 500

	    def _preview(text):
	        """Return ``text`` cut to ``preview_limit`` characters, adding "..." when cut."""
	        if len(text) > preview_limit:
	            return text[:preview_limit] + "..."
	        return text

	    # Build the tool and pick a sample video to query
	    tool = YouTubeTranscriptTool()
	    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

	    print("Testing YouTube Transcript Tool...")
	    print(separator)

	    # Plain transcript using the default options
	    plain_output = tool.forward(sample_url)
	    print("Basic transcript:")
	    print(_preview(plain_output))
	    print("\n" + separator + "\n")

	    # Same video again, this time with a timestamp in front of each caption
	    timestamped_output = tool.forward(sample_url, include_timestamps=True)
	    print("With timestamps:")
	    print(_preview(timestamped_output))

	# Installation requirements:
	# pip install youtube-transcript-api smolagents