from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline, AutoTokenizer
import torch
import re
import gradio as gr
import requests
import os

from pydantic import BaseModel, ConfigDict
from pydantic_core import core_schema
from starlette.requests import Request

# -------------------
# CONFIGURATION
# -------------------
model_path = "facebook/bart-large-cnn"
# Keep chunks below BART's 1024-token input limit, leaving headroom for the
# special tokens the tokenizer adds around each chunk.
max_tokens = 1000
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")  # Set your YouTube Data API key in the environment; never hardcode it

# -------------------
# Load model
# -------------------
tokenizer = AutoTokenizer.from_pretrained(model_path)
text_summary = pipeline(
    task="summarization",
    model=model_path,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)

# -------------------
# Pydantic/Starlette workaround
# -------------------
# Gradio's FastAPI layer can trip over pydantic trying to build a schema for
# starlette's Request type. Registering a permissive core schema on Request
# sidesteps the "unable to generate pydantic-core schema" error.
def get_pydantic_core_schema(request_type, handler):
    return core_schema.any_schema()

Request.__get_pydantic_core_schema__ = get_pydantic_core_schema

class MyModel(BaseModel):
    # Allow non-pydantic types such as starlette's Request as field types.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    request: Request

# -------------------
# Utilities
# -------------------
def extract_video_id(url):
    # Match the 11-character video ID that follows "v=" or a path separator.
    regex = r"(?:v=|\/)([0-9A-Za-z_-]{11})"
    match = re.search(regex, url)
    if match:
        return match.group(1)
    raise ValueError("Invalid YouTube URL")

def fetch_video_metadata(video_id):
    try:
        url = (
            "https://www.googleapis.com/youtube/v3/videos"
            f"?part=snippet&id={video_id}&key={YOUTUBE_API_KEY}"
        )
        response = requests.get(url, timeout=10)
        data = response.json()
        if "items" in data and data["items"]:
            snippet = data["items"][0]["snippet"]
            return snippet["title"], snippet["description"]
        return "Title Not Found", "Description Not Found"
    except Exception as e:
        return "Error fetching title", str(e)

def get_transcript_text(video_id):
    try:
        # Module-level get_transcript is the pre-1.0 youtube-transcript-api interface.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in transcript)
    except Exception:
        # Transcripts can be disabled or unavailable; the caller handles None.
        return None

def split_into_chunks(text, tokenizer, max_tokens):
    # Greedily pack whole words into chunks of at most max_tokens word-piece tokens.
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for word in words:
        token_len = len(tokenizer.tokenize(word))
        if current_len + token_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_len = token_len
        else:
            current_chunk.append(word)
            current_len += token_len
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
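# Quick sanity check for the chunker (a sketch; `long_text` is a placeholder,
# not part of the original script). Note the budget is approximate: per-word
# token counts can differ slightly from tokenizing the joined chunk, which is
# another reason to keep max_tokens below the model's hard limit.
#
#     pieces = split_into_chunks(long_text, tokenizer, 64)
#     print([len(tokenizer.tokenize(p)) for p in pieces])  # each ~64 or fewer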
def summarize_text(full_text):
    if not full_text.strip():
        return "Transcript is empty or could not be retrieved."
    chunks = split_into_chunks(full_text, tokenizer, max_tokens)
    summaries = []
    for i, chunk in enumerate(chunks):
        try:
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = text_summary(chunk, max_length=180, min_length=10, do_sample=False)
            summaries.append(summary[0]["summary_text"])
        except Exception as e:
            summaries.append(f"[Error summarizing chunk: {str(e)}]")
    return "\n\n".join(summaries)

# -------------------
# Main Summary Function
# -------------------
def summarize_youtube_video(url):
    try:
        video_id = extract_video_id(url)
        # The description is fetched alongside the title but not displayed.
        title, description = fetch_video_metadata(video_id)
        transcript = get_transcript_text(video_id)
        if not transcript:
            return f"**Title**: {title}\n\n**Transcript not available.**"
        summary = summarize_text(transcript)
        return f"**Title**: {title}\n\n**Summary**:\n{summary}"
    except Exception as e:
        return f"Error: {str(e)}"

# -------------------
# Gradio UI
# -------------------
demo = gr.Interface(
    fn=summarize_youtube_video,
    inputs=[gr.Textbox(label='Enter YouTube URL')],
    outputs=[gr.Textbox(label='Video Title and Summary', lines=15)],
    title='YouTube Video Summarizer with Metadata',
    description='Paste a YouTube video URL to get its title and a summary built from the transcript and the YouTube Data API.'
)

if __name__ == "__main__":
    demo.launch()
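# -------------------
# Programmatic usage (sketch)
# -------------------
# To exercise the pipeline without the Gradio UI, call the summarizer
# directly. The URL below is a placeholder; any public video with an
# available transcript should work, assuming YOUTUBE_API_KEY is set in
# the environment.
#
#     print(summarize_youtube_video("https://www.youtube.com/watch?v=<VIDEO_ID>"))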