# YouTube video summarizer: fetches a video transcript and summarizes it
# chunk-by-chunk with the OpenAI API, exposed through a Gradio UI.
import os
import re

import gradio as gr
from dotenv import load_dotenv
from IPython.display import Markdown
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi

# Load OPENAI_API_KEY (and friends) from .env BEFORE constructing the client;
# the original imported load_dotenv but never called it.
load_dotenv()
openai = OpenAI()


class YoutubeVideoID:
    """Parses and holds the 11-character video ID for a YouTube URL."""

    def __init__(self, url):
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extract the YouTube video ID from a given URL.

        Supports both regular (youtube.com/watch?v=...) and shortened
        (youtu.be/...) URLs.

        :param url: str, the YouTube URL to parse
        :return: str, the 11-character video ID
        :raises ValueError: if the URL does not contain a video ID
        """
        # Regular expression to match YouTube video URL and extract the video ID
        regex = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
        match = re.match(regex, url)
        if match:
            return match.group(1)
        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        return f"Video ID: {self.video_id}"


def get_transcript(video_id, language='en'):
    """Return the full transcript text for *video_id*, or None on failure.

    :param video_id: str, the 11-character YouTube video ID
    :param language: str, preferred transcript language code (default 'en')
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        # Join all the 'text' fields into a single string
        return " ".join(item['text'] for item in transcript)
    except Exception as e:
        print(f"Error fetching transcript: {e}")
        return None


def summarize_text(text):
    """Summarize *text* with the OpenAI chat API; return None on failure.

    The original body referenced undefined names (`pipeline`, `torch`,
    `input_text`) and would raise NameError on every call; it now uses the
    OpenAI client that this module already configures.
    """
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Summarize the following transcript text concisely."},
                {"role": "user", "content": text},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None


def split_text(text, chunk_size=3000):
    """
    Split large text into smaller chunks based on the given chunk size.
    Ensures that chunks end with a full stop where possible to maintain
    sentence integrity.

    :param text: str, the text to be split
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    """
    chunks = []
    while len(text) > chunk_size:
        # Find the last full stop within or at the chunk size
        # (+1 so a period sitting exactly at chunk_size is included).
        split_point = text.rfind('.', 0, chunk_size + 1)
        if split_point == -1:
            # No period found within the chunk size: hard-cut at chunk_size.
            split_point = chunk_size
        # Keep the period with its sentence; don't strip interior whitespace.
        chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])
        text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]
    # Add the remaining text as the final chunk, only if there's content.
    if text:
        chunks.append(text.strip())
    return chunks


def get_result(video_url, summarize=True):
    """Fetch a video's transcript and optionally summarize it.

    :param video_url: str, a YouTube video URL
    :param summarize: bool, when False return the raw transcript text
    :return: Markdown summary, raw transcript str, or None if fetching failed
    """
    yt_video = YoutubeVideoID(video_url)
    transcript_text = get_transcript(yt_video.video_id)
    if not summarize:
        return transcript_text
    # Summarize each chunk independently, then stitch the pieces together.
    summaries = [summarize_text(chunk) for chunk in split_text(transcript_text)]
    # Skip chunks whose summarization failed instead of joining None values.
    full_summary = " ".join(s for s in summaries if s)
    return Markdown(full_summary)


def _handle_request(video_url, output_type):
    """Gradio callback: map the radio choice onto get_result, return plain text.

    The original wired the button to an undefined `get_youtube_transcript`;
    this adapter also unwraps the IPython Markdown object, which a Textbox
    output cannot render.
    """
    if output_type == "Full Transcript":
        return get_result(video_url, summarize=False) or ""
    result = get_result(video_url, summarize=True)
    return result.data if isinstance(result, Markdown) else (result or "")


def create_gradio_interface():
    """Build and return the Gradio Blocks UI."""
    with gr.Blocks() as demo:
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown(""" This space provides summary of youtube video urls, you can also get full transcripts if you choose so. ### Credits: Created by **Arsh** – Providing a simple solution for video summarization! """)
        # Input for YouTube URL
        video_url_input = gr.Textbox(label="Enter YouTube URL", lines=1)
        # Radio button for choosing output type (summary or full transcript)
        output_type = gr.Radio(choices=["Summary", "Full Transcript"], label="Choose Output Type", value="Summary")
        # Output for summarized or full transcript text
        output_text = gr.Textbox(label="Result", lines=6)
        submit_button = gr.Button("Generate", variant="primary")
        # Define the action for the button press
        submit_button.click(fn=_handle_request, inputs=[video_url_input, output_type], outputs=[output_text])
    return demo


# Keep `demo` at module level (Gradio hosts import it), but only launch
# when run as a script.
demo = create_gradio_interface()

if __name__ == "__main__":
    demo.launch(share=True, show_api=True)