# PirateXX's picture
# Update app.py
# 53fda21 verified
# imports
import os
import requests
from IPython.display import Markdown
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
import re
class YoutubeVideoID:
    """
    Holds a YouTube URL together with the 11-character video ID parsed from it.

    ``url`` keeps the value exactly as it was passed in; ``video_id`` is the
    ID extracted by :meth:`extract_video_id`.
    """

    # Patterns are tried in order; each one captures the video ID in group 1.
    _URL_PATTERNS = (
        # Original pattern: watch URLs whose query starts with ``?v=`` and
        # shortened youtu.be links. Kept first so previously accepted URLs
        # resolve to exactly the same ID as before.
        re.compile(r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"),
        # ``v=`` appearing anywhere in the query string, and the m./music. hosts.
        re.compile(r"(?:https?://)?(?:www\.|m\.|music\.)?youtube\.com/\S*?[?&]v=([a-zA-Z0-9_-]{11})"),
        # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>.
        re.compile(r"(?:https?://)?(?:www\.|m\.)?youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/([a-zA-Z0-9_-]{11})"),
    )

    def __init__(self, url):
        """
        Store the URL and parse its video ID.

        :param url: str, a YouTube video URL
        :raises ValueError: if no video ID can be extracted from ``url``
        """
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extracts the YouTube video ID from a given URL.
        Supports regular watch URLs, shortened youtu.be URLs, and
        embed/shorts/live style URLs. Surrounding whitespace is ignored.

        :param url: str, the URL to parse
        :return: str, the 11-character video ID
        :raises ValueError: if ``url`` is not a string or matches no known form
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")
        # Pasted URLs often carry stray spaces or a trailing newline; the
        # patterns are anchored at the start, so strip before matching.
        candidate = url.strip()
        for pattern in YoutubeVideoID._URL_PATTERNS:
            match = pattern.match(candidate)
            if match:
                return match.group(1)
        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        """Return a short human-readable description of the parsed ID."""
        return f"Video ID: {self.video_id}"
def get_transcript(video_id, language='en'):
    """
    Fetch the transcript of a video and flatten it into one string.

    :param video_id: the YouTube video ID to look up
    :param language: transcript language code to request (default 'en')
    :return: the 'text' fields of all transcript entries joined by single
             spaces, or None if anything goes wrong (the error is printed)
    """
    try:
        # Request the transcript in the desired language only.
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        # Collapse the per-entry 'text' values into a single string.
        joined_text = " ".join(entry['text'] for entry in entries)
    except Exception as e:
        print(f"Error fetching transcript: {e}")
        return None
    else:
        return joined_text
# Lazily created summarization pipeline, shared by every call so the model
# is loaded once instead of once per chunk.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        # Imported here because the module's top-level imports do not bring
        # `pipeline` or `torch` into scope.
        import torch
        from transformers import pipeline

        _SUMMARIZER = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            torch_dtype=torch.bfloat16,
        )
    return _SUMMARIZER


def summarize_text(text):
    """
    Summarize a piece of text with the distilbart-cnn summarization model.

    :param text: str, the text to summarize
    :return: str summary, or None when ``text`` is empty or summarization
             fails (the error is printed)
    """
    if not text:
        # Nothing to summarize; avoid loading the model for empty input.
        return None
    try:
        output = _get_summarizer()(text)
        return output[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None
def split_text(text, chunk_size=3000):
    """
    Splits large text into smaller chunks of at most ``chunk_size`` characters.
    Each chunk ends on a full stop where one exists inside the chunk, to keep
    sentences intact; otherwise the text is cut at exactly ``chunk_size``.

    :param text: str, the text to be split (None or "" yields an empty list)
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a non-empty chunk of text
    :raises ValueError: if ``chunk_size`` is not positive
    """
    if chunk_size <= 0:
        # A non-positive size can never consume any text and would loop forever.
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []

    chunks = []
    while len(text) > chunk_size:
        # Look for the last full stop that still fits inside the chunk. The
        # search stops before index chunk_size so that including the period
        # never pushes the chunk past the maximum size.
        last_stop = text.rfind('.', 0, chunk_size)
        cut = last_stop + 1 if last_stop != -1 else chunk_size
        chunks.append(text[:cut])
        text = text[cut:]

    # Add whatever is left as the final chunk, unless it is only whitespace.
    remainder = text.strip()
    if remainder:
        chunks.append(remainder)
    return chunks
def _wants_summary(summarize):
    """Interpret the ``summarize`` argument, which may be a bool or a UI label."""
    if isinstance(summarize, str):
        # The Radio in this file's UI supplies "Summary" / "Full Transcript".
        return summarize.strip().lower() != "full transcript"
    return summarize is None or bool(summarize)


def get_result(video_url, summarize=True):
    """
    Produce either the full transcript or a summary for a YouTube video.

    :param video_url: str, URL of the YouTube video
    :param summarize: True (or the label "Summary") to return a summary;
                      False (or the label "Full Transcript") to return the
                      raw transcript text
    :return: str with the requested text, or None when the transcript could
             not be fetched or no chunk could be summarized
    :raises ValueError: if ``video_url`` is not a recognizable YouTube URL
    """
    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    transcript_text = get_transcript(yt_video.video_id)
    print(yt_video.video_id)
    print(summarize)

    if transcript_text is None:
        # get_transcript already printed the reason; nothing to show.
        return None
    if not _wants_summary(summarize):
        return transcript_text

    # Summarize chunk by chunk so each piece stays within the model's input size.
    summaries = [summarize_text(chunk) for chunk in split_text(transcript_text)]
    # summarize_text returns None for a chunk it could not process; skip those
    # rather than failing the whole request.
    full_summary = " ".join(summary for summary in summaries if summary)
    # Return a plain string: the result is shown in a text box, not rendered
    # as a notebook display object.
    return full_summary or None
def create_gradio_interface():
    """
    Assemble the Gradio Blocks UI for the summarizer and return it.

    The layout is, top to bottom: title, description with credits, URL
    textbox, output-type radio, result textbox, and a "Generate" button
    whose click calls get_result with the URL and the selected output type.

    :return: the gr.Blocks instance (not yet launched)
    """
    description_md = (
        "\n"
        "This space provides summary of youtube video urls, you can also get full transcripts if you choose so.\n"
        "### Credits:\n"
        "Created by **Arsh** – Providing a simple solution for video summarization!\n"
    )

    with gr.Blocks() as demo:
        # Header and description
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown(description_md)

        # Where the user pastes the video URL
        url_box = gr.Textbox(label="Enter YouTube URL", lines=1)

        # Summary vs. full transcript selector
        mode_radio = gr.Radio(
            choices=["Summary", "Full Transcript"],
            label="Choose Output Type",
            value="Summary",
        )

        # Where the summary or transcript is shown
        result_box = gr.Textbox(label="Result", lines=6)

        # Button that triggers processing
        generate_button = gr.Button("Generate", variant="primary")
        generate_button.click(
            fn=get_result,
            inputs=[url_box, mode_radio],
            outputs=[result_box],
        )

    return demo
# Build the interface at module level and start the Gradio server with
# share=True and show_api=True. This runs as soon as the module is executed.
demo = create_gradio_interface()
demo.launch(share=True, show_api=True)