File size: 4,630 Bytes
3dfb32c
 
 
 
 
 
bb29c0c
3dfb32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53fda21
 
3dfb32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698fffa
3dfb32c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# imports

import os

import requests
from IPython.display import Markdown
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
import re

class YoutubeVideoID:
    """Parse a YouTube URL and expose the 11-character video ID it contains.

    After construction, ``url`` holds the URL exactly as it was passed in and
    ``video_id`` holds the extracted ID.

    :raises ValueError: from ``__init__`` when no video ID can be extracted
    """

    # Pattern tried first. It is kept verbatim so that every URL that was
    # accepted before this fallback existed still yields the same ID.
    _PRIMARY_PATTERN = re.compile(
        r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
    )

    # Fallback for URL shapes the primary pattern rejects: any subdomain
    # (m., music., ...), the youtube-nocookie.com host, single-segment paths
    # (/embed/ID, /shorts/ID, /live/ID, /v/ID) and a ``v=`` parameter that is
    # not the first one in the query string.
    _FALLBACK_PATTERN = re.compile(
        r"(?:https?://)?(?:[\w-]+\.)*"
        r"(?:youtube(?:-nocookie)?\.com/(?:(?:embed|shorts|live|v)/|\S*?[?&]v=)"
        r"|youtu\.be/)"
        r"([a-zA-Z0-9_-]{11})"
    )

    def __init__(self, url):
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extracts the YouTube video ID from a given URL.

        Supports watch URLs, shortened youtu.be URLs, and embed/shorts/live
        URLs, with or without a scheme or subdomain. Leading and trailing
        whitespace (e.g. from a pasted value) is ignored.

        :param url: str, the URL to parse
        :return: str, the 11-character video ID
        :raises ValueError: if ``url`` is not a string or contains no video ID
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        candidate = url.strip()
        for pattern in (self._PRIMARY_PATTERN, self._FALLBACK_PATTERN):
            # .match() anchors at the start, so the host must lead the string.
            match = pattern.match(candidate)
            if match:
                return match.group(1)

        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        return f"Video ID: {self.video_id}"

def get_transcript(video_id, language='en'):
    """
    Fetch the transcript of a video and flatten it into one string.

    :param video_id: video ID handed to ``YouTubeTranscriptApi.get_transcript``
    :param language: language code to request (default ``'en'``)
    :return: str with every entry's ``'text'`` field joined by single spaces,
             or None if any exception was raised (the error is printed)
    """
    try:
        # Only the requested language is asked for; there is no fallback list.
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        return " ".join(entry['text'] for entry in entries)
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error fetching transcript: {e}")
        return None



# Lazily-built summarization pipeline, shared by every summarize_text() call
# so the model is loaded once instead of once per chunk.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        # Imported here because the module's import block does not bring
        # `torch` or `pipeline` into scope.
        import torch
        from transformers import pipeline

        _SUMMARIZER = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            torch_dtype=torch.bfloat16,
        )
    return _SUMMARIZER


def summarize_text(text):
    """
    Summarize a piece of text with the distilbart-cnn-12-6 model.

    :param text: str, the text to summarize
    :return: str summary, or None when ``text`` is empty / not a string or
             when loading or running the model raised (the error is printed)
    """
    # Nothing to summarize: skip loading the model altogether.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        summarizer = _get_summarizer()
        # truncation=True keeps an over-long chunk from exceeding the model's
        # maximum input length instead of failing inside the model.
        output = summarizer(text, truncation=True)
        return output[0]['summary_text']
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error summarizing text: {e}")
        return None

def split_text(text, chunk_size=3000):
    """
    Splits large text into smaller chunks based on the given chunk size.
    Each chunk ends on the last full stop that fits inside the chunk; when a
    window of ``chunk_size`` characters contains no full stop, the text is cut
    at exactly ``chunk_size`` characters.

    Intermediate chunks are kept verbatim (they may start with a space); the
    final chunk is stripped and omitted if nothing but whitespace remains.

    :param text: str, the text to be split (empty or None yields an empty list)
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of at most chunk_size characters
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        # A non-positive size could never make progress through the text.
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []

    chunks = []
    while len(text) > chunk_size:
        # Search only the first chunk_size characters so that a chunk which
        # includes its closing period never exceeds chunk_size.
        last_period = text.rfind('.', 0, chunk_size)
        cut = last_period + 1 if last_period != -1 else chunk_size
        chunks.append(text[:cut])
        text = text[cut:]

    # Whatever is left fits in one chunk; drop it if it is only whitespace.
    remainder = text.strip()
    if remainder:
        chunks.append(remainder)

    return chunks


def get_result(video_url, summarize=True):
    """
    Produce the text shown for a video: its summary or its full transcript.

    :param video_url: str, YouTube URL of the video
    :param summarize: either a bool, or the label chosen in the UI radio
                      ("Summary" / "Full Transcript"). Any string other than
                      "Full Transcript" (case-insensitive) requests a summary.
    :return: str, the summary, the full transcript, or a short message when
             the transcript or the summary could not be produced
    :raises ValueError: if ``video_url`` is not a recognizable YouTube URL
    """
    # The UI wires a Radio component to this parameter, so the value arrives
    # as the selected label rather than as a bool.
    if isinstance(summarize, str):
        want_summary = summarize.strip().lower() != "full transcript"
    else:
        want_summary = summarize is None or bool(summarize)

    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    transcript_text = get_transcript(yt_video.video_id)
    print(yt_video.video_id)
    print(summarize)

    # get_transcript signals failure with None; report it instead of crashing.
    if not transcript_text:
        return "Transcript could not be retrieved for this video."
    if not want_summary:
        return transcript_text

    # Summarize chunk by chunk and skip chunks whose summary failed (None),
    # which would otherwise break the join below.
    chunk_summaries = [summarize_text(chunk) for chunk in split_text(transcript_text)]
    summaries = [summary for summary in chunk_summaries if summary]
    if not summaries:
        return "Summary could not be generated for this video."

    # Return a plain string: the result is displayed in a text box.
    return " ".join(summaries)


def create_gradio_interface():
    """
    Assemble the Gradio Blocks UI for the summarizer and return it.

    Layout, top to bottom: title and description, URL text box, output-type
    radio, result text box, and a "Generate" button whose click calls
    ``get_result`` with the URL and the selected radio value.

    :return: the ``gr.Blocks`` instance (not launched)
    """
    with gr.Blocks() as app:
        # Header and description
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown("""
        This space provides summary of youtube video urls, you can also get full transcripts if you choose so.
        ### Credits:
        Created by **Arsh** – Providing a simple solution for video summarization!
        """)

        # Inputs: the video URL and the kind of output wanted
        url_box = gr.Textbox(label="Enter YouTube URL", lines=1)
        mode_radio = gr.Radio(
            choices=["Summary", "Full Transcript"],
            label="Choose Output Type",
            value="Summary",
        )

        # Output: summary or full transcript text
        result_box = gr.Textbox(label="Result", lines=6)

        # Trigger: run get_result on click and show what it returns
        generate_btn = gr.Button("Generate", variant="primary")
        generate_btn.click(
            fn=get_result,
            inputs=[url_box, mode_radio],
            outputs=[result_box],
        )

    return app

# Build the UI once at import time and start serving it immediately.
# `demo` stays a module-level name; share=True and show_api=True are passed
# straight through to Gradio's launch().
demo = create_gradio_interface()
demo.launch(share=True, show_api=True)