import re

import streamlit as st
import torch
# Transcript fetching
from youtube_transcript_api import YouTubeTranscriptApi
# Summarization model and tokenizer
from transformers import pipeline, AutoTokenizer


def fetch_transcript(video_url):
    """Fetch and join the transcript text for a YouTube video, or return None on failure."""
    try:
        # Extract the video ID from the URL
        video_id = extract_video_id(video_url)
        if video_id is None:
            return None
        # Fetch the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Join the transcript entries into a single text block
        return "\n".join(entry['text'] for entry in transcript)
    except Exception:
        # Transcripts may be disabled or unavailable for this video;
        # returning None (not the error string) lets callers test truthiness safely
        return None

def clean_transcript(transcript):
    # Remove bracketed non-speech annotations (e.g., [Laughter], [Music])
    # and inline timestamps such as [0:01:23]
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # (Optional) Spelling and grammar correction could be inserted here,
    # e.g., with a dedicated tool such as language_tool_python; plain
    # tokenization (NLTK/spaCy) alone does not correct anything.

    # Normalize punctuation and formatting
    transcript = transcript.replace('\n', ' ')  # Remove line breaks
    transcript = re.sub(r'\s+', ' ', transcript)  # Collapse extra whitespace

    # Handle speaker identification (if present)
    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)

    # Remove filler words and phrases, matching whole words only so that
    # e.g. "likely" is not mangled by removing "like"
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    for phrase in filler_words:
        transcript = re.sub(r'\b' + re.escape(phrase) + r'\b', '', transcript)
    transcript = re.sub(r'\s+', ' ', transcript)  # Re-collapse whitespace

    # Replace common contractions with their expanded forms.
    # Note: these are crude textual substitutions; "'s" also matches
    # possessives ("John's" -> "John is"), and "'d" can also mean "had".
    transcript = transcript.replace("won't", "will not")
    transcript = transcript.replace("can't", "cannot")
    transcript = transcript.replace("n't", " not")
    transcript = transcript.replace("'ll", " will")
    transcript = transcript.replace("'ve", " have")
    transcript = transcript.replace("'re", " are")
    transcript = transcript.replace("'d", " would")
    transcript = transcript.replace("'s", " is")

    return transcript.strip()  # Trim leading/trailing whitespace

def extract_video_id(url):
    """Extracts the 11-character YouTube video ID from a watch URL or a youtu.be short link."""
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    if match:
        return match.group(1)
    return None
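
# Illustrative usage (the video ID below is arbitrary):
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # -> "dQw4w9WgXcQ"
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 # -> "dQw4w9WgXcQ"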


def summarize_transcript(text, llama_pipeline):
    def summarize_text(llama_pipeline, system_prompt, text):
        # Format the input text with Llama-2 chat special tokens
        text = f"""
        <s>[INST] <<SYS>>
        {system_prompt}
        <</SYS>>
        {text}[/INST]
        """
        # Generate sequences using the pipeline with its configured parameters
        sequences = llama_pipeline(text)
        # Extract the generated text from the sequences
        generated_text = sequences[0]["generated_text"]
        # Keep only the completion after the instruction marker
        generated_text = generated_text[generated_text.find('[/INST]') + len('[/INST]'):]
        return generated_text.strip()

    # Maximum number of characters summarized per iteration
    input_len = 1000
    # Iteratively fold the running summary into the remaining text until the
    # entire transcript has been consumed
    while True:
        # Summarize the first 'input_len' characters of the remaining text
        summary = summarize_text(llama_pipeline, "", "Summarize the following: " + text[:input_len])
        # Stop only once the whole remaining text fit into this chunk;
        # returning earlier would silently drop the unprocessed remainder
        if len(text) <= input_len:
            return summary
        # Carry the summary forward, prepended to the unread remainder
        text = summary + " " + text[input_len:]
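
# Note: the chunking above is character-based rather than token-based. As a
# rough rule of thumb, English text runs about 4 characters per token, so a
# 1000-character chunk stays well inside Llama-2's 4096-token context window.
# A more precise variant would split on token counts using the tokenizer.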

# Load the model and tokenizer (cached across Streamlit reruns)
@st.cache_resource()
def load_model():
    # Llama-2 7B chat model (gated on the Hugging Face Hub)
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipeline_llama2 = pipeline(
        "text-generation",  # task
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=512,  # bound the length of each generated summary (tunable)
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return pipeline_llama2
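
# Note: loading the 7B model in bfloat16 needs roughly 14 GB of accelerator
# memory (7B parameters x 2 bytes each); with device_map="auto", layers that
# do not fit on the GPU are offloaded to CPU at the cost of slower inference.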

def main():
    st.title("YouTube Video Preview")

    with st.spinner('Loading Llama-2 checkpoint shards...'):
        pipeline_llama2 = load_model()
    st.success('Done!')

    # Input field for the YouTube video link
    youtube_url = st.text_input("Paste YouTube Video Link:")

    # Extract the video ID from the URL
    video_id = extract_video_id(youtube_url)

    # Display the video preview if a video ID was found
    if video_id:
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        st.video(video_url)
        video_transcript = fetch_transcript(video_url)
        if video_transcript:
            video_transcript = clean_transcript(video_transcript)
            # Display the transcript and summary side by side
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Transcript:")
                st.text_area("Transcript", video_transcript, height=400,
                             label_visibility="collapsed")
            with col2:
                st.subheader("Summary:")
                video_summary = summarize_transcript(video_transcript, pipeline_llama2)
                st.text_area("Summary", video_summary, height=400,
                             label_visibility="collapsed")
        else:
            st.error("Failed to fetch the video transcript. The video may not "
                     "have captions, or transcripts may be disabled for it.")
    elif youtube_url:
        st.warning("Invalid YouTube Video Link")

if __name__ == "__main__":
    main()
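
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Note: meta-llama/Llama-2-7b-chat-hf is a gated model on the Hugging Face Hub;
# you must request access and authenticate (e.g., via `huggingface-cli login`)
# before the pipeline can download the weights.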