frknayk committed on
Commit
8b6bb87
1 Parent(s): 148d1c2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ # Transcript
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ import os
5
+ # Summarization
6
+ from transformers import (
7
+ pipeline,
8
+ AutoModelForSpeechSeq2Seq,
9
+ AutoProcessor,
10
+ AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ BitsAndBytesConfig,
13
+ )
14
+ import torch
15
+ import re
16
+
17
+
18
def fetch_transcript(video_url):
    """Fetch the transcript of a YouTube video as one newline-joined string.

    Args:
        video_url: A YouTube URL that carries the video ID in a ``v=``
            parameter (e.g. ``https://www.youtube.com/watch?v=<id>``).

    Returns:
        The ``'text'`` field of every transcript entry joined with newlines,
        or an empty string when the URL has no ``v=`` parameter or the
        transcript could not be fetched.
    """
    # Keep only the ID characters so trailing parameters such as "&t=10s"
    # are not sent to the transcript API as part of the video ID.
    id_match = re.search(r"(?<=v=)[\w-]+", video_url)
    if id_match is None:
        return ""

    try:
        transcript = YouTubeTranscriptApi.get_transcript(id_match.group(0))
        return "\n".join(entry['text'] for entry in transcript)
    except Exception as e:
        # Top-level boundary: report the failure and hand back an empty
        # transcript. Returning the error text itself would make callers
        # treat the message as transcript content.
        print(f"Failed to fetch transcript: {e}")
        return ""
29
+
30
def clean_transcript(transcript):
    """Normalize a raw transcript string before it is summarized.

    Steps, in order:
      1. Drop bracketed annotations such as ``[Music]`` or ``[00:01:02]``.
      2. Remove filler phrases, matched as whole words only.
      3. Expand common English contractions.
      4. Collapse every whitespace run (including line breaks) to one space.

    Args:
        transcript: The raw transcript text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Non-speech elements and timestamps are both written in square
    # brackets, so a single pass removes them.
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # Word boundaries keep fillers from being cut out of longer words
    # (e.g. "likely", "unlike", "resort of").
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    filler_pattern = r'\b(?:' + '|'.join(map(re.escape, filler_words)) + r')\b'
    transcript = re.sub(filler_pattern, '', transcript)

    # Order matters: the irregular forms must be rewritten before the
    # generic "n't" rule sees them.
    # NOTE(review): "'s" and "'d" are ambiguous (possessive / "had"); the
    # expansions below keep the original heuristic.
    contractions = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'ll", " will"),
        ("'ve", " have"),
        ("'re", " are"),
        ("'d", " would"),
        ("'s", " is"),
    )
    for short_form, expanded in contractions:
        transcript = transcript.replace(short_form, expanded)

    # Collapse whitespace last so the gaps left by the removals above do
    # not survive as double spaces.
    transcript = re.sub(r'\s+', ' ', transcript)
    return transcript.strip()
65
+
66
def extract_video_id(url):
    """Extract the YouTube video ID from a URL.

    Recognized forms:
      * a ``v=`` query parameter (``.../watch?v=<id>``, also a bare ``v=<id>``)
      * short links (``youtu.be/<id>``)
      * path-style links (``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``)

    Args:
        url: The text to search; may be empty.

    Returns:
        The video ID string, or ``None`` when no ID is found.
    """
    id_patterns = (
        # Anchor "v=" to the start of a parameter so that names merely
        # ending in "v" (e.g. "rev=") are not mistaken for the video ID.
        r"(?:^|[?&])v=([\w-]+)",
        r"youtu\.be/([\w-]+)",
        r"/(?:embed|shorts|live)/([\w-]+)",
    )
    for pattern in id_patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
73
+
74
+
75
def summarize_transcript(text, llama_pipeline, input_len=1000):
    """Summarize a text of any length by folding it chunk by chunk.

    Each round summarizes the first ``input_len`` characters, then puts that
    summary in front of the not-yet-read remainder. The loop ends once a
    round has consumed the whole remaining text, so every part of the input
    contributes to the final summary.

    Args:
        text: The text to summarize.
        llama_pipeline: Callable taking a prompt string and returning a
            sequence whose first item has a ``"generated_text"`` entry.
        input_len: Maximum number of characters of ``text`` sent to the
            model per round.

    Returns:
        The summary produced by the last round, stripped of surrounding
        whitespace; an empty string when ``text`` is blank.

    Raises:
        ValueError: If ``input_len`` is smaller than 1.
    """
    if input_len < 1:
        raise ValueError("input_len must be a positive integer")

    def summarize_text(llama_pipeline, system_prompt, text):
        # Wrap the request in the [INST] / <<SYS>> chat markup.
        prompt = (
            "\n<s>[INST] <<SYS>>\n"
            f"{system_prompt}\n"
            "<</SYS>>\n"
            f"{text}[/INST]\n"
        )
        sequences = llama_pipeline(prompt)
        generated_text = sequences[0]["generated_text"]
        # Keep only what follows the instruction marker. If the marker is
        # absent, return the text untouched instead of slicing at a bogus
        # offset.
        _, marker, reply = generated_text.partition('[/INST]')
        return reply if marker else generated_text

    if not text.strip():
        return ""

    # Cap on how much of a summary is carried into the next round. Keeping
    # it (plus one separator) below input_len guarantees the text shrinks
    # every round, so the loop always terminates.
    carry_limit = (input_len - 1) // 2

    while True:
        # Print the current length of the text
        print(len(text))
        summary = summarize_text(
            llama_pipeline, "", "Summarize the following: " + text[:input_len]
        ).strip()

        remaining = text[input_len:]
        if not remaining:
            return summary

        carried = summary[:carry_limit]
        text = f"{carried} {remaining}" if carried else remaining
104
+
105
+ # Load the model and tokenizer
106
@st.cache_resource()
def load_model():
    """Build the Llama-2 chat text-generation pipeline used for summaries.

    Decorated with ``st.cache_resource`` so that, per Streamlit's documented
    behavior, the returned pipeline is reused rather than rebuilt.

    Returns:
        The ``transformers`` text-generation pipeline for
        ``meta-llama/Llama-2-7b-chat-hf``.
    """
    checkpoint = "meta-llama/Llama-2-7b-chat-hf"
    llama_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Sampling options forwarded to the pipeline.
    sampling_options = {
        "do_sample": True,
        "top_k": 10,
        "num_return_sequences": 1,
        "eos_token_id": llama_tokenizer.eos_token_id,
    }

    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run locally — confirm it is actually required here.
    return pipeline(
        "text-generation",
        model=checkpoint,
        tokenizer=llama_tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        **sampling_options,
    )
125
+
126
def main():
    """Render the Streamlit page: video preview, transcript and summary.

    Loads the summarization pipeline, asks for a YouTube link, and, when a
    video ID can be extracted, shows the video together with its cleaned
    transcript and the generated summary in two columns.
    """
    st.title("YouTube Video Preview")

    with st.spinner('Loading checkpoint shards of LLAMA-2'):
        pipeline_llama2 = load_model()
    st.success('Done!')

    youtube_url = st.text_input("Paste YouTube Video Link:")
    video_id = extract_video_id(youtube_url)

    # Nothing to preview: warn only if the user actually typed something.
    if not video_id:
        if youtube_url:
            st.warning("Invalid YouTube Video Link")
        return

    # Rebuild a canonical watch URL from the extracted ID.
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    st.video(video_url, format='video/mp4')

    raw_transcript = fetch_transcript(video_url)
    video_transcript = clean_transcript(raw_transcript)
    if not video_transcript:
        st.error("Failed to fetch video transcript. Please check the video ID or try again later.")
        return

    # Transcript on the left, summary on the right.
    transcript_col, summary_col = st.columns(2)

    with transcript_col:
        st.subheader("Transcript:")
        st.text_area(" ", video_transcript, height=400)

    with summary_col:
        st.subheader("Summary:")
        video_summary = summarize_transcript(video_transcript, pipeline_llama2)
        st.text_area(" ", video_summary, height=400)
        print(f"Summary:{video_summary}")


if __name__ == "__main__":
    main()