Harshit committed on
Commit
90ddd8e
1 Parent(s): 24f4d92

first commit

Browse files
Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import sys

# Third-party packages are installed at start-up, one `pip install` per
# requirement and in this exact order (the imageio pin is installed before
# moviepy, as in the original script).
_REQUIRED_PACKAGES = (
    'wordcloud',
    'git+https://github.com/openai/whisper.git',
    'transformers',
    'imageio==2.4.1',
    'moviepy',
    'keybert',
    'pytube',
)

# Invoke pip through the interpreter that is running this script so the
# packages land in the same environment the imports below are resolved from;
# a bare `pip` on PATH may belong to a different Python installation.
for _package in _REQUIRED_PACKAGES:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', _package])
11
+
12
+ import streamlit as st
13
+ import os
14
+ from wordcloud import WordCloud
15
+ from keybert import KeyBERT
16
+ import pandas as pd
17
+ import matplotlib.pyplot as plt
18
+ # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
19
+
20
+
21
+ from moviepy.editor import *
22
+ from tqdm import tqdm
23
+ import os
24
+ import math
25
+ import nltk
26
+ nltk.download('punkt')
27
+ import whisper
28
+ from transformers import pipeline
29
+
30
+ from pytube import YouTube
31
def process_video(path):
    """Download a YouTube video, transcribe it and summarise the transcript.

    The video is saved as ``meeting.mp4``, its audio is cut into one-minute
    ``vid_to_aud<i>.mp3`` slices, each slice is transcribed with the Whisper
    "base" model and then deleted.  Per-slice transcription, summary and
    sentiment label are written to ``output2.csv`` (columns
    ``transcriptions``, ``summary``, ``sentiment``).

    Args:
        path: URL of the YouTube video to process.

    Returns:
        A tuple ``(tot_text, tot_summary, tot_keywords)``: the concatenated
        transcript, the concatenated summaries of its 500-word chunks, and
        the list of keywords extracted from the transcript (up to 50
        requested from KeyBERT).
    """
    chunk_seconds = 60       # length of each audio slice handed to Whisper
    words_per_chunk = 500    # words per piece when summarising the transcript

    whisper_model = whisper.load_model("base")

    def speech_to_text(audio_path):
        """Return the Whisper transcription text for one audio file."""
        return whisper_model.transcribe(audio_path)["text"]

    def split_audio(media_path, is_video):
        """Write the audio of *media_path* as fixed-length mp3 slices.

        Returns the paths of the slices that were actually written, in
        order.  Returning the list (instead of assigning to variables of
        the enclosing function, which only created shadowing locals) is
        what lets the caller know how many slices exist.
        """
        clip = VideoFileClip(media_path) if is_video else AudioFileClip(media_path)
        written = []
        try:
            duration = clip.duration
            for index in range(math.ceil(duration / chunk_seconds)):
                start = index * chunk_seconds
                # Clamp so the final slice does not extend past the media.
                end = min(start + chunk_seconds, duration)
                chunk_path = "vid_to_aud" + str(index) + ".mp3"
                try:
                    piece = clip.subclip(start, end)
                    audio = piece.audio if is_video else piece
                    audio.write_audiofile(chunk_path)
                except Exception as exc:
                    # Best effort, as before: skip a slice that cannot be
                    # exported, but say so instead of failing silently.
                    print("Skipping audio chunk " + str(index) + ": " + str(exc))
                    continue
                written.append(chunk_path)
        finally:
            # Release the underlying reader for the media file.
            clip.close()
        return written

    def summarize_text(text):
        """Summarise *text* piecewise and concatenate the partial summaries."""
        words = text.split()
        partial_summaries = []
        # Stepping by the chunk size never yields an empty trailing chunk
        # (the old `len // size + 1` did for empty text and exact multiples).
        for start in tqdm(range(0, len(words), words_per_chunk)):
            chunk = " ".join(words[start:start + words_per_chunk])
            chunk_summary = summarizer(chunk, min_length=75, max_length=200)[0]['summary_text']
            partial_summaries.append(chunk_summary)
        return "".join(partial_summaries)

    # Fetch the lowest-resolution stream: only the audio track is used.
    stream = YouTube(path).streams.get_lowest_resolution()
    stream.download(filename='meeting.mp4')

    chunk_paths = split_audio("./meeting.mp4", is_video=True)

    transcribed_lit = []
    for chunk_path in tqdm(chunk_paths):
        try:
            transcribed_lit.append(speech_to_text(chunk_path))
        finally:
            # The slice is temporary; remove it even if transcription fails.
            os.remove(chunk_path)

    data = pd.DataFrame({'transcriptions': transcribed_lit})

    summarizer = pipeline("summarization")
    sentiment_analyzer = pipeline("sentiment-analysis")

    sumarized_lit = []
    sentiment_lit = []
    for text in tqdm(transcribed_lit):
        sumarized_lit.append(
            summarizer(text, min_length=75, max_length=300)[0]['summary_text']
        )
        sentiment_lit.append(sentiment_analyzer(text)[0]['label'])

    data['summary'] = sumarized_lit
    data['sentiment'] = sentiment_lit
    data.to_csv('output2.csv', index=False)

    # Slices are concatenated without a separator, as before.
    tot_text = "".join(transcribed_lit)

    key_model = KeyBERT('distilbert-base-nli-mean-tokens')
    # extract_keywords yields (keyword, score) pairs; keep the keyword only.
    tot_keywords = [pair[0] for pair in key_model.extract_keywords(tot_text, top_n=50)]

    tot_summary = summarize_text(tot_text)
    return tot_text, tot_summary, tot_keywords
161
+
162
+
163
+
164
+
165
+ # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
166
def generate_word_cloud(text):
    """Build a word cloud from *text* and show it in the Streamlit page.

    Args:
        text: The text whose words are drawn in the cloud.
    """
    # 800x400 cloud on a white background.
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(text)

    # Draw the cloud on a dedicated 10x5 figure with the axes hidden.
    figure, axes = plt.subplots(figsize=(10, 5))
    axes.imshow(cloud_image, interpolation='bilinear')
    axes.axis('off')

    # Hand the finished figure to Streamlit for rendering.
    st.pyplot(figure)
177
+
178
+
179
def main():
    """Run the Streamlit page: take a YouTube link, process it, show results.

    When the "Process Video" button is pressed with a non-empty link, the
    video is passed to ``process_video`` and the contents of the
    ``output2.csv`` file it writes are displayed together with the full
    transcript, its summary and its keywords.
    """
    st.title("Meeting Summary Web App")

    # YouTube link input
    youtube_url = st.text_input("Enter the YouTube video link")

    if st.button("Process Video"):
        if youtube_url:
            # Process the YouTube video
            tot_text, tot_summary, tot_keywords = process_video(youtube_url)

            # Display the output
            if os.path.exists("output2.csv"):
                output_df = pd.read_csv("output2.csv")
                st.subheader("Transcriptions:")
                st.write(output_df["transcriptions"])

                st.subheader("Labels:")
                # The CSV written by process_video has the columns
                # 'transcriptions', 'summary' and 'sentiment'; there is no
                # 'labels' column, so the sentiment labels are shown here.
                st.write(output_df["sentiment"])

                st.subheader("Word Cloud:")
                generate_word_cloud(output_df["transcriptions"].str.cat(sep=' '))

                st.subheader("tot_text:")
                st.write(tot_text)

                st.subheader("tot_summary:")
                st.write(tot_summary)

                st.subheader("tot_keywords:")
                st.write(tot_keywords)

            else:
                st.write("No output file found.")


if __name__ == "__main__":
    main()