Harshit committed on
Commit 38b55ce
1 Parent(s): 136a1ef
Files changed (1)
  1. app.py +2 -212
app.py CHANGED
@@ -1,214 +1,4 @@
- import subprocess
- # # Run the pip install command
- subprocess.check_call(['pip', 'install', 'wordcloud'])
- subprocess.check_call(['pip', 'install', 'git+https://github.com/openai/whisper.git'])
- subprocess.check_call(['pip', 'install', 'transformers'])
- subprocess.check_call(['pip', 'install', 'imageio==2.4.1'])
- subprocess.check_call(['pip', 'install', 'moviepy'])
- subprocess.check_call(['pip', 'install', 'keybert'])
- subprocess.check_call(['pip', 'install', 'pytube'])
-
  import streamlit as st
- import os
- from wordcloud import WordCloud
- from keybert import KeyBERT
- import pandas as pd
- import matplotlib.pyplot as plt
- # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
- from moviepy.editor import *
- from tqdm import tqdm
- import os
- import math
- import nltk
- nltk.download('punkt')
- import whisper
- from transformers import pipeline
-
- from pytube import YouTube
- def process_video(path):
-     whisper_model = whisper.load_model("base")
-
-     def SpeechToTextEng(aud_path):
-         result = whisper_model.transcribe(aud_path)
-         return result["text"]
-
-     def run_range(duration):
-         time=duration/60
-         floor=math.ceil(time)
-         return floor
-
-     time_range=60
-     clip_run_range=0
-     clip_duration=0
-
-     def audio_generator(path,aud=0,vid=0):
-         if vid==1:
-             clip=VideoFileClip(path)
-             clip_duration = clip.duration
-             clip_run_range=run_range(clip_duration)
-             for i in range(clip_run_range):
-                 left=i*time_range
-                 right=left+time_range
-                 # print(left,right)
-
-                 crop_clip=clip.subclip(left,right)
-                 try:
-                     crop_clip.audio.write_audiofile("vid_to_aud"+str(i)+".mp3")
-                 except:
-                     pass
-
-         if aud==1:
-             audio_clip=AudioFileClip(path)
-             clip_duration = audio_clip.duration
-             print(clip_duration)
-             clip_run_range=run_range(clip_duration)
-             print(clip_run_range)
-             for i in range(clip_run_range):
-                 left=i*time_range
-                 right=left+time_range
-                 # print(left,right)
-                 crop_clip=audio_clip.subclip(left,right)
-                 try:
-                     crop_clip.write_audiofile("vid_to_aud"+str(i)+".mp3")
-                 except:
-                     pass
-
-
-
-
-     # YouTube video URL
-     video_url = path
-
-     # Create a YouTube object
-     yt = YouTube(video_url)
-
-     # Get the highest resolution video stream
-     stream = yt.streams.get_lowest_resolution()
-
-     # Download the video
-     stream.download(filename='meeting.mp4')
-
-     audio_generator("./meeting.mp4",vid=1)
-     transcribed_lit=[]
-     label_lit=[]
-     translated_lit=[]
-
-     for i in tqdm(range(clip_run_range)):
-         transcribed=SpeechToTextEng("./vid_to_aud"+str(i)+".mp3")
-         transcribed_lit.append(transcribed)
-         os.remove("./vid_to_aud"+str(i)+".mp3")
-
-
-     data = pd.DataFrame(
-         {'transcriptions': transcribed_lit
-         })
-
-     summarizer = pipeline("summarization")
-
-     sentiment_analyzer = pipeline("sentiment-analysis")
-
-     sumarized_lit=[]
-     sentiment_lit=[]
-     for i in tqdm(range(len(data))):
-         summarized=summarizer(data.iloc[i,0],min_length=75, max_length=300)[0]['summary_text']
-         sentiment = sentiment_analyzer(data.iloc[i,0])[0]['label']
-         sumarized_lit.append(summarized)
-         sentiment_lit.append(sentiment)
-
-     data['summary']=sumarized_lit
-     data['sentiment']=sentiment_lit
-     data.to_csv('output2.csv', index=False)
-     tot_text=""
-     for i in range(len(data)):
-         tot_text=tot_text+data.iloc[i,0]
-
-     key_model = KeyBERT('distilbert-base-nli-mean-tokens')
-     def extract_keywords(text, top_n=50):
-         keywords = key_model.extract_keywords(text, top_n=top_n)
-         return [keyword[0] for keyword in keywords]
-
-     tot_keywords=extract_keywords(tot_text)
-
-     def get_500_words(text,left,right):
-         words = text.split()
-         first_500_words = ' '.join(words[left:right])
-         return first_500_words
-
-     def summarize_text(text):
-         chunk_size = 500 # Number of words per chunk
-         total_summary = "" # Total summary
-
-         words = text.split() # Split the text into individual words
-         num_chunks = len(words) // chunk_size + 1 # Calculate the number of chunks
-
-         for i in tqdm(range(num_chunks)):
-             start_index = i * chunk_size
-             end_index = start_index + chunk_size
-             chunk = " ".join(words[start_index:end_index])
-
-             # Pass the chunk to the summarizer (replace with your summarization code)
-             chunk_summary = summarizer(chunk,min_length=75, max_length=200)[0]['summary_text']
-             # print(chunk_summary)
-             total_summary += chunk_summary
-
-         return total_summary
-
-     tot_summary=summarize_text(tot_text)
-     return tot_text,tot_summary,tot_keywords
-
-
-
-
- # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- def generate_word_cloud(text):
-     # Create a WordCloud object
-     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
-
-     # Display the generated word cloud
-     fig, ax = plt.subplots(figsize=(10, 5))
-
-     # Plot the word cloud on the axis
-     ax.imshow(wordcloud, interpolation='bilinear')
-     ax.axis('off')
-     st.pyplot(fig)
-
-
- def main():
-     st.title("Meeting Summary Web App")
-
-     # YouTube link input
-     youtube_url = st.text_input("Enter the YouTube video link")
-
-     if st.button("Process Video"):
-         if youtube_url:
-             # Process the YouTube video
-             tot_text, tot_summary, tot_keywords = process_video(youtube_url)
-
-             # Display the output
-             if os.path.exists("output2.csv"):
-                 output_df = pd.read_csv("output2.csv")
-                 st.subheader("Transcriptions:")
-                 st.write(output_df["transcriptions"])
-
-                 st.subheader("Labels:")
-                 st.write(output_df["labels"])
-
-                 st.subheader("Word Cloud:")
-                 generate_word_cloud(output_df["transcriptions"].str.cat(sep=' '))
-
-                 st.subheader("tot_text:")
-                 st.write(tot_text)
-
-                 st.subheader("tot_summary:")
-                 st.write(tot_summary)
-
-                 st.subheader("tot_keywords:")
-                 st.write(tot_keywords)
-
-             else:
-                 st.write("No output file found.")
 
- if __name__ == "__main__":
-     main()
+ x = st.slider('Select a value')
+ st.write(x, 'squared is', x * x)