Keane Moraes committed on
Commit
aec1dec
1 Parent(s): e9d1d9f

multithreading primary implementation works

Browse files
Files changed (2) hide show
  1. app.py +21 -8
  2. transcription.py +57 -32
app.py CHANGED
@@ -42,7 +42,7 @@ data_transcription = {"title":"", "text":""}
42
  embeddings = []
43
  text_chunks_lib = dict()
44
  user_input = None
45
- title_entry = None
46
 
47
  tldr = ""
48
  summary = ""
@@ -65,6 +65,8 @@ st.write('It provides a summary, transcription, key insights, a mind map and a Q
65
  bar = st.progress(0)
66
 
67
  def generate_word_embeddings():
 
 
68
  if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
69
  for i, segment in enumerate(segments):
70
  bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
@@ -85,12 +87,20 @@ def generate_word_embeddings():
85
 
86
 
87
  def generate_text_chunks_lib():
88
- text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
89
- input_accepted = True
 
 
 
 
 
 
90
 
91
  # For each body of text, create text chunks of a certain token size required for the transformer
 
 
92
  title_entry = text_df['title'][0]
93
- print(title_entry)
94
  for i in range(0, len(text_df)):
95
  nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
96
  # For each chunk of sentences (within the token max)
@@ -106,6 +116,7 @@ def generate_text_chunks_lib():
106
  keywords = key_engine.get_keywords(text_chunks_lib)
107
 
108
 
 
109
  # =========== SIDEBAR FOR GENERATION ===========
110
  with st.sidebar:
111
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
@@ -171,6 +182,7 @@ with st.sidebar:
171
  # Generate embeddings
172
  thread1 = Thread(target=generate_word_embeddings)
173
  thread1.start()
 
174
  # Generate text chunks
175
  thread2 = Thread(target=generate_text_chunks_lib)
176
  thread2.start()
@@ -181,20 +193,21 @@ with st.sidebar:
181
 
182
  # Generate the summary
183
  if gen_summary == 'Yes':
 
184
  se = TextSummarizer(title_entry)
185
  text_transcription = data_transcription['text']
186
  with st.spinner("Generating summary and TLDR..."):
 
187
  summary = se.generate_full_summary(text_chunks_lib)
188
  summary_list = summary.split("\n\n")
189
  tldr = se.generate_short_summary(summary_list)
190
-
191
  # Generate key takeaways
192
  kt = KeyTakeaways()
193
  with st.spinner("Generating key takeaways ... "):
194
  takeaways = kt.generate_key_takeaways(text_chunks_lib)
195
-
196
- is_completed_analysis = True
197
- bar.progress(100)
198
 
199
  if is_completed_analysis:
200
  st.header("Key Takeaways")
 
42
  embeddings = []
43
  text_chunks_lib = dict()
44
  user_input = None
45
+ title_entry = ""
46
 
47
  tldr = ""
48
  summary = ""
 
65
  bar = st.progress(0)
66
 
67
  def generate_word_embeddings():
68
+ global data
69
+
70
  if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
71
  for i, segment in enumerate(segments):
72
  bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
 
87
 
88
 
89
  def generate_text_chunks_lib():
90
+
91
+ global title_entry, text_chunks_lib
92
+ global keywords
93
+ global tldr
94
+ global summary
95
+ global takeaways
96
+ global input_accepted
97
+ global data_transcription
98
 
99
  # For each body of text, create text chunks of a certain token size required for the transformer
100
+ text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
101
+ input_accepted = True
102
  title_entry = text_df['title'][0]
103
+ print("\n\nFIRST TITLE_ENTRY", title_entry)
104
  for i in range(0, len(text_df)):
105
  nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
106
  # For each chunk of sentences (within the token max)
 
116
  keywords = key_engine.get_keywords(text_chunks_lib)
117
 
118
 
119
+
120
  # =========== SIDEBAR FOR GENERATION ===========
121
  with st.sidebar:
122
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
 
182
  # Generate embeddings
183
  thread1 = Thread(target=generate_word_embeddings)
184
  thread1.start()
185
+
186
  # Generate text chunks
187
  thread2 = Thread(target=generate_text_chunks_lib)
188
  thread2.start()
 
193
 
194
  # Generate the summary
195
  if gen_summary == 'Yes':
196
+ print("\n\nTITLE ENTRY: ", title_entry)
197
  se = TextSummarizer(title_entry)
198
  text_transcription = data_transcription['text']
199
  with st.spinner("Generating summary and TLDR..."):
200
+ print("\n\nTEXT_CHNK_SUMMARY\n\n", text_chunks_lib)
201
  summary = se.generate_full_summary(text_chunks_lib)
202
  summary_list = summary.split("\n\n")
203
  tldr = se.generate_short_summary(summary_list)
204
+
205
  # Generate key takeaways
206
  kt = KeyTakeaways()
207
  with st.spinner("Generating key takeaways ... "):
208
  takeaways = kt.generate_key_takeaways(text_chunks_lib)
209
+ is_completed_analysis = True
210
+ bar.progress(100)
 
211
 
212
  if is_completed_analysis:
213
  st.header("Key Takeaways")
transcription.py CHANGED
@@ -25,6 +25,7 @@ from nltk import tokenize
25
  # For other stuff
26
  import os, re
27
  import time, math
 
28
 
29
  # USEFUL CONSTANTS
30
 
@@ -53,7 +54,7 @@ class DownloadAudio:
53
  """Returns the title of the youtube video"""
54
  return self.yt["title"]
55
 
56
- def download(self, pathname:str) -> str:
57
  """
58
  Download the audio from the youtube video and saves it to multiple .wav files
59
  in the specified folder. Returns a list of the paths to the .wav files.
@@ -93,30 +94,31 @@ class DownloadAudio:
93
  # If the total duration is less than the duration of each segment,
94
  # then just return the original file
95
  if total_byte_size < MAX_FILE_SIZE_BYTES:
96
- return FINAL_WAV_PATH
97
-
98
- # # Get the size of the wav file
99
- # channels = audio.channels
100
- # sample_width = audio.sample_width
101
- # duration_in_sec = math.ceil(len(audio) / 1000)
102
- # sample_rate = audio.frame_rate
103
- # bit_rate = sample_width * 8
104
- # wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8
105
-
106
- # # Get the length of each chunk in milliseconds and make the chunks
107
- # chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES ) / wav_file_size) #in sec
108
- # chunk_length_ms = chunk_length_in_sec * 1000
109
- # chunks = make_chunks(audio, chunk_length_ms)
110
-
111
- # # Export all of the individual chunks as wav files
112
- # chunk_names = []
113
- # for i, chunk in enumerate(chunks):
114
- # chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
115
- # output_chunk_path = f"{pathname}/{chunk_name}"
116
- # chunk_names.append(output_chunk_path)
117
- # chunk.export(f"{output_chunk_path}", format="wav")
 
118
 
119
- return FINAL_WAV_PATH
120
 
121
 
122
  class VideoTranscription:
@@ -150,18 +152,40 @@ class VideoTranscription:
150
  audio_file = DownloadAudio(self.datalink)
151
 
152
  # Get the names of the stored wav files
153
- original_file_name = audio_file.download(FOLDER_NAME)
154
- print(original_file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # Get the transcription of each audio chunk
156
- text_transcriptions = ""
157
  # for file_name in file_names:
158
  # Get the transcription
159
- chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
160
- for chunk_segment in chunk_segments:
161
- text_transcriptions += chunk_segment.text.replace("$", "\$")
 
 
162
 
163
  # Tokenize each sentence of the transcription.
164
- sentences = tokenize.sent_tokenize(text_transcriptions)
165
  segments = []
166
  for i, sentence in enumerate(sentences):
167
  segment = {
@@ -171,9 +195,10 @@ class VideoTranscription:
171
  }
172
  segments.append(segment)
173
 
 
174
  final_transcription = {
175
  "title": audio_file.get_yt_title(),
176
- "text": text_transcriptions,
177
  "segments": segments
178
  }
179
 
 
25
  # For other stuff
26
  import os, re
27
  import time, math
28
+ from threading import Thread
29
 
30
  # USEFUL CONSTANTS
31
 
 
54
  """Returns the title of the youtube video"""
55
  return self.yt["title"]
56
 
57
+ def download(self, pathname:str) -> list:
58
  """
59
  Download the audio from the youtube video and saves it to multiple .wav files
60
  in the specified folder. Returns a list of the paths to the .wav files.
 
94
  # If the total duration is less than the duration of each segment,
95
  # then just return the original file
96
  if total_byte_size < MAX_FILE_SIZE_BYTES:
97
+ return [FINAL_WAV_PATH]
98
+
99
+ # Get the size of the wav file
100
+ channels = audio.channels
101
+ sample_width = audio.sample_width
102
+ duration_in_sec = math.ceil(len(audio) / 1000)
103
+ sample_rate = audio.frame_rate
104
+ bit_rate = sample_width * 8
105
+ wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8
106
+
107
+ # Get the length of each chunk in milliseconds and make the chunks
108
+ chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES ) / wav_file_size) #in sec
109
+ chunk_length_ms = chunk_length_in_sec * 1000
110
+ chunks = make_chunks(audio, chunk_length_ms)
111
+
112
+ # Export all of the individual chunks as wav files
113
+ chunk_names = []
114
+ for i, chunk in enumerate(chunks):
115
+ print(f"exporting chunk {i}")
116
+ chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
117
+ output_chunk_path = f"{pathname}/{chunk_name}"
118
+ chunk_names.append(output_chunk_path)
119
+ chunk.export(f"{output_chunk_path}", format="wav")
120
 
121
+ return chunk_names
122
 
123
 
124
  class VideoTranscription:
 
152
  audio_file = DownloadAudio(self.datalink)
153
 
154
  # Get the names of the stored wav files
155
+ file_names = audio_file.download(FOLDER_NAME)
156
+ print("FILE NAMES", file_names)
157
+ text_transcriptions = [""] * len(file_names)
158
+
159
+ def perform_transcription(file_name, i):
160
+ print("transcribing", file_name, " for ", i)
161
+ chunk_segments, _ = self.model.transcribe(file_name, beam_size=5)
162
+ for chunk_segment in chunk_segments:
163
+ text_transcriptions[i] += chunk_segment.text.replace("$", "\$")
164
+
165
+ # Initialize the threads
166
+ threads = []
167
+ for i, file_name in enumerate(file_names):
168
+ threads.append(Thread(target=perform_transcription, args=(file_name, i)))
169
+
170
+ # Start the threads
171
+ for thread in threads:
172
+ thread.start()
173
+
174
+ # Wait for the threads to finish
175
+ for thread in threads:
176
+ thread.join()
177
+
178
  # Get the transcription of each audio chunk
 
179
  # for file_name in file_names:
180
  # Get the transcription
181
+ # chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
182
+ # for chunk_segment in chunk_segments:
183
+ # text_transcriptions += chunk_segment.text.replace("$", "\$")
184
+
185
+ final_text_transcription = " ".join(text_transcriptions)
186
 
187
  # Tokenize each sentence of the transcription.
188
+ sentences = tokenize.sent_tokenize(final_text_transcription)
189
  segments = []
190
  for i, sentence in enumerate(sentences):
191
  segment = {
 
195
  }
196
  segments.append(segment)
197
 
198
+
199
  final_transcription = {
200
  "title": audio_file.get_yt_title(),
201
+ "text": final_text_transcription,
202
  "segments": segments
203
  }
204