Keane Moraes committed on
Commit
aec1dec
1 Parent(s): e9d1d9f

multithreading primary implementation works

Browse files
Files changed (2) hide show
  1. app.py +21 -8
  2. transcription.py +57 -32
app.py CHANGED
@@ -42,7 +42,7 @@ data_transcription = {"title":"", "text":""}
42
  embeddings = []
43
  text_chunks_lib = dict()
44
  user_input = None
45
- title_entry = None
46
 
47
  tldr = ""
48
  summary = ""
@@ -65,6 +65,8 @@ st.write('It provides a summary, transcription, key insights, a mind map and a Q
65
  bar = st.progress(0)
66
 
67
  def generate_word_embeddings():
 
 
68
  if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
69
  for i, segment in enumerate(segments):
70
  bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
@@ -85,12 +87,20 @@ def generate_word_embeddings():
85
 
86
 
87
  def generate_text_chunks_lib():
88
- text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
89
- input_accepted = True
 
 
 
 
 
 
90
 
91
  # For each body of text, create text chunks of a certain token size required for the transformer
 
 
92
  title_entry = text_df['title'][0]
93
- print(title_entry)
94
  for i in range(0, len(text_df)):
95
  nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
96
  # For each chunk of sentences (within the token max)
@@ -106,6 +116,7 @@ def generate_text_chunks_lib():
106
  keywords = key_engine.get_keywords(text_chunks_lib)
107
 
108
 
 
109
  # =========== SIDEBAR FOR GENERATION ===========
110
  with st.sidebar:
111
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
@@ -171,6 +182,7 @@ with st.sidebar:
171
  # Generate embeddings
172
  thread1 = Thread(target=generate_word_embeddings)
173
  thread1.start()
 
174
  # Generate text chunks
175
  thread2 = Thread(target=generate_text_chunks_lib)
176
  thread2.start()
@@ -181,20 +193,21 @@ with st.sidebar:
181
 
182
  # Generate the summary
183
  if gen_summary == 'Yes':
 
184
  se = TextSummarizer(title_entry)
185
  text_transcription = data_transcription['text']
186
  with st.spinner("Generating summary and TLDR..."):
 
187
  summary = se.generate_full_summary(text_chunks_lib)
188
  summary_list = summary.split("\n\n")
189
  tldr = se.generate_short_summary(summary_list)
190
-
191
  # Generate key takeaways
192
  kt = KeyTakeaways()
193
  with st.spinner("Generating key takeaways ... "):
194
  takeaways = kt.generate_key_takeaways(text_chunks_lib)
195
-
196
- is_completed_analysis = True
197
- bar.progress(100)
198
 
199
  if is_completed_analysis:
200
  st.header("Key Takeaways")
 
42
  embeddings = []
43
  text_chunks_lib = dict()
44
  user_input = None
45
+ title_entry = ""
46
 
47
  tldr = ""
48
  summary = ""
 
65
  bar = st.progress(0)
66
 
67
  def generate_word_embeddings():
68
+ global data
69
+
70
  if not os.path.exists(f"{folder_name}/word_embeddings.csv"):
71
  for i, segment in enumerate(segments):
72
  bar.progress(max(math.ceil((i/len(segments) * 50)), 1))
 
87
 
88
 
89
  def generate_text_chunks_lib():
90
+
91
+ global title_entry, text_chunks_lib
92
+ global keywords
93
+ global tldr
94
+ global summary
95
+ global takeaways
96
+ global input_accepted
97
+ global data_transcription
98
 
99
  # For each body of text, create text chunks of a certain token size required for the transformer
100
+ text_df = pd.DataFrame.from_dict({"title": [data_transcription["title"]], "text":[data_transcription["text"]]})
101
+ input_accepted = True
102
  title_entry = text_df['title'][0]
103
+ print("\n\nFIRST TITLE_ENTRY", title_entry)
104
  for i in range(0, len(text_df)):
105
  nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
106
  # For each chunk of sentences (within the token max)
 
116
  keywords = key_engine.get_keywords(text_chunks_lib)
117
 
118
 
119
+
120
  # =========== SIDEBAR FOR GENERATION ===========
121
  with st.sidebar:
122
  youtube_link = st.text_input(label = "Type in your Youtube link", placeholder = "", key="url")
 
182
  # Generate embeddings
183
  thread1 = Thread(target=generate_word_embeddings)
184
  thread1.start()
185
+
186
  # Generate text chunks
187
  thread2 = Thread(target=generate_text_chunks_lib)
188
  thread2.start()
 
193
 
194
  # Generate the summary
195
  if gen_summary == 'Yes':
196
+ print("\n\nTITLE ENTRY: ", title_entry)
197
  se = TextSummarizer(title_entry)
198
  text_transcription = data_transcription['text']
199
  with st.spinner("Generating summary and TLDR..."):
200
+ print("\n\nTEXT_CHNK_SUMMARY\n\n", text_chunks_lib)
201
  summary = se.generate_full_summary(text_chunks_lib)
202
  summary_list = summary.split("\n\n")
203
  tldr = se.generate_short_summary(summary_list)
204
+
205
  # Generate key takeaways
206
  kt = KeyTakeaways()
207
  with st.spinner("Generating key takeaways ... "):
208
  takeaways = kt.generate_key_takeaways(text_chunks_lib)
209
+ is_completed_analysis = True
210
+ bar.progress(100)
 
211
 
212
  if is_completed_analysis:
213
  st.header("Key Takeaways")
transcription.py CHANGED
@@ -25,6 +25,7 @@ from nltk import tokenize
25
  # For other stuff
26
  import os, re
27
  import time, math
 
28
 
29
  # USEFUL CONSTANTS
30
 
@@ -53,7 +54,7 @@ class DownloadAudio:
53
  """Returns the title of the youtube video"""
54
  return self.yt["title"]
55
 
56
- def download(self, pathname:str) -> str:
57
  """
58
  Download the audio from the youtube video and saves it to multiple .wav files
59
  in the specified folder. Returns a list of the paths to the .wav files.
@@ -93,30 +94,31 @@ class DownloadAudio:
93
  # If the total duration is less than the duration of each segment,
94
  # then just return the original file
95
  if total_byte_size < MAX_FILE_SIZE_BYTES:
96
- return FINAL_WAV_PATH
97
-
98
- # # Get the size of the wav file
99
- # channels = audio.channels
100
- # sample_width = audio.sample_width
101
- # duration_in_sec = math.ceil(len(audio) / 1000)
102
- # sample_rate = audio.frame_rate
103
- # bit_rate = sample_width * 8
104
- # wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8
105
-
106
- # # Get the length of each chunk in milliseconds and make the chunks
107
- # chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES ) / wav_file_size) #in sec
108
- # chunk_length_ms = chunk_length_in_sec * 1000
109
- # chunks = make_chunks(audio, chunk_length_ms)
110
-
111
- # # Export all of the individual chunks as wav files
112
- # chunk_names = []
113
- # for i, chunk in enumerate(chunks):
114
- # chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
115
- # output_chunk_path = f"{pathname}/{chunk_name}"
116
- # chunk_names.append(output_chunk_path)
117
- # chunk.export(f"{output_chunk_path}", format="wav")
 
118
 
119
- return FINAL_WAV_PATH
120
 
121
 
122
  class VideoTranscription:
@@ -150,18 +152,40 @@ class VideoTranscription:
150
  audio_file = DownloadAudio(self.datalink)
151
 
152
  # Get the names of the stored wav files
153
- original_file_name = audio_file.download(FOLDER_NAME)
154
- print(original_file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # Get the transcription of each audio chunk
156
- text_transcriptions = ""
157
  # for file_name in file_names:
158
  # Get the transcription
159
- chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
160
- for chunk_segment in chunk_segments:
161
- text_transcriptions += chunk_segment.text.replace("$", "\$")
 
 
162
 
163
  # Tokenize each sentence of the transcription.
164
- sentences = tokenize.sent_tokenize(text_transcriptions)
165
  segments = []
166
  for i, sentence in enumerate(sentences):
167
  segment = {
@@ -171,9 +195,10 @@ class VideoTranscription:
171
  }
172
  segments.append(segment)
173
 
 
174
  final_transcription = {
175
  "title": audio_file.get_yt_title(),
176
- "text": text_transcriptions,
177
  "segments": segments
178
  }
179
 
 
25
  # For other stuff
26
  import os, re
27
  import time, math
28
+ from threading import Thread
29
 
30
  # USEFUL CONSTANTS
31
 
 
54
  """Returns the title of the youtube video"""
55
  return self.yt["title"]
56
 
57
+ def download(self, pathname:str) -> list:
58
  """
59
  Download the audio from the youtube video and saves it to multiple .wav files
60
  in the specified folder. Returns a list of the paths to the .wav files.
 
94
  # If the total duration is less than the duration of each segment,
95
  # then just return the original file
96
  if total_byte_size < MAX_FILE_SIZE_BYTES:
97
+ return [FINAL_WAV_PATH]
98
+
99
+ # Get the size of the wav file
100
+ channels = audio.channels
101
+ sample_width = audio.sample_width
102
+ duration_in_sec = math.ceil(len(audio) / 1000)
103
+ sample_rate = audio.frame_rate
104
+ bit_rate = sample_width * 8
105
+ wav_file_size = (sample_rate * bit_rate * channels * duration_in_sec) / 8
106
+
107
+ # Get the length of each chunk in milliseconds and make the chunks
108
+ chunk_length_in_sec = math.ceil((duration_in_sec * MAX_FILE_SIZE_BYTES ) / wav_file_size) #in sec
109
+ chunk_length_ms = chunk_length_in_sec * 1000
110
+ chunks = make_chunks(audio, chunk_length_ms)
111
+
112
+ # Export all of the individual chunks as wav files
113
+ chunk_names = []
114
+ for i, chunk in enumerate(chunks):
115
+ print(f"exporting chunk {i}")
116
+ chunk_name = f"{self.YOUTUBE_VIDEO_ID}_{i}.wav"
117
+ output_chunk_path = f"{pathname}/{chunk_name}"
118
+ chunk_names.append(output_chunk_path)
119
+ chunk.export(f"{output_chunk_path}", format="wav")
120
 
121
+ return chunk_names
122
 
123
 
124
  class VideoTranscription:
 
152
  audio_file = DownloadAudio(self.datalink)
153
 
154
  # Get the names of the stored wav files
155
+ file_names = audio_file.download(FOLDER_NAME)
156
+ print("FILE NAMES", file_names)
157
+ text_transcriptions = [""] * len(file_names)
158
+
159
+ def perform_transcription(file_name, i):
160
+ print("transcribing", file_name, " for ", i)
161
+ chunk_segments, _ = self.model.transcribe(file_name, beam_size=5)
162
+ for chunk_segment in chunk_segments:
163
+ text_transcriptions[i] += chunk_segment.text.replace("$", "\$")
164
+
165
+ # Initialize the threads
166
+ threads = []
167
+ for i, file_name in enumerate(file_names):
168
+ threads.append(Thread(target=perform_transcription, args=(file_name, i)))
169
+
170
+ # Start the threads
171
+ for thread in threads:
172
+ thread.start()
173
+
174
+ # Wait for the threads to finish
175
+ for thread in threads:
176
+ thread.join()
177
+
178
  # Get the transcription of each audio chunk
 
179
  # for file_name in file_names:
180
  # Get the transcription
181
+ # chunk_segments, _ = self.model.transcribe(original_file_name, beam_size=5)
182
+ # for chunk_segment in chunk_segments:
183
+ # text_transcriptions += chunk_segment.text.replace("$", "\$")
184
+
185
+ final_text_transcription = " ".join(text_transcriptions)
186
 
187
  # Tokenize each sentence of the transcription.
188
+ sentences = tokenize.sent_tokenize(final_text_transcription)
189
  segments = []
190
  for i, sentence in enumerate(sentences):
191
  segment = {
 
195
  }
196
  segments.append(segment)
197
 
198
+
199
  final_transcription = {
200
  "title": audio_file.get_yt_title(),
201
+ "text": final_text_transcription,
202
  "segments": segments
203
  }
204