Bookie-Whisper-capitalised-Macedonian-ASR

Sleeping

App Files Files Community

Porjaz committed on Oct 22, 2024

Commit

cf075ed

verified ·

1 Parent(s): eb43650

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -12

app.py CHANGED Viewed

@@ -56,18 +56,14 @@ def return_prediction_w2v2(mic=None, file=None, device=device):
 @spaces.GPU(duration=30)
-def return_prediction_whisper(mic=None, file=None, device=device):
     if mic is not None:
         waveform, sr = librosa.load(mic, sr=16000)
-        waveform = waveform[:60*sr]
-        whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
-    elif file is not None:
-        waveform, sr = librosa.load(file, sr=16000)
-        waveform = waveform[:60*sr]
         whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
     else:
-        return "You must either provide a mic recording or a file"
     recap_result = recap_sentence(whisper_result[0])
     # If the letter after punct is small, recap it
@@ -79,6 +75,39 @@ def return_prediction_whisper(mic=None, file=None, device=device):
     return recap_result
 def return_prediction_compare(mic=None, file=None, device=device):
     # pipe_whisper.model.to(device)
     # mms_model.to(device)
@@ -132,7 +161,8 @@ def return_prediction_compare(mic=None, file=None, device=device):
 # Create a partial function with the device pre-applied
-return_prediction_whisper_with_device = partial(return_prediction_whisper, device=device)
 return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
 # Load the ASR models
@@ -153,7 +183,7 @@ recap_model.eval()
 mic_transcribe_whisper = gr.Interface(
-    fn=return_prediction_whisper_with_device,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
@@ -161,11 +191,11 @@ mic_transcribe_whisper = gr.Interface(
 )
 file_transcribe_whisper = gr.Interface(
-    fn=return_prediction_whisper_with_device,
     inputs=gr.Audio(sources="upload", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
-    live=False
 )
 mic_transcribe_w2v2 = gr.Interface(

 @spaces.GPU(duration=30)
+def return_prediction_whisper_mic(mic=None, device=device):
     if mic is not None:
         waveform, sr = librosa.load(mic, sr=16000)
+        waveform = waveform[:30*sr]
         whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
     else:
+        return "You must provide a mic recording"
     recap_result = recap_sentence(whisper_result[0])
     # If the letter after punct is small, recap it
     return recap_result
+@spaces.GPU(duration=120)
+def return_prediction_whisper_file(file, device=device):
+    waveform, sr = librosa.load(file, sr=16000)
+    waveform = waveform[:3600*sr]
+    whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)
+    recap_result = ""
+    prev_segment = ""
+    prev_segment_len = 0
+    segment_counter = 0
+    for segment in whisper_result:
+        segment_counter += 1
+        if prev_segment == "":
+            recap_segment= recap_sentence(segment[0])
+        else:
+            prev_segment_len = len(prev_segment.split())
+            recap_segment = recap_sentence(prev_segment + " " + segment[0])
+        # remove prev_segment from the beginning of the recap_result
+        recap_segment = recap_segment.split()
+        recap_segment = recap_segment[prev_segment_len:]
+        recap_segment = " ".join(recap_segment)
+        prev_segment = segment[0]
+        recap_result += recap_segment + " "
+            # If the letter after punct is small, recap it
+        for i, letter in enumerate(recap_result):
+            if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
+                recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+        yield recap_result
 def return_prediction_compare(mic=None, file=None, device=device):
     # pipe_whisper.model.to(device)
     # mms_model.to(device)
 # Create a partial function with the device pre-applied
+return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)
+return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)
 return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
 # Load the ASR models
 mic_transcribe_whisper = gr.Interface(
+    fn=return_prediction_whisper_mic_with_device,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
 )
 file_transcribe_whisper = gr.Interface(
+    fn=return_prediction_whisper_file_with_device,
     inputs=gr.Audio(sources="upload", type="filepath"),
     outputs=gr.Textbox(),
     allow_flagging="never",
+    live=True
 )
 mic_transcribe_w2v2 = gr.Interface(