romsyflux committed on
Commit
5a1ff9c
1 Parent(s): e4ba58a

Updated match logic

Browse files
Files changed (1) hide show
  1. app.py +20 -10
app.py CHANGED
@@ -6,6 +6,7 @@ from pyannote.audio import Pipeline
6
  import os
7
  from dotenv import load_dotenv
8
  import plotly.graph_objects as go
 
9
 
10
  load_dotenv()
11
 
@@ -88,7 +89,7 @@ def transcribe(sr, data):
88
  processed_data = np.array(data).astype(np.float32) / 32767.0
89
 
90
  # results from the pipeline
91
- transcription_res = pipe({"sampling_rate": sr, "raw": processed_data})["text"]
92
 
93
  return transcription_res
94
 
@@ -104,17 +105,26 @@ def transcribe_diarize(audio):
104
  diarization_res = diarization_pipeline(
105
  {"waveform": waveform_tensor, "sample_rate": sr}
106
  )
107
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # Get diarization information
109
  starts, ends, speakers = diarization_info(diarization_res)
110
 
111
  # results from the transcription pipeline
112
- diarized_transcription = ""
113
-
114
- # Get transcription results for each speaker segment
115
- for start_time, end_time, speaker_id in zip(starts, ends, speakers):
116
- segment = data[int(start_time * sr) : int(end_time * sr)]
117
- diarized_transcription += f"{speaker_id} {round(start_time, 2)}:{round(end_time, 2)} \t {transcribe(sr, segment)}\n"
118
 
119
  # Plot diarization
120
  diarization_plot = plot_diarization(starts, ends, speakers)
@@ -132,8 +142,8 @@ demo = gr.Interface(
132
  gr.Plot(label="Visualization"),
133
  ],
134
  examples=["sample1.wav"],
135
- title="Automatic Speech Recognition with Diarization 🗣️ Whisper V3 Large & Pyannote Speaker Diarization V3.1",
136
- description="Transcribe your speech to text with distilled whisper and diarization with pyannote. Get started by recording from your mic or uploading an audio file (.wav) 🎙️",
137
  )
138
 
139
  if __name__ == "__main__":
 
6
  import os
7
  from dotenv import load_dotenv
8
  import plotly.graph_objects as go
9
+ from .utils.diarize_utils import match_segments
10
 
11
  load_dotenv()
12
 
 
89
  processed_data = np.array(data).astype(np.float32) / 32767.0
90
 
91
  # results from the pipeline
92
+ transcription_res = pipe({"sampling_rate": sr, "raw": processed_data},return_timestamps=True)
93
 
94
  return transcription_res
95
 
 
105
  diarization_res = diarization_pipeline(
106
  {"waveform": waveform_tensor, "sample_rate": sr}
107
  )
108
+ dia_seg, dia_label = [], []
109
+ for segment, _, label in diarization_res.itertracks(yield_label=True):
110
+ dia_seg.append([segment.start, segment.end])
111
+ dia_label.append(label)
112
+ assert (
113
+ dia_seg
114
+ ), "The result from the diarization pipeline: `diarization_segments` is empty. No segments found from the diarization process."
115
+ segmented_preds = transcription_res["chunks"]
116
+ dia_seg = np.array(dia_seg)
117
+ asr_seg = np.array([[*chunk["timestamp"]] for chunk in segmented_preds])
118
+
119
+ asr_labels = match_segments(dia_seg, dia_label, asr_seg, threshold=0.0, no_match_label="NO_SPEAKER")
120
+
121
+ for i, label in enumerate(asr_labels):
122
+ segmented_preds[i]["speaker"] = label
123
  # Get diarization information
124
  starts, ends, speakers = diarization_info(diarization_res)
125
 
126
  # results from the transcription pipeline
127
+ diarized_transcription = segmented_preds
 
 
 
 
 
128
 
129
  # Plot diarization
130
  diarization_plot = plot_diarization(starts, ends, speakers)
 
142
  gr.Plot(label="Visualization"),
143
  ],
144
  examples=["sample1.wav"],
145
+ title="Automatic Speech Recognition with Diarization 🗣️",
146
+ description="Whisper V3 Large & Pyannote Speaker Diarization V3.1 \nTranscribe your speech to text with distilled whisper and diarization with pyannote. Get started by recording from your mic or uploading an audio file (.wav) 🎙️",
147
  )
148
 
149
  if __name__ == "__main__":