sanchit-gandhi committed
Commit fc4914b
1 Parent(s): ff9897a

Update asr_diarizer.py

Files changed (1)
  1. asr_diarizer.py +35 -16
asr_diarizer.py CHANGED
@@ -16,14 +16,15 @@ class ASRDiarizationPipeline:
         diarization_pipeline,
     ):
         self.asr_pipeline = asr_pipeline
-        self.diarization_pipeline = diarization_pipeline
+        self.sampling_rate = asr_pipeline.feature_extractor.sampling_rate
 
-        self.sampling_rate = self.asr_pipeline.feature_extractor.sampling_rate
+        self.diarization_pipeline = diarization_pipeline
 
     @classmethod
     def from_pretrained(
         cls,
-        asr_model: Optional[str] = "openai/whisper-small",
+        asr_model: Optional[str] = "openai/whisper-medium",
+        *,
         diarizer_model: Optional[str] = "pyannote/speaker-diarization",
         chunk_length_s: Optional[int] = 30,
         use_auth_token: Optional[Union[str, bool]] = True,
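With the `*` inserted after `asr_model`, every later argument of `from_pretrained` becomes keyword-only. A minimal sketch of a call under the new signature, assuming the class is importable from `asr_diarizer`:

    from asr_diarizer import ASRDiarizationPipeline  # import path assumed for illustration

    pipeline = ASRDiarizationPipeline.from_pretrained(
        "openai/whisper-medium",                        # asr_model may still be passed positionally
        diarizer_model="pyannote/speaker-diarization",  # arguments after the `*` must be keywords
        chunk_length_s=30,
        use_auth_token=True,
    )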
@@ -37,7 +38,7 @@ class ASRDiarizationPipeline:
             **kwargs,
         )
         diarization_pipeline = Pipeline.from_pretrained(diarizer_model, use_auth_token=use_auth_token)
-        cls(asr_pipeline, diarization_pipeline)
+        return cls(asr_pipeline, diarization_pipeline)
 
     def __call__(
         self,
@@ -46,7 +47,13 @@ class ASRDiarizationPipeline:
         **kwargs,
     ):
         """
-        Transcribe the audio sequence(s) given as inputs to text.
+        Transcribe the audio sequence(s) given as inputs to text and label with speaker information. The input audio
+        is first passed to the speaker diarization pipeline, which returns timestamps for 'who spoke when'. The audio
+        is then passed to the ASR pipeline, which returns utterance-level transcriptions and their corresponding
+        timestamps. The speaker diarizer timestamps are aligned with the ASR transcription timestamps to give
+        speaker-labelled transcriptions. We cannot use the speaker diarization timestamps alone to partition the
+        transcriptions, as these timestamps may straddle across transcribed utterances from the ASR output. Thus, we
+        find the diarizer timestamps that are closest to the ASR timestamps and partition here.
 
         Args:
             inputs (`np.ndarray` or `bytes` or `str` or `dict`):
@@ -62,15 +69,16 @@ class ASRDiarizationPipeline:
                 np.array}` with optionally a `"stride": (left: int, right: int)` than can ask the pipeline to
                 treat the first `left` samples and last `right` samples to be ignored in decoding (but used at
                 inference to provide more context to the model). Only use `stride` with CTC models.
+            group_by_speaker (`bool`):
+                Whether to group consecutive utterances by one speaker into a single segment. If False, will return
+                transcriptions on a chunk-by-chunk basis.
 
         Return:
-            `Dict`: A dictionary with the following keys:
+            A list of transcriptions. Each list item corresponds to one chunk / segment of transcription, and is a
+            dictionary with the following keys:
                 - **text** (`str` ) -- The recognized text.
-                - **chunks** (*optional(, `List[Dict]`)
-                    When using `return_timestamps`, the `chunks` will become a list containing all the various text
-                    chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamps": (0.5,0.9), {"text":
-                    "there", "timestamps": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
-                    `"".join(chunk["text"] for chunk in output["chunks"])`.
+                - **speaker** (`str`) -- The associated speaker.
+                - **timestamps** (`tuple`) -- The start and end time for the chunk / segment.
         """
         inputs, diarizer_inputs = self.preprocess(inputs)
 
@@ -81,13 +89,17 @@ class ASRDiarizationPipeline:
 
         segments = diarization.for_json()["content"]
 
+        # diarizer output may contain consecutive segments from the same speaker (e.g. {(0 -> 1, speaker_1), (1 -> 1.5, speaker_1), ...})
+        # we combine these segments to give overall timestamps for each speaker's turn (e.g. {(0 -> 1.5, speaker_1), ...})
         new_segments = []
         prev_segment = cur_segment = segments[0]
 
         for i in range(1, len(segments)):
             cur_segment = segments[i]
 
+            # check if we have changed speaker ("label")
             if cur_segment["label"] != prev_segment["label"] and i < len(segments):
+                # add the start/end times for the super-segment to the new list
                 new_segments.append(
                     {
                         "segment": {"start": prev_segment["segment"]["start"], "end": cur_segment["segment"]["start"]},
@@ -96,6 +108,7 @@ class ASRDiarizationPipeline:
                 )
                 prev_segment = segments[i]
 
+        # add the last segment(s) if there was no speaker change
        new_segments.append(
            {
                "segment": {"start": prev_segment["segment"]["start"], "end": cur_segment["segment"]["end"]},
@@ -110,11 +123,15 @@ class ASRDiarizationPipeline:
        )
        transcript = asr_out["chunks"]
 
+        # get the end timestamps for each chunk from the ASR output
        end_timestamps = np.array([chunk["timestamp"][-1] for chunk in transcript])
        segmented_preds = []
 
+        # align the diarizer timestamps and the ASR timestamps
        for segment in new_segments:
+            # get the diarizer end timestamp
            end_time = segment["segment"]["end"]
+            # find the ASR end timestamp that is closest to the diarizer's end timestamp and cut the transcript to here
            upto_idx = np.argmin(np.abs(end_timestamps - end_time))
 
            if group_by_speaker:
@@ -122,21 +139,21 @@ class ASRDiarizationPipeline:
                    {
                        "speaker": segment["speaker"],
                        "text": "".join([chunk["text"] for chunk in transcript[: upto_idx + 1]]),
-                        "timestamp": {
-                            "start": transcript[0]["timestamp"][0],
-                            "end": transcript[upto_idx]["timestamp"][1],
-                        },
+                        "timestamp": (transcript[0]["timestamp"][0], transcript[upto_idx]["timestamp"][1]),
                    }
                )
            else:
                for i in range(upto_idx + 1):
                    segmented_preds.append({"speaker": segment["speaker"], **transcript[i]})
 
+            # crop the transcripts and timestamp lists according to the latest timestamp (for faster argmin)
            transcript = transcript[upto_idx + 1 :]
            end_timestamps = end_timestamps[upto_idx + 1 :]
 
        return segmented_preds
 
+    # Adapted from transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline.preprocess
+    # (see https://github.com/huggingface/transformers/blob/238449414f88d94ded35e80459bb6412d8ab42cf/src/transformers/pipelines/automatic_speech_recognition.py#L417)
    def preprocess(self, inputs):
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
@@ -174,6 +191,8 @@ class ASRDiarizationPipeline:
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for ASRDiarizePipeline")
 
-        diarizer_inputs = torch.from_numpy(inputs).float().unsqueeze(0)
+        # diarization model expects float32 torch tensor of shape `(channels, seq_len)`
+        diarizer_inputs = torch.from_numpy(inputs).float()
+        diarizer_inputs = diarizer_inputs.unsqueeze(0)
 
        return inputs, diarizer_inputs
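A toy illustration of the alignment step described in the new docstring and comments, with invented timestamp values: the diarizer's segment end time is matched to the closest ASR chunk end time via `np.argmin`, and everything up to that chunk is attributed to the current speaker.

    import numpy as np

    # invented example: end times of four ASR chunks and one diarizer segment
    end_timestamps = np.array([1.9, 4.2, 7.5, 10.1])
    diarizer_end = 4.0

    # index of the ASR chunk whose end time lies closest to the diarizer end time
    upto_idx = np.argmin(np.abs(end_timestamps - diarizer_end))
    print(upto_idx)  # 1 -> the first two ASR chunks belong to this speaker's turn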
 
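For context, a minimal end-to-end sketch of how the updated pipeline might be used; the import path and audio file name are placeholders, and the output format follows the new `Return` section of the docstring.

    from asr_diarizer import ASRDiarizationPipeline  # import path assumed for illustration

    pipeline = ASRDiarizationPipeline.from_pretrained(use_auth_token=True)

    # a local mono audio file; URLs, raw bytes and numpy arrays are also accepted by preprocess
    outputs = pipeline("meeting.wav", group_by_speaker=True)

    for segment in outputs:
        # each item carries the speaker label, a (start, end) timestamp tuple and the transcribed text
        print(segment["speaker"], segment["timestamp"], segment["text"])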