init

Browse files

Files changed (8) hide show

.gitattributes +1 -0
.gitignore +12 -0
README.md +138 -195
pipeline/kotoba_whisper.py +315 -0
pipeline/push_pipeline.py +23 -0
pipeline/test_pipeline.py +7 -0
pipeline/test_speaker_diarization.py +48 -0
sample_audio/sample_diarization_japanese.mp3 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample_audio/* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+.idea
+*.egg-info
+build
+dist
+*.ipynb_checkpoints
+.DS_Store
+.python-version
+*.pyc
+__pycache__
+*.nfs000*
+.eggs

README.md CHANGED Viewed

@@ -1,199 +1,142 @@
 ---
 library_name: transformers
-tags: []
 ---
-# Model Card for Model ID
-<!-- Provide a quick summary of what the model is/does. -->
-## Model Details
-### Model Description
-<!-- Provide a longer summary of what this model is. -->
-This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
-- **Developed by:** [More Information Needed]
-- **Funded by [optional]:** [More Information Needed]
-- **Shared by [optional]:** [More Information Needed]
-- **Model type:** [More Information Needed]
-- **Language(s) (NLP):** [More Information Needed]
-- **License:** [More Information Needed]
-- **Finetuned from model [optional]:** [More Information Needed]
-### Model Sources [optional]
-<!-- Provide the basic links for the model. -->
-- **Repository:** [More Information Needed]
-- **Paper [optional]:** [More Information Needed]
-- **Demo [optional]:** [More Information Needed]
-## Uses
-<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-### Direct Use
-<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-[More Information Needed]
-### Downstream Use [optional]
-<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-[More Information Needed]
-### Out-of-Scope Use
-<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-[More Information Needed]
-## Bias, Risks, and Limitations
-<!-- This section is meant to convey both technical and sociotechnical limitations. -->
-[More Information Needed]
-### Recommendations
-<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-## How to Get Started with the Model
-Use the code below to get started with the model.
-[More Information Needed]
-## Training Details
-### Training Data
-<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-[More Information Needed]
-### Training Procedure
-<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-#### Preprocessing [optional]
-[More Information Needed]
-#### Training Hyperparameters
-- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-#### Speeds, Sizes, Times [optional]
-<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-[More Information Needed]
-## Evaluation
-<!-- This section describes the evaluation protocols and provides the results. -->
-### Testing Data, Factors & Metrics
-#### Testing Data
-<!-- This should link to a Dataset Card if possible. -->
-[More Information Needed]
-#### Factors
-<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-[More Information Needed]
-#### Metrics
-<!-- These are the evaluation metrics being used, ideally with a description of why. -->
-[More Information Needed]
-### Results
-[More Information Needed]
-#### Summary
-## Model Examination [optional]
-<!-- Relevant interpretability work for the model goes here -->
-[More Information Needed]
-## Environmental Impact
-<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-- **Hardware Type:** [More Information Needed]
-- **Hours used:** [More Information Needed]
-- **Cloud Provider:** [More Information Needed]
-- **Compute Region:** [More Information Needed]
-- **Carbon Emitted:** [More Information Needed]
-## Technical Specifications [optional]
-### Model Architecture and Objective
-[More Information Needed]
-### Compute Infrastructure
-[More Information Needed]
-#### Hardware
-[More Information Needed]
-#### Software
-[More Information Needed]
-## Citation [optional]
-<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-**BibTeX:**
-[More Information Needed]
-**APA:**
-[More Information Needed]
-## Glossary [optional]
-<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-[More Information Needed]
-## More Information [optional]
-[More Information Needed]
-## Model Card Authors [optional]
-[More Information Needed]
-## Model Card Contact
-[More Information Needed]

 ---
+language: ja
 library_name: transformers
+license: apache-2.0
+tags:
+- audio
+- automatic-speech-recognition
+- hf-asr-leaderboard
+widget:
+- example_title: Sample 1
+  src: >-
+    https://huggingface.co/datasets/japanese-asr/ja_asr.common_voice_8_0/resolve/main/sample.flac
+pipeline_tag: automatic-speech-recognition
 ---
+# Kotoba-Whisper-v2.2
+_Kotoba-Whisper-v2.2_ is a Japanese ASR model based on [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0), with
+additional postprocessing stacks integrated as [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines). The new features include
+(i) improved timestamp achieved by [stable-ts](https://github.com/jianfch/stable-ts) and (ii) adding punctuation with [punctuators](https://github.com/1-800-BAD-CODE/punctuators/tree/main).
+These libraries are merged into Kotoba-Whisper-v2.1 via pipeline and will be applied seamlessly to the predicted transcription from [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0).
+The pipeline has been developed through the collaboration between [Asahi Ushio](https://asahiushio.com) and [Kotoba Technologies](https://twitter.com/kotoba_tech).
+The following table presents the raw CER (unlike the usual CER, punctuation is not removed before computing the metric; see the evaluation script [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.1/blob/main/run_short_form_eval.py)).
+| model                                                                                                                                             |   [CommonVoice 8 (Japanese test set)](https://huggingface.co/datasets/japanese-asr/ja_asr.common_voice_8_0) |   [JSUT Basic 5000](https://huggingface.co/datasets/japanese-asr/ja_asr.jsut_basic5000) |   [ReazonSpeech (held out test set)](https://huggingface.co/datasets/japanese-asr/ja_asr.reazonspeech_test) |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------:|----------------------------------------------------------------------------------------:|------------------------------------------------------------------------------------------------------------:|
+| [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0)                                                         |                                                                                                        17.6 |                                                                                    15.4 |                                                                                                        17.4 |
+| [kotoba-tech/kotoba-whisper-v2.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.1)                                                         |                                                                                                        17.7 |                                                                                    15.4 |                                                                                                        17   |
+| [kotoba-tech/kotoba-whisper-v2.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.1) (punctuator + stable-ts)                                |                                                                                                        17.7 |                                                                                    15.4 |                                                                                                        17   |
+| [kotoba-tech/kotoba-whisper-v2.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.1) (punctuator)                                            |                                                                                                        17.7 |                                                                                    15.4 |                                                                                                        17   |
+| [kotoba-tech/kotoba-whisper-v2.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.1) (stable-ts)                                             |                                                                                                        17.7 |                                                                                    15.4 |                                                                                                        17   |
+| [kotoba-tech/kotoba-whisper-v1.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.0)                                                         |                                                                                                        17.8 |                                                                                    15.2 |                                                                                                        17.8 |
+| [kotoba-tech/kotoba-whisper-v1.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1)                                                         |                                                                                                        17.9 |                                                                                    15   |                                                                                                        17.8 |
+| [kotoba-tech/kotoba-whisper-v1.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1) (punctuator + stable-ts)                                |                                                                                                        17.9 |                                                                                    15   |                                                                                                        17.8 |
+| [kotoba-tech/kotoba-whisper-v1.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1) (punctuator)                                            |                                                                                                        17.9 |                                                                                    15   |                                                                                                        17.8 |
+| [kotoba-tech/kotoba-whisper-v1.1](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1) (stable-ts)                                             |                                                                                                        17.9 |                                                                                    15   |                                                                                                        17.8 |
+| [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3)                                                                         |                                                                                                        15.3 |                                                                                    13.4 |                                                                                                        20.5 |
+| [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2)                                                                         |                                                                                                        15.9 |                                                                                    10.6 |                                                                                                        34.6 |
+| [openai/whisper-large](https://huggingface.co/openai/whisper-large)                                                                               |                                                                                                        16.6 |                                                                                    11.3 |                                                                                                        40.7 |
+| [openai/whisper-medium](https://huggingface.co/openai/whisper-medium)                                                                             |                                                                                                        17.9 |                                                                                    13.1 |                                                                                                        39.3 |
+| [openai/whisper-base](https://huggingface.co/openai/whisper-base)                                                                                 |                                                                                                        34.5 |                                                                                    26.4 |                                                                                                        76   |
+| [openai/whisper-small](https://huggingface.co/openai/whisper-small)                                                                               |                                                                                                        21.5 |                                                                                    18.9 |                                                                                                        48.1 |
+| [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny)                                                                                 |                                                                                                        58.8 |                                                                                    38.3 |                                                                                                       153.3 |
+Regarding the normalized CER, since the updates introduced in v2.1 are removed by the normalization, kotoba-tech/kotoba-whisper-v2.1 has the same CER values as [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0).
+### Latency
+Please refer to the section of the latency in the kotoba-whisper-v1.1 [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1#latency).
+## Transformers Usage
+Kotoba-Whisper-v2.1 is supported in the Hugging Face 🤗 Transformers library from version 4.39 onwards. To run the model, first
+install the latest version of Transformers.
+```bash
+pip install --upgrade pip
+pip install --upgrade transformers accelerate torchaudio
+pip install stable-ts==2.16.0
+pip install punctuators==0.0.5
+```
+### Transcription
+The model can be used with the [`pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline)
+class to transcribe audio files as follows:
+```python
+import torch
+from transformers import pipeline
+from datasets import load_dataset
+# config
+model_id = "kotoba-tech/kotoba-whisper-v2.1"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
+generate_kwargs = {"language": "ja", "task": "transcribe"}
+# load model
+pipe = pipeline(
+    model=model_id,
+    torch_dtype=torch_dtype,
+    device=device,
+    model_kwargs=model_kwargs,
+    chunk_length_s=15,
+    batch_size=16,
+    trust_remote_code=True,
+    stable_ts=True,
+    punctuator=True
+)
+# load sample audio
+dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
+sample = dataset[0]["audio"]
+# run inference
+result = pipe(sample, return_timestamps=True, generate_kwargs=generate_kwargs)
+print(result)
+```
+- To transcribe a local audio file, simply pass the path to your audio file when you call the pipeline:
+```diff
+- result = pipe(sample, return_timestamps=True, generate_kwargs=generate_kwargs)
++ result = pipe("audio.mp3", return_timestamps=True, generate_kwargs=generate_kwargs)
+```
+- To deactivate stable-ts:
+```diff
+-     stable_ts=True,
++     stable_ts=False,
+```
+- To deactivate punctuator:
+```diff
+-     punctuator=True,
++     punctuator=False,
+```
+### Flash Attention 2
+We recommend using [Flash-Attention 2](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#flashattention-2)
+if your GPU allows for it. To do so, you first need to install [Flash Attention](https://github.com/Dao-AILab/flash-attention):
+```
+pip install flash-attn --no-build-isolation
+```
+Then set `attn_implementation` to `"flash_attention_2"` in the `model_kwargs` passed to the pipeline:
+```diff
+- model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
++ model_kwargs = {"attn_implementation": "flash_attention_2"} if torch.cuda.is_available() else {}
+```
+## Acknowledgements
+* [OpenAI](https://openai.com/) for the Whisper [model](https://huggingface.co/openai/whisper-large-v3).
+* Hugging Face 🤗 [Transformers](https://github.com/huggingface/transformers) for the model integration.
+* Hugging Face 🤗 for the [Distil-Whisper codebase](https://github.com/huggingface/distil-whisper).
+* [Reazon Human Interaction Lab](https://research.reazon.jp/) for the [ReazonSpeech dataset](https://huggingface.co/datasets/reazon-research/reazonspeech).

pipeline/kotoba_whisper.py ADDED Viewed

	@@ -0,0 +1,315 @@

+from typing import Union, Optional, Dict, List, Any
+import requests
+import torch
+import numpy as np
+from transformers.pipelines.audio_utils import ffmpeg_read
+from transformers.pipelines.automatic_speech_recognition import AutomaticSpeechRecognitionPipeline, chunk_iter
+from transformers.utils import is_torchaudio_available
+from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
+from pyannote.audio import Pipeline
+from pyannote.core.annotation import Annotation
+from punctuators.models import PunctCapSegModelONNX
+class Punctuator:
+    """Add punctuation to transcription chunks using a `punctuators` model."""
+    # If the raw chunk text already contains one of these marks, the text is kept as-is.
+    ja_punctuations = ["!", "?", "、", "。"]
+    def __init__(self, model: str = "pcs_47lang"):
+        """Load the punctuation model identified by ``model``."""
+        self.punctuation_model = PunctCapSegModelONNX.from_pretrained(model)
+    def punctuate(self, pipeline_chunk: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Return a copy of each chunk with its ``text`` punctuated.
+
+        Args:
+            pipeline_chunk: chunk dicts, each holding at least a ``text`` and a
+                ``timestamp`` entry.
+
+        Returns:
+            A new list with one dict per input chunk. Every key of the input
+            chunk (e.g. ``speaker``) is carried over; only ``text`` is replaced.
+        """
+        def validate_punctuation(raw: str, punctuated: str):
+            # Fall back to the raw text when the model emitted an unknown token or
+            # when the raw text was already punctuated.
+            if 'unk' in punctuated.lower() or any(p in raw for p in self.ja_punctuations):
+                return raw
+            if punctuated.count("。") > 1:
+                # Keep only the last "。". The earlier ones are removed from the
+                # prefix alone, so the surviving period stays at its position
+                # relative to the surrounding characters.
+                ind = punctuated.rfind("。")
+                punctuated = punctuated[:ind].replace("。", "") + punctuated[ind:]
+            return punctuated
+        if not pipeline_chunk:
+            # Nothing to punctuate; skip the model call entirely.
+            return []
+        text_edit = self.punctuation_model.infer([c['text'] for c in pipeline_chunk])
+        # `{**c, ...}` keeps every extra field (such as `speaker`) on the chunk.
+        return [
+            {**c, 'text': validate_punctuation(c['text'], "".join(e))}
+            for c, e in zip(pipeline_chunk, text_edit)
+        ]
+class SpeakerDiarization:
+    """Thin wrapper around a pyannote ``Pipeline`` placed on a fixed device."""
+    def __init__(self, model_id: str, device: torch.device):
+        """Load the pipeline ``model_id`` and move it to ``device``."""
+        self.device = device
+        self.pipeline = Pipeline.from_pretrained(model_id)
+        self.pipeline = self.pipeline.to(self.device)
+    def __call__(self,
+                 audio: Union[str, torch.Tensor, np.ndarray],
+                 sampling_rate: Optional[int] = None) -> Annotation:
+        """Run the diarization pipeline on ``audio``.
+
+        Args:
+            audio: a value handed to the pipeline unchanged (e.g. a file path), or
+                an in-memory waveform as a tensor/array of shape ``(time,)`` or
+                ``(channel, time)``.
+            sampling_rate: sampling rate of ``audio``; required for in-memory
+                waveforms.
+
+        Returns:
+            The object returned by the underlying pipeline call.
+
+        Raises:
+            ValueError: if a waveform is given without ``sampling_rate`` or its
+                shape is neither ``(time,)`` nor ``(channel, time)``.
+        """
+        if isinstance(audio, (torch.Tensor, np.ndarray)):
+            if sampling_rate is None:
+                raise ValueError("sampling_rate must be provided")
+            # A single conversion covers both numpy arrays and tensors.
+            audio = torch.as_tensor(audio, dtype=torch.float32)
+            if audio.ndim == 1:
+                # Mono waveform: add the channel axis.
+                audio = audio.unsqueeze(0)
+            elif audio.ndim != 2:
+                raise ValueError("audio shape must be (channel, time)")
+            audio = {"waveform": audio.to(self.device), "sample_rate": sampling_rate}
+        return self.pipeline(audio)
+class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
+    def __init__(self,
+                 model: "PreTrainedModel",
+                 model_diarizarization: str="pyannote/speaker-diarization-3.1",
+                 feature_extractor: Union["SequenceFeatureExtractor", str] = None,
+                 tokenizer: Optional[PreTrainedTokenizer] = None,
+                 device: Union[int, "torch.device"] = None,
+                 device_diarizarization: Union[int, "torch.device"] = None,
+                 torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+                 return_unique_speaker: bool = False,
+                 punctuator: bool = False,
+                 **kwargs):
+        """Set up the ASR pipeline plus its diarization and punctuation helpers.
+
+        Args:
+            model: speech recognition model forwarded to the parent pipeline.
+            model_diarizarization: identifier of the diarization pipeline to load.
+            feature_extractor: forwarded to the parent pipeline.
+            tokenizer: forwarded to the parent pipeline.
+            device: device for the ASR model; defaults to ``"cpu"``.
+            device_diarizarization: device for the diarization model; defaults to
+                ``device``. A string or an int index is converted to a
+                ``torch.device`` (a negative int selects the CPU).
+            torch_dtype: forwarded to the parent pipeline.
+            return_unique_speaker: stored on the instance for use in ``postprocess``.
+            punctuator: when true, a ``Punctuator`` is created; otherwise
+                ``self.punctuator`` is ``None``.
+            **kwargs: forwarded to the parent pipeline.
+        """
+        self.type = "seq2seq_whisper"
+        if device is None:
+            device = "cpu"
+        if device_diarizarization is None:
+            device_diarizarization = device
+        # The diarization wrapper is annotated to take a `torch.device`, so
+        # normalise the str / int forms accepted by this signature before
+        # handing the value over.
+        if isinstance(device_diarizarization, str):
+            device_diarizarization = torch.device(device_diarizarization)
+        elif isinstance(device_diarizarization, int):
+            device_diarizarization = torch.device(
+                "cpu" if device_diarizarization < 0 else f"cuda:{device_diarizarization}"
+            )
+        self.model_speaker_diarization = SpeakerDiarization(model_diarizarization, device_diarizarization)
+        self.return_unique_speaker = return_unique_speaker
+        self.punctuator = Punctuator() if punctuator else None
+        super().__init__(
+            model=model,
+            feature_extractor=feature_extractor,
+            tokenizer=tokenizer,
+            device=device,
+            torch_dtype=torch_dtype,
+            **kwargs
+        )
+    def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
+        """Turn one audio input into model-ready feature dicts (generator).
+
+        ``inputs`` may be a URL or local file path (``str``), encoded audio
+        ``bytes``, a dict holding ``"sampling_rate"`` plus ``"raw"`` or ``"array"``
+        (and optionally ``"stride"``), or a 1-D numpy array.
+
+        Every yielded dict additionally carries the complete waveform under
+        ``"audio_array"``. With a truthy ``chunk_length_s`` one dict is yielded
+        per item produced by ``chunk_iter``; otherwise a single dict with
+        ``"is_last": True`` is yielded.
+
+        Raises:
+            ValueError: on a malformed dict, an oversized stride, a non-ndarray or
+                multi-channel waveform, or a chunk shorter than its strides.
+            ImportError: when resampling is required but torchaudio is missing.
+        """
+        if isinstance(inputs, str):
+            if inputs.startswith("http://") or inputs.startswith("https://"):
+                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+                # like http_huggingface_co.png
+                # NOTE(review): the request has no timeout and the HTTP status is not checked.
+                inputs = requests.get(inputs).content
+            else:
+                with open(inputs, "rb") as f:
+                    inputs = f.read()
+        if isinstance(inputs, bytes):
+            # Decode the encoded audio at the feature extractor's sampling rate.
+            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+        stride = None
+        extra = {}
+        if isinstance(inputs, dict):
+            stride = inputs.pop("stride", None)
+            # Accepting `"array"` which is the key defined in `datasets` for
+            # better integration
+            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+                raise ValueError(
+                    "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
+                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+                    "containing the sampling_rate associated with that array"
+                )
+            _inputs = inputs.pop("raw", None)
+            if _inputs is None:
+                # Remove path which will not be used from `datasets`.
+                inputs.pop("path", None)
+                _inputs = inputs.pop("array", None)
+            in_sampling_rate = inputs.pop("sampling_rate")
+            # Whatever keys remain in the dict are passed through with the output.
+            extra = inputs
+            inputs = _inputs
+            if in_sampling_rate != self.feature_extractor.sampling_rate:
+                if is_torchaudio_available():
+                    from torchaudio import functional as F
+                else:
+                    raise ImportError(
+                        "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
+                        "The torchaudio package can be installed through: `pip install torchaudio`."
+                    )
+                inputs = F.resample(
+                    torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
+                ).numpy()
+                # Used below to rescale the caller-provided stride to the new rate.
+                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
+            else:
+                ratio = 1
+            if stride is not None:
+                if stride[0] + stride[1] > inputs.shape[0]:
+                    raise ValueError("Stride is too large for input")
+                # Stride needs to get the chunk length here, it's going to get
+                # swallowed by the `feature_extractor` later, and then batching
+                # can add extra data in the inputs, so we need to keep track
+                # of the original length in the stride so we can cut properly.
+                stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
+        if not isinstance(inputs, np.ndarray):
+            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
+        if chunk_length_s:
+            if stride_length_s is None:
+                # Default stride: one sixth of the chunk length on each side.
+                stride_length_s = chunk_length_s / 6
+            if isinstance(stride_length_s, (int, float)):
+                stride_length_s = [stride_length_s, stride_length_s]
+            # XXX: Carefuly, this variable will not exist in `seq2seq` setting.
+            # Currently chunking is not possible at this level for `seq2seq` so
+            # it's ok.
+            align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
+            # Convert seconds to sample counts, rounded to a multiple of `align_to`.
+            chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
+            stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
+            stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)
+            if chunk_len < stride_left + stride_right:
+                raise ValueError("Chunk length must be superior to stride length")
+            for item in chunk_iter(
+                    inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
+            ):
+                # Attach the full (unchunked) waveform to every chunk item.
+                item["audio_array"] = inputs
+                yield item
+        else:
+            if inputs.shape[0] > self.feature_extractor.n_samples:
+                # Longer than `n_samples`: extract features without truncation.
+                processed = self.feature_extractor(
+                    inputs,
+                    sampling_rate=self.feature_extractor.sampling_rate,
+                    truncation=False,
+                    padding="longest",
+                    return_tensors="pt",
+                )
+            else:
+                processed = self.feature_extractor(
+                    inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+                )
+            if self.torch_dtype is not None:
+                processed = processed.to(dtype=self.torch_dtype)
+            if stride is not None:
+                processed["stride"] = stride
+            yield {"is_last": True, "audio_array": inputs, **processed, **extra}
+    def _forward(self, model_inputs, **generate_kwargs):
+        """Generate tokens for one preprocessed item.
+
+        Pops the bookkeeping entries (``attention_mask``, ``stride``, ``is_last``,
+        ``audio_array``) and the feature tensor out of ``model_inputs``, calls
+        ``self.model.generate`` with ``return_timestamps`` forced to ``True`` and
+        returns a dict holding ``is_last``, ``audio_array``, ``tokens``, the
+        stride when one was given, and any keys left over in ``model_inputs``.
+
+        Raises:
+            ValueError: if neither ``input_features`` nor ``input_values`` is present.
+        """
+        mask = model_inputs.pop("attention_mask", None)
+        chunk_stride = model_inputs.pop("stride", None)
+        last_flag = model_inputs.pop("is_last")
+        waveform = model_inputs.pop("audio_array")
+        encoder = self.model.get_encoder()
+        # Take the feature tensor out of the dict so that only pass-through
+        # entries remain in `model_inputs`.
+        for feature_key in ("input_features", "input_values"):
+            if feature_key in model_inputs:
+                features = model_inputs.pop(feature_key)
+                break
+        else:
+            raise ValueError(
+                "Seq2Seq speech recognition model requires either a "
+                f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
+            )
+        # Timestamps are always requested, whatever the caller passed.
+        generate_kwargs["return_timestamps"] = True
+        exceeds_max_frames = features.shape[-1] > self.feature_extractor.nb_max_frames
+        if exceeds_max_frames:
+            # Hand the raw features to `generate` when they exceed `nb_max_frames`.
+            generate_kwargs["input_features"] = features
+        else:
+            generate_kwargs["encoder_outputs"] = encoder(features, attention_mask=mask)
+        result = {
+            "is_last": last_flag,
+            "audio_array": waveform,
+            "tokens": self.model.generate(attention_mask=mask, **generate_kwargs),
+        }
+        if self.type == "seq2seq_whisper" and chunk_stride is not None:
+            result["stride"] = chunk_stride
+        # Whatever is still in `model_inputs` flows through untouched.
+        result.update(model_inputs)
+        return result
+    def postprocess(self,
+                    model_outputs,
+                    decoder_kwargs: Optional[Dict] = None,
+                    return_language=None,
+                    *args,
+                    **kwargs):
+        assert len(model_outputs) > 0
+        audio_array = list(model_outputs)[0]["audio_array"]
+        sd = self.model_speaker_diarization(audio_array, sampling_rate=self.feature_extractor.sampling_rate)
+        timelines = sd.get_timeline()
+        outputs = super().postprocess(
+            model_outputs=model_outputs,
+            decoder_kwargs=decoder_kwargs,
+            return_timestamps=True,
+            return_language=return_language
+        )
+        pointer_ts = 0
+        pointer_chunk = 0
+        new_chunks = []
+        while True:
+            if pointer_ts == len(timelines):
+                ts = timelines[-1]
+                for chunk in outputs["chunks"][pointer_chunk:]:
+                    chunk["speaker"] = sd.get_labels(ts)
+                    new_chunks.append(chunk)
+                break
+            if pointer_chunk == len(outputs["chunks"]):
+                break
+            ts = timelines[pointer_ts]
+            chunk = outputs["chunks"][pointer_chunk]
+            if "speaker" not in chunk:
+                chunk["speaker"] = []
+            start, end = chunk["timestamp"]
+            if ts.end <= start:
+                pointer_ts += 1
+            elif end <= ts.start:
+                if len(chunk["speaker"]) == 0:
+                    chunk["speaker"] += list(sd.get_labels(ts))
+                new_chunks.append(chunk)
+                pointer_chunk += 1
+            else:
+                chunk["speaker"] += list(sd.get_labels(ts))
+                if ts.end >= end:
+                    new_chunks.append(chunk)
+                    pointer_chunk += 1
+                else:
+                    pointer_ts += 1
+        for i in new_chunks:
+            if "speaker" in i:
+                if self.return_unique_speaker:
+                    i["speaker"] = [i["speaker"][0]]
+                else:
+                    i["speaker"] = list(set(i["speaker"]))
+            else:
+                i["speaker"] = []
+        outputs["chunks"] = new_chunks
+        if self.punctuator:
+            outputs["chunks"] = self.punctuator.punctuate(outputs["chunks"])
+        outputs["text"] = "".join([c["text"] for c in outputs["chunks"]])
+        outputs["speakers"] = sd.labels()
+        outputs.pop("audio_array")
+        for s in outputs["speakers"]:
+            outputs[f"text/{s}"] = "".join([c["text"] for c in outputs["chunks"] if s in c["speaker"]])
+            outputs[f"chunks/{s}"] = [c for c in outputs["chunks"] if s in c["speaker"]]
+        return outputs

pipeline/push_pipeline.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from pprint import pprint
+from kotoba_whisper import KotobaWhisperPipeline
+from transformers.pipelines import PIPELINE_REGISTRY, pipeline
+from transformers import WhisperForConditionalGeneration, TFWhisperForConditionalGeneration
+model_alias = "kotoba-tech/kotoba-whisper-v2.2"
+PIPELINE_REGISTRY.register_pipeline(
+    "kotoba-whisper",
+    pipeline_class=KotobaWhisperPipeline,
+    pt_model=WhisperForConditionalGeneration,
+    tf_model=TFWhisperForConditionalGeneration
+)
+test_audio = "/Users/asahiu/Desktop/speaker_diariazation_sample_1.wav"
+pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16, return_unique_speaker=True)
+output = pipe(test_audio)
+pprint(output)
+pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16)
+output = pipe(test_audio)
+pprint(output)
+pipe.push_to_hub(model_alias)

pipeline/test_pipeline.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from pprint import pprint
+from transformers.pipelines import pipeline
+test_audio = "/Users/asahiu/Desktop/speaker_diariazation_sample_1.wav"
+pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=15, batch_size=16, trust_remote_code=True)
+output = pipe(test_audio)
+pprint(output)

pipeline/test_speaker_diarization.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Setup:
+#     pip install "pyannote.audio>=3.1"
+# Requirement: Submit an access request for the following models.
+#     https://huggingface.co/pyannote/speaker-diarization-3.1
+#     https://huggingface.co/pyannote/segmentation-3.0
+import soundfile as sf
+import numpy as np
+from typing import Union, Optional, Dict, List
+import torch
+from pyannote.audio import Pipeline
+class SpeakerDiarization:
+    def __init__(self, model_id: str):
+        self.pipeline = Pipeline.from_pretrained(model_id)
+    def __call__(self,
+                 audio: Union[str, torch.Tensor, np.ndarray],
+                 sampling_rate: Optional[int] = None) -> Dict[str, List[List[float]]]:
+        if type(audio) is torch.Tensor or type(audio) is np.ndarray:
+            if sampling_rate is None:
+                raise ValueError("sampling_rate must be provided")
+            if type(audio) is np.ndarray:
+                audio = torch.as_tensor(audio)
+            audio = torch.as_tensor(audio, dtype=torch.float32)
+            if len(audio.shape) == 1:
+                audio = audio.unsqueeze(0)
+            elif len(audio.shape) > 3:
+                raise ValueError("audio shape must be (channel, time)")
+            audio = {"waveform": audio, "sample_rate": sampling_rate}
+        output = self.pipeline(audio)
+        # dictionary: {speaker_id: [[start, end],...]}
+        return {s: [[i.start, i.end] for i in output.label_timeline(s)] for s in output.labels()}
+pipeline = SpeakerDiarization("pyannote/speaker-diarization-3.1")
+root_dir = "/Users/asahiu/Desktop"
+sample_audio_files = ["speaker_diariazation_sample_1.wav", "speaker_diariazation_sample_2.wav"]
+print(sample_audio_file)
+a, sr = sf.read(f"{root_dir}/{sample_audio_file}")
+output = pipeline(a, sampling_rate=sr)
+print(output)
+output = pipeline(f"{root_dir}/{sample_audio_file}")
+print(output)
+print()

sample_audio/sample_diarization_japanese.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7252359b53264c767da33a48e39ff57a8f31641c4a80a1702c6940f8914697b
+size 780064