so-vits-svc-models-pcr

Runtime error

App Files Files Community

FrankZxShen commited on May 30, 2023

Commit

8913fdd

•

1 Parent(s): ce252ec

init

Browse files

Files changed (28) hide show

.gitattributes +1 -0
pretrain/checkpoint_best_legacy_500.pt +3 -0
pretrain/meta.py +31 -0
pretrain/nsf_hifigan/NOTICE.txt +74 -0
pretrain/nsf_hifigan/NOTICE.zh-CN.txt +72 -0
pretrain/nsf_hifigan/config.json +38 -0
pretrain/nsf_hifigan/model +3 -0
pretrain/put_hubert_ckpt_here +0 -0
vencoder/ContentVec256L12_Onnx.py +28 -0
vencoder/ContentVec256L9.py +35 -0
vencoder/ContentVec256L9_Onnx.py +28 -0
vencoder/ContentVec768L12.py +34 -0
vencoder/ContentVec768L12_Onnx.py +28 -0
vencoder/ContentVec768L9_Onnx.py +28 -0
vencoder/HubertSoft.py +23 -0
vencoder/HubertSoft_Onnx.py +28 -0
vencoder/WhisperPPG.py +30 -0
vencoder/__init__.py +0 -0
vencoder/encoder.py +12 -0
vencoder/hubert/__init__.py +0 -0
vencoder/hubert/hubert_model.py +222 -0
vencoder/hubert/hubert_model_onnx.py +217 -0
vencoder/whisper/__init__.py +0 -0
vencoder/whisper/audio.py +125 -0
vencoder/whisper/decoding.py +712 -0
vencoder/whisper/model.py +269 -0
vencoder/whisper/tokenizer.py +331 -0
vencoder/whisper/utils.py +163 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pretrain/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text

pretrain/checkpoint_best_legacy_500.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60d936ec5a566776fc392e69ad8b630d14eb588111233fe313436e200a7b187b
+size 1330114945

pretrain/meta.py ADDED Viewed

	@@ -0,0 +1,31 @@

+def download_dict():
+    return {
+        "vec768l12": {
+            "url": "https://ibm.ent.box.com/shared/static/z1wgl1stco8ffooyatzdwsqn2psd9lrr",
+            "output": "./pretrain/checkpoint_best_legacy_500.pt"
+        },
+        "vec256l9": {
+            "url": "https://ibm.ent.box.com/shared/static/z1wgl1stco8ffooyatzdwsqn2psd9lrr",
+            "output": "./pretrain/checkpoint_best_legacy_500.pt"
+        },
+        "hubertsoft": {
+            "url": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
+            "output": "./pretrain/hubert-soft-0d54a1f4.pt"
+        },
+        "whisper-ppg": {
+            "url": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
+            "output": "./pretrain/medium.pt"
+        }
+    }
+def get_speech_encoder(config_path="configs/config.json"):
+    import json
+    with open(config_path, "r") as f:
+        data = f.read()
+        config = json.loads(data)
+        speech_encoder = config["model"]["speech_encoder"]
+        dict = download_dict()
+        return dict[speech_encoder]["url"], dict[speech_encoder]["output"]

pretrain/nsf_hifigan/NOTICE.txt ADDED Viewed

	@@ -0,0 +1,74 @@

+--- DiffSinger Community Vocoder ---
+ARCHITECTURE: NSF-HiFiGAN
+RELEASE DATE: 2022-12-11
+HYPER PARAMETERS:
+ - 44100 sample rate
+ - 128 mel bins
+ - 512 hop size
+ - 2048 window size
+ - fmin at 40Hz
+ - fmax at 16000Hz
+NOTICE:
+All model weights in the [DiffSinger Community Vocoder Project](https://openvpi.github.io/vocoders/), including
+model weights in this directory, are provided by the [OpenVPI Team](https://github.com/openvpi/), under the
+[Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
+ACKNOWLEDGEMENTS:
+Training data of this vocoder is provided and permitted by the following organizations, societies and individuals:
+孙飒              https://www.qfssr.cn
+赤松_Akamatsu     https://www.zhibin.club
+乐威              https://www.zhibin.club
+伯添              https://space.bilibili.com/24087011
+雲宇光             https://space.bilibili.com/660675050
+橙子言             https://space.bilibili.com/318486464
+人衣大人           https://space.bilibili.com/2270344
+玖蝶              https://space.bilibili.com/676771003
+Yuuko
+白夜零BYL          https://space.bilibili.com/1605040503
+嗷天              https://space.bilibili.com/5675252
+洛泠羽            https://space.bilibili.com/347373318
+灰条纹的灰猫君      https://space.bilibili.com/2083633
+幽寂              https://space.bilibili.com/478860
+恶魔王女           https://space.bilibili.com/2475098
+AlexYHX 芮晴
+绮萱              https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
+诗芸              https://y.qq.com/n/ryqq/singer/0005NInj142zm0
+汐蕾              https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
+1262917464
+炜阳
+叶卡yolka
+幸の夏            https://space.bilibili.com/1017297686
+暮色未量           https://space.bilibili.com/272904686
+晓寞sama          https://space.bilibili.com/3463394
+没头绪的节操君
+串串BunC          https://space.bilibili.com/95817834
+落雨              https://space.bilibili.com/1292427
+长尾巴的翎艾        https://space.bilibili.com/1638666
+声闻计划           https://space.bilibili.com/392812269
+唐家大小姐         http://5sing.kugou.com/palmusic/default.html
+不伊子
+Training machines are provided by:
+花儿不哭           https://space.bilibili.com/5760446
+TERMS OF REDISTRIBUTIONS:
+1. Do not sell this vocoder, or charge any fees from redistributing it, as prohibited by
+   the license.
+2. Include a copy of the CC BY-NC-SA 4.0 license, or a link referring to it.
+3. Include a copy of this notice, or any other notices informing that this vocoder is
+   provided by the OpenVPI Team, that this vocoder is licensed under CC BY-NC-SA 4.0, and
+   with a complete acknowledgement list as shown above.
+4. If you fine-tuned or modified the weights, leave a notice about what has been changed.
+5. (Optional) Leave a link to the official release page of the vocoder, and tell users
+   that other versions and future updates of this vocoder can be obtained from the website.

pretrain/nsf_hifigan/NOTICE.zh-CN.txt ADDED Viewed

	@@ -0,0 +1,72 @@

+--- DiffSinger 社区声码器 ---
+架构：NSF-HiFiGAN
+发布日期：2022-12-11
+超参数：
+ - 44100 sample rate
+ - 128 mel bins
+ - 512 hop size
+ - 2048 window size
+ - fmin at 40Hz
+ - fmax at 16000Hz
+注意事项：
+[DiffSinger 社区声码器企划](https://openvpi.github.io/vocoders/) 中的所有模型权重，
+包括此目录下的模型权重，均由 [OpenVPI Team](https://github.com/openvpi/) 提供，并基于
+[Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/)
+进行许可。
+致谢：
+此声码器的训练数据由以下组织、社团和个人提供并许可：
+孙飒              https://www.qfssr.cn
+赤松_Akamatsu     https://www.zhibin.club
+乐威              https://www.zhibin.club
+伯添              https://space.bilibili.com/24087011
+雲宇光             https://space.bilibili.com/660675050
+橙子言             https://space.bilibili.com/318486464
+人衣大人           https://space.bilibili.com/2270344
+玖蝶              https://space.bilibili.com/676771003
+Yuuko
+白夜零BYL          https://space.bilibili.com/1605040503
+嗷天              https://space.bilibili.com/5675252
+洛泠羽            https://space.bilibili.com/347373318
+灰条纹的灰猫君      https://space.bilibili.com/2083633
+幽寂              https://space.bilibili.com/478860
+恶魔王女           https://space.bilibili.com/2475098
+AlexYHX 芮晴
+绮萱              https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
+诗芸              https://y.qq.com/n/ryqq/singer/0005NInj142zm0
+汐蕾              https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
+1262917464
+炜阳
+叶卡yolka
+幸の夏            https://space.bilibili.com/1017297686
+暮色未量           https://space.bilibili.com/272904686
+晓寞sama          https://space.bilibili.com/3463394
+没头绪的节操君
+串串BunC          https://space.bilibili.com/95817834
+落雨              https://space.bilibili.com/1292427
+长尾巴的翎艾        https://space.bilibili.com/1638666
+声闻计划           https://space.bilibili.com/392812269
+唐家大小姐         http://5sing.kugou.com/palmusic/default.html
+不伊子
+训练算力的提供者如下：
+花儿不哭           https://space.bilibili.com/5760446
+二次分发条款：
+1. 请勿售卖此声码器或从其二次分发过程中收取任何费用，因为此类行为受到许可证的禁止。
+2. 请在二次分发文件中包含一份 CC BY-NC-SA 4.0 许可证的副本或指向该许可证的链接。
+3. 请在二次分发文件中包含这份声明，或以其他形式声明此声码器由 OpenVPI Team 提供并基于 CC BY-NC-SA 4.0 许可，
+   并附带上述完整的致谢名单。
+4. 如果您微调或修改了权重，请留下一份关于其受到了何种修改的说明。
+5.（可选）留下一份指向此声码器的官方发布页面的链接，并告知使用者可从该网站获取此声码器的其他版本和未来的更新。

pretrain/nsf_hifigan/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+    "resblock": "1",
+    "num_gpus": 4,
+    "batch_size": 10,
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+    "upsample_rates":        [ 8, 8, 2, 2, 2],
+    "upsample_kernel_sizes": [16,16, 4, 4, 4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
+    "segment_size": 16384,
+    "num_mels": 128,
+    "num_freq": 1025,
+    "n_fft"   : 2048,
+    "hop_size": 512,
+    "win_size": 2048,
+    "sampling_rate": 44100,
+    "fmin": 40,
+    "fmax": 16000,
+    "fmax_for_loss": null,
+    "num_workers": 16,
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}

pretrain/nsf_hifigan/model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c576b63b7ed952161b70fad34e0562ace502ce689195520d8a2a6c051de29d6
+size 56825430

pretrain/put_hubert_ckpt_here ADDED Viewed

File without changes

vencoder/ContentVec256L12_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec256L12_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-256-layer-12.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec256L9.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+class ContentVec256L9(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+          [vec_path],
+          suffix="",
+        )
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.model = models[0].to(self.dev)
+        self.model.eval()
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+          "source": feats.to(wav.device),
+          "padding_mask": padding_mask.to(wav.device),
+          "output_layer": 9,  # layer 9
+        }
+        with torch.no_grad():
+          logits = self.model.extract_features(**inputs)
+          feats = self.model.final_proj(logits[0])
+        return feats.transpose(1, 2)

vencoder/ContentVec256L9_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec256L9_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-256-layer-9.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L12.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+class ContentVec768L12(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+          [vec_path],
+          suffix="",
+        )
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.model = models[0].to(self.dev)
+        self.model.eval()
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+          "source": feats.to(wav.device),
+          "padding_mask": padding_mask.to(wav.device),
+          "output_layer": 12,  # layer 12
+        }
+        with torch.no_grad():
+          logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)

vencoder/ContentVec768L12_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec768L12_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-768-layer-12.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L9_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec768L9_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/HubertSoft.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.hubert import hubert_model
+class HubertSoft(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/hubert-soft-0d54a1f4.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        hubert_soft = hubert_model.hubert_soft(vec_path)
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.hidden_dim = 256
+        self.model = hubert_soft.to(self.dev)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats[None,None,:]
+        with torch.inference_mode():
+          units = self.model.units(feats)
+          return units.transpose(1,2)

vencoder/HubertSoft_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class HubertSoft_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/hubert-soft.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/WhisperPPG.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.whisper.model import Whisper, ModelDimensions
+from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
+class WhisperPPG(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/medium.pt",device=None):
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        checkpoint = torch.load(vec_path, map_location=device)
+        dims = ModelDimensions(**checkpoint["dims"])
+        model = Whisper(dims)
+        model.load_state_dict(checkpoint["model_state_dict"])
+        self.hidden_dim = dims
+        self.model = model.to(self.dev)
+    def encoder(self, wav):
+        audio = wav
+        audln = audio.shape[0]
+        ppgln = audln // 320
+        audio = pad_or_trim(audio)
+        mel = log_mel_spectrogram(audio).to(self.dev)
+        with torch.no_grad():
+            ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
+            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
+            return ppg[None,:,:].transpose(1, 2)

vencoder/__init__.py ADDED Viewed

File without changes

vencoder/encoder.py ADDED Viewed

	@@ -0,0 +1,12 @@

+class SpeechEncoder(object):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        self.model = None  #This is Model
+        self.hidden_dim = 768
+        pass
+    def encoder(self,wav):
+        '''
+        input: wav:[batchsize,signal_length]
+        output: embedding:[batchsize,wav_frame,hidden_dim]
+        '''
+        pass

vencoder/hubert/__init__.py ADDED Viewed

File without changes

vencoder/hubert/hubert_model.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import copy
+import random
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as t_func
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+class Hubert(nn.Module):
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+        super().__init__()
+        self._mask = mask
+        self.feature_extractor = FeatureExtractor()
+        self.feature_projection = FeatureProjection()
+        self.positional_embedding = PositionalConvEmbedding()
+        self.norm = nn.LayerNorm(768)
+        self.dropout = nn.Dropout(0.1)
+        self.encoder = TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                768, 12, 3072, activation="gelu", batch_first=True
+            ),
+            12,
+        )
+        self.proj = nn.Linear(768, 256)
+        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        mask = None
+        if self.training and self._mask:
+            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+            x[mask] = self.masked_spec_embed.to(x.dtype)
+        return x, mask
+    def encode(
+            self, x: torch.Tensor, layer: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = self.feature_extractor(x)
+        x = self.feature_projection(x.transpose(1, 2))
+        x, mask = self.mask(x)
+        x = x + self.positional_embedding(x)
+        x = self.dropout(self.norm(x))
+        x = self.encoder(x, output_layer=layer)
+        return x, mask
+    def logits(self, x: torch.Tensor) -> torch.Tensor:
+        logits = torch.cosine_similarity(
+            x.unsqueeze(2),
+            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+            dim=-1,
+        )
+        return logits / 0.1
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        x, mask = self.encode(x)
+        x = self.proj(x)
+        logits = self.logits(x)
+        return logits, mask
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+    @torch.inference_mode()
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+class FeatureExtractor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+        self.norm0 = nn.GroupNorm(512, 512)
+        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
+        return x
+class FeatureProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = nn.LayerNorm(512)
+        self.projection = nn.Linear(512, 768)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            768,
+            768,
+            kernel_size=128,
+            padding=128 // 2,
+            groups=16,
+        )
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x.transpose(1, 2))
+        x = t_func.gelu(x[:, :, :-1])
+        return x.transpose(1, 2)
+class TransformerEncoder(nn.Module):
+    def __init__(
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+    ) -> None:
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+    def forward(
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
+    ) -> torch.Tensor:
+        output = src
+        for layer in self.layers[:output_layer]:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+            )
+        return output
+def _compute_mask(
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
+) -> torch.Tensor:
+    batch_size, sequence_length = shape
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+        )
+    # compute number of masked spans in batch
+    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+    num_masked_spans = max(num_masked_spans, min_masks)
+    # make sure num masked indices <= sequence_length
+    if num_masked_spans * mask_length > sequence_length:
+        num_masked_spans = sequence_length // mask_length
+    # SpecAugment mask to fill
+    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+    # uniform distribution to sample from, make sure that offset samples are < sequence_length
+    uniform_dist = torch.ones(
+        (batch_size, sequence_length - (mask_length - 1)), device=device
+    )
+    # get random indices to mask
+    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+    # expand masked indices to masked spans
+    mask_indices = (
+        mask_indices.unsqueeze(dim=-1)
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    offsets = (
+        torch.arange(mask_length, device=device)[None, None, :]
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    mask_idxs = mask_indices + offsets
+    # scatter indices to mask
+    mask = mask.scatter(1, mask_idxs, True)
+    return mask
+def hubert_soft(
+        path: str,
+) -> HubertSoft:
+    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+    Args:
+        path (str): path of a pretrained model
+    """
+    hubert = HubertSoft()
+    checkpoint = torch.load(path)
+    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+    hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert

vencoder/hubert/hubert_model_onnx.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import copy
+import random
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as t_func
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+class Hubert(nn.Module):
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+        super().__init__()
+        self._mask = mask
+        self.feature_extractor = FeatureExtractor()
+        self.feature_projection = FeatureProjection()
+        self.positional_embedding = PositionalConvEmbedding()
+        self.norm = nn.LayerNorm(768)
+        self.dropout = nn.Dropout(0.1)
+        self.encoder = TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                768, 12, 3072, activation="gelu", batch_first=True
+            ),
+            12,
+        )
+        self.proj = nn.Linear(768, 256)
+        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        mask = None
+        if self.training and self._mask:
+            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+            x[mask] = self.masked_spec_embed.to(x.dtype)
+        return x, mask
+    def encode(
+            self, x: torch.Tensor, layer: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = self.feature_extractor(x)
+        x = self.feature_projection(x.transpose(1, 2))
+        x, mask = self.mask(x)
+        x = x + self.positional_embedding(x)
+        x = self.dropout(self.norm(x))
+        x = self.encoder(x, output_layer=layer)
+        return x, mask
+    def logits(self, x: torch.Tensor) -> torch.Tensor:
+        logits = torch.cosine_similarity(
+            x.unsqueeze(2),
+            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+            dim=-1,
+        )
+        return logits / 0.1
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+    def forward(self, x):
+        return self.units(x)
+class FeatureExtractor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+        self.norm0 = nn.GroupNorm(512, 512)
+        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
+        return x
+class FeatureProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = nn.LayerNorm(512)
+        self.projection = nn.Linear(512, 768)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            768,
+            768,
+            kernel_size=128,
+            padding=128 // 2,
+            groups=16,
+        )
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x.transpose(1, 2))
+        x = t_func.gelu(x[:, :, :-1])
+        return x.transpose(1, 2)
+class TransformerEncoder(nn.Module):
+    def __init__(
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+    ) -> None:
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+    def forward(
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
+    ) -> torch.Tensor:
+        output = src
+        for layer in self.layers[:output_layer]:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+            )
+        return output
+def _compute_mask(
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
+) -> torch.Tensor:
+    batch_size, sequence_length = shape
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+        )
+    # compute number of masked spans in batch
+    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+    num_masked_spans = max(num_masked_spans, min_masks)
+    # make sure num masked indices <= sequence_length
+    if num_masked_spans * mask_length > sequence_length:
+        num_masked_spans = sequence_length // mask_length
+    # SpecAugment mask to fill
+    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+    # uniform distribution to sample from, make sure that offset samples are < sequence_length
+    uniform_dist = torch.ones(
+        (batch_size, sequence_length - (mask_length - 1)), device=device
+    )
+    # get random indices to mask
+    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+    # expand masked indices to masked spans
+    mask_indices = (
+        mask_indices.unsqueeze(dim=-1)
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    offsets = (
+        torch.arange(mask_length, device=device)[None, None, :]
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    mask_idxs = mask_indices + offsets
+    # scatter indices to mask
+    mask = mask.scatter(1, mask_idxs, True)
+    return mask
+def hubert_soft(
+        path: str,
+) -> HubertSoft:
+    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+    Args:
+        path (str): path of a pretrained model
+    """
+    hubert = HubertSoft()
+    checkpoint = torch.load(path)
+    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+    hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert

vencoder/whisper/__init__.py ADDED Viewed

File without changes

vencoder/whisper/audio.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+from functools import lru_cache
+from typing import Union
+import ffmpeg
+import numpy as np
+import torch
+import torch.nn.functional as F
+from .utils import exact_div
+from librosa.filters import mel as librosa_mel_fn
+# hard-coded audio hyperparameters
+SAMPLE_RATE = 16000
+N_FFT = 400
+N_MELS = 80
+HOP_LENGTH = 160
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
+N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000: number of frames in a mel spectrogram input
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+    return array
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+        )
+    """
+    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
+    return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device)
+def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS):
+    """
+    Compute the log-Mel spectrogram of
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+    n_mels: int
+        The number of Mel-frequency filters, only 80 is supported
+    Returns
+    -------
+    torch.Tensor, shape = (80, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        if isinstance(audio, str):
+            audio = load_audio(audio)
+        audio = torch.from_numpy(audio)
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+    filters = mel_filters(audio.device, n_mels)
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    return log_spec

vencoder/whisper/decoding.py ADDED Viewed

	@@ -0,0 +1,712 @@

+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.distributions import Categorical
+from .audio import CHUNK_LENGTH
+from .tokenizer import Tokenizer, get_tokenizer
+from .utils import compression_ratio
+if TYPE_CHECKING:
+    from .model import Whisper
+@torch.no_grad()
+def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]:
+    """
+    Detect the spoken language in the audio, and return them as list of strings, along with the ids
+    of the most probable language tokens and the probability distribution over all language tokens.
+    This is performed outside the main decode loop in order to not interfere with kv-caching.
+    Returns
+    -------
+    language_tokens : Tensor, shape = (n_audio,)
+        ids of the most probable language tokens, which appears after the startoftranscript token.
+    language_probs : List[Dict[str, float]], length = n_audio
+        list of dictionaries containing the probability distribution over all languages.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer(model.is_multilingual)
+    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
+        raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+    # skip encoder forward pass if already-encoded audio features were given
+    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
+        mel = model.encoder(mel)
+    # forward pass using a single token, startoftranscript
+    n_audio = mel.shape[0]
+    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
+    logits = model.logits(x, mel)[:, 0]
+    # collect detected languages; suppress all non-language tokens
+    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
+    mask[list(tokenizer.all_language_tokens)] = False
+    logits[:, mask] = -np.inf
+    language_tokens = logits.argmax(dim=-1)
+    language_token_probs = logits.softmax(dim=-1).cpu()
+    language_probs = [
+        {
+            c: language_token_probs[i, j].item()
+            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
+        }
+        for i in range(n_audio)
+    ]
+    if single:
+        language_tokens = language_tokens[0]
+        language_probs = language_probs[0]
+    return language_tokens, language_probs
+@dataclass(frozen=True)
+class DecodingOptions:
+    task: str = "transcribe"  # whether to perform X->X "transcribe" or X->English "translate"
+    language: Optional[str] = None  # language that the audio is in; uses detected language if None
+    # sampling-related options
+    temperature: float = 0.0
+    sample_len: Optional[int] = None  # maximum number of tokens to sample
+    best_of: Optional[int] = None     # number of independent samples to collect, when t > 0
+    beam_size: Optional[int] = None   # number of beams in beam search, when t == 0
+    patience: Optional[float] = None  # patience in beam search (https://arxiv.org/abs/2204.05424)
+    # options for ranking generations (either beams or best-of-N samples)
+    length_penalty: Optional[float] = None   # "alpha" in Google NMT, None defaults to length norm
+    # prompt, prefix, and token suppression
+    prompt: Optional[Union[str, List[int]]] = None   # text or tokens for the previous context
+    prefix: Optional[Union[str, List[int]]] = None   # text or tokens to prefix the current context
+    suppress_blank: bool = True                      # this will suppress blank outputs
+    # list of tokens ids (or comma-separated token ids) to suppress
+    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
+    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
+    # timestamp sampling options
+    without_timestamps: bool = False              # use <|notimestamps|> to sample text tokens only
+    max_initial_timestamp: Optional[float] = 1.0  # the initial timestamp cannot be later than this
+    # implementation details
+    fp16: bool = True  # use fp16 for most of the calculation
+@dataclass(frozen=True)
+class DecodingResult:
+    audio_features: Tensor
+    language: str
+    language_probs: Optional[Dict[str, float]] = None
+    tokens: List[int] = field(default_factory=list)
+    text: str = ""
+    avg_logprob: float = np.nan
+    no_speech_prob: float = np.nan
+    temperature: float = np.nan
+    compression_ratio: float = np.nan
+class Inference:
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        """Perform a forward pass on the decoder and return per-token logits"""
+        raise NotImplementedError
+    def rearrange_kv_cache(self, source_indices) -> None:
+        """Update the key-value cache according to the updated beams"""
+        raise NotImplementedError
+    def cleanup_caching(self) -> None:
+        """Clean up any resources or hooks after decoding is finished"""
+        pass
+class PyTorchInference(Inference):
+    def __init__(self, model: "Whisper", initial_token_length: int):
+        self.model: "Whisper" = model
+        self.initial_token_length = initial_token_length
+        self.kv_cache = {}
+        self.hooks = []
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        if not self.kv_cache:
+            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
+        if tokens.shape[-1] > self.initial_token_length:
+            # only need to use the last token except in the first forward pass
+            tokens = tokens[:, -1:]
+        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
+    def cleanup_caching(self):
+        for hook in self.hooks:
+            hook.remove()
+        self.kv_cache = {}
+        self.hooks = []
+    def rearrange_kv_cache(self, source_indices):
+        for module, tensor in self.kv_cache.items():
+            # update the key/value cache to contain the selected sequences
+            self.kv_cache[module] = tensor[source_indices].detach()
+class SequenceRanker:
+    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]:
+        """
+        Given a list of groups of samples and their cumulative log probabilities,
+        return the indices of the samples in each group to select as the final result
+        """
+        raise NotImplementedError
+class MaximumLikelihoodRanker(SequenceRanker):
+    """
+    Select the sample with the highest log probabilities, penalized using either
+    a simple length normalization or Google NMT paper's length penalty
+    """
+    def __init__(self, length_penalty: Optional[float]):
+        self.length_penalty = length_penalty
+    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
+        def scores(logprobs, lengths):
+            result = []
+            for logprob, length in zip(logprobs, lengths):
+                if self.length_penalty is None:
+                    penalty = length
+                else:
+                    # from the Google NMT paper
+                    penalty = ((5 + length) / 6) ** self.length_penalty
+                result.append(logprob / penalty)
+            return result
+        # get the sequence with the highest score
+        lengths = [[len(t) for t in s] for s in tokens]
+        return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]
+class TokenDecoder:
+    def reset(self):
+        """Initialize any stateful variables for decoding a new sequence"""
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        """Specify how to select the next token, based on the current trace and logits
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        sum_logprobs : Tensor, shape = (n_batch)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
+            the tokens, appended with the selected next token
+        completed : bool
+            True if all sequences has reached the end of text
+        """
+        raise NotImplementedError
+    def finalize(
+        self, tokens: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
+        """Finalize search and return the final candidate sequences
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence
+        sum_logprobs : Tensor, shape = (n_audio, n_group)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Sequence[Sequence[Tensor]], length = n_audio
+            sequence of Tensors containing candidate token sequences, for each audio input
+        sum_logprobs : List[List[float]], length = n_audio
+            sequence of cumulative log probabilities corresponding to the above
+        """
+        raise NotImplementedError
+class GreedyDecoder(TokenDecoder):
+    def __init__(self, temperature: float, eot: int):
+        self.temperature = temperature
+        self.eot = eot
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        temperature = self.temperature
+        if temperature == 0:
+            next_tokens = logits.argmax(dim=-1)
+        else:
+            next_tokens = Categorical(logits=logits / temperature).sample()
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
+        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
+        next_tokens[tokens[:, -1] == self.eot] = self.eot
+        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
+        completed = (tokens[:, -1] == self.eot).all()
+        return tokens, completed
+    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
+        # make sure each sequence has at least one EOT token at the end
+        tokens = F.pad(tokens, (0, 1), value=self.eot)
+        return tokens, sum_logprobs.tolist()
+class BeamSearchDecoder(TokenDecoder):
+    def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None):
+        self.beam_size = beam_size
+        self.eot = eot
+        self.inference = inference
+        self.patience = patience or 1.0
+        self.max_candidates: int = round(beam_size * self.patience)
+        self.finished_sequences = None
+        assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})"
+    def reset(self):
+        self.finished_sequences = None
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        if tokens.shape[0] % self.beam_size != 0:
+            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")
+        n_audio = tokens.shape[0] // self.beam_size
+        if self.finished_sequences is None:  # for the first update
+            self.finished_sequences = [{} for _ in range(n_audio)]
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        next_tokens, source_indices, finished_sequences = [], [], []
+        for i in range(n_audio):
+            scores, sources, finished = {}, {}, {}
+            # STEP 1: calculate the cumulative log probabilities for possible candidates
+            for j in range(self.beam_size):
+                idx = i * self.beam_size + j
+                prefix = tokens[idx].tolist()
+                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
+                    new_logprob = (sum_logprobs[idx] + logprob).item()
+                    sequence = tuple(prefix + [token.item()])
+                    scores[sequence] = new_logprob
+                    sources[sequence] = idx
+            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
+            saved = 0
+            for sequence in sorted(scores, key=scores.get, reverse=True):
+                if sequence[-1] == self.eot:
+                    finished[sequence] = scores[sequence]
+                else:
+                    sum_logprobs[len(next_tokens)] = scores[sequence]
+                    next_tokens.append(sequence)
+                    source_indices.append(sources[sequence])
+                    saved += 1
+                    if saved == self.beam_size:
+                        break
+            finished_sequences.append(finished)
+        tokens = torch.tensor(next_tokens, device=tokens.device)
+        self.inference.rearrange_kv_cache(source_indices)
+        # add newly finished sequences to self.finished_sequences
+        assert len(self.finished_sequences) == len(finished_sequences)
+        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
+            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
+                if len(previously_finished) >= self.max_candidates:
+                    break  # the candidate list is full
+                previously_finished[seq] = newly_finished[seq]
+        # mark as completed if all audio has enough number of samples
+        completed = all(
+            len(sequences) >= self.max_candidates for sequences in self.finished_sequences
+        )
+        return tokens, completed
+    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
+        # collect all finished sequences, including patience, and add unfinished ones if not enough
+        sum_logprobs = sum_logprobs.cpu()
+        for i, sequences in enumerate(self.finished_sequences):
+            if len(sequences) < self.beam_size:  # when not enough sequences are finished
+                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
+                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
+                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
+                    if len(sequences) >= self.beam_size:
+                        break
+        tokens: List[List[Tensor]] = [
+            [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences
+        ]
+        sum_logprobs: List[List[float]] = [
+            list(sequences.values()) for sequences in self.finished_sequences
+        ]
+        return tokens, sum_logprobs
+class LogitFilter:
+    def apply(self, logits: Tensor, tokens: Tensor) -> None:
+        """Apply any filtering or masking to logits in-place
+        Parameters
+        ----------
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        """
+        raise NotImplementedError
+class SuppressBlank(LogitFilter):
+    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+    def apply(self, logits: Tensor, tokens: Tensor):
+        if tokens.shape[1] == self.sample_begin:
+            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
+class SuppressTokens(LogitFilter):
+    def __init__(self, suppress_tokens: Sequence[int]):
+        self.suppress_tokens = list(suppress_tokens)
+    def apply(self, logits: Tensor, tokens: Tensor):
+        logits[:, self.suppress_tokens] = -np.inf
+class ApplyTimestampRules(LogitFilter):
+    def __init__(
+        self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int]
+    ):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+        self.max_initial_timestamp_index = max_initial_timestamp_index
+    def apply(self, logits: Tensor, tokens: Tensor):
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        if self.tokenizer.no_timestamps is not None:
+            logits[:, self.tokenizer.no_timestamps] = -np.inf
+        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
+        for k in range(tokens.shape[0]):
+            seq = [t for t in tokens[k, self.sample_begin :].tolist()]
+            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
+            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
+            if last_was_timestamp:
+                if penultimate_was_timestamp:  # has to be non-timestamp
+                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
+                else:  # cannot be normal text tokens
+                    logits[k, : self.tokenizer.eot] = -np.inf
+        if tokens.shape[1] == self.sample_begin:
+            # suppress generating non-timestamp tokens at the beginning
+            logits[:, : self.tokenizer.timestamp_begin] = -np.inf
+            # apply the `max_initial_timestamp` option
+            if self.max_initial_timestamp_index is not None:
+                last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
+                logits[:, last_allowed + 1 :] = -np.inf
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        for k in range(tokens.shape[0]):
+            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1)
+            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
+            if timestamp_logprob > max_text_token_logprob:
+                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
+class DecodingTask:
+    inference: Inference
+    sequence_ranker: SequenceRanker
+    decoder: TokenDecoder
+    logit_filters: List[LogitFilter]
+    def __init__(self, model: "Whisper", options: DecodingOptions):
+        self.model = model
+        language = options.language or "en"
+        tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task)
+        self.tokenizer: Tokenizer = tokenizer
+        self.options: DecodingOptions = self._verify_options(options)
+        self.n_group: int = options.beam_size or options.best_of or 1
+        self.n_ctx: int = model.dims.n_text_ctx
+        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2
+        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
+        if self.options.without_timestamps:
+            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps
+        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
+        self.sample_begin: int = len(self.initial_tokens)
+        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)
+        # inference: implements the forward pass through the decoder, including kv caching
+        self.inference = PyTorchInference(model, len(self.initial_tokens))
+        # sequence ranker: implements how to rank a group of sampled sequences
+        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)
+        # decoder: implements how to select the next tokens, given the autoregressive distribution
+        if options.beam_size is not None:
+            self.decoder = BeamSearchDecoder(
+                options.beam_size, tokenizer.eot, self.inference, options.patience
+            )
+        else:
+            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)
+        # logit filters: applies various rules to suppress or penalize certain tokens
+        self.logit_filters = []
+        if self.options.suppress_blank:
+            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
+        if self.options.suppress_tokens:
+            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
+        if not options.without_timestamps:
+            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
+            max_initial_timestamp_index = None
+            if options.max_initial_timestamp:
+                max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision)
+            self.logit_filters.append(
+                ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index)
+            )
+    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
+        if options.beam_size is not None and options.best_of is not None:
+            raise ValueError("beam_size and best_of can't be given together")
+        if options.temperature == 0:
+            if options.best_of is not None:
+                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
+        if options.patience is not None and options.beam_size is None:
+            raise ValueError("patience requires beam_size to be given")
+        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
+            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")
+        return options
+    def _get_initial_tokens(self) -> Tuple[int]:
+        tokens = list(self.sot_sequence)
+        prefix = self.options.prefix
+        prompt = self.options.prompt
+        if prefix:
+            prefix_tokens = (
+                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
+            )
+            if self.sample_len is not None:
+                max_prefix_len = self.n_ctx // 2 - self.sample_len
+                prefix_tokens = prefix_tokens[-max_prefix_len:]
+            tokens = tokens + prefix_tokens
+        if prompt:
+            prompt_tokens = (
+                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
+            )
+            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens
+        return tuple(tokens)
+    def _get_suppress_tokens(self) -> Tuple[int]:
+        suppress_tokens = self.options.suppress_tokens
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
+        suppress_tokens.extend(
+            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
+        )
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+        return tuple(sorted(set(suppress_tokens)))
+    def _get_audio_features(self, mel: Tensor):
+        if self.options.fp16:
+            mel = mel.half()
+        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
+            # encoded audio features are given; skip audio encoding
+            print("encoded audio features are given; skip audio encoding")
+            audio_features = mel
+        else:
+            print(mel.shape)
+            print("===============================")
+            audio_features = self.model.encoder(mel)
+        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
+            return TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")
+        return audio_features
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+        return languages, lang_probs
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        assert audio_features.shape[0] == tokens.shape[0]
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+                # apply the logit filters, e.g. for suppressing or applying penalty to
+                for logit_filter in self.logit_filters:
+                    logit_filter.apply(logits, tokens)
+                # expand the tokens tensor with the selected next tokens
+                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
+                if completed or tokens.shape[-1] > self.n_ctx:
+                    break
+        finally:
+            self.inference.cleanup_caching()
+        return tokens, sum_logprobs, no_speech_probs
+    @torch.no_grad()
+    def run(self, mel: Tensor) -> List[DecodingResult]:
+        self.decoder.reset()
+        tokenizer: Tokenizer = self.tokenizer
+        n_audio: int = mel.shape[0]
+        audio_features: Tensor = self._get_audio_features(mel)  # encoder forward pass
+        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)
+        # detect language if requested, overwriting the language token
+        languages, language_probs = self._detect_language(audio_features, tokens)
+        if self.options.task == "lang_id":
+            return [
+                DecodingResult(audio_features=features, language=language, language_probs=probs)
+                for features, language, probs in zip(audio_features, languages, language_probs)
+            ]
+        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
+        audio_features = audio_features.repeat_interleave(self.n_group, dim=0)
+        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
+        # call the main sampling loop
+        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
+        # reshape the tensors to have (n_audio, n_group) as the first two dimensions
+        audio_features = audio_features[:: self.n_group]
+        no_speech_probs = no_speech_probs[:: self.n_group]
+        assert audio_features.shape[0] == len(no_speech_probs) == n_audio
+        tokens = tokens.reshape(n_audio, self.n_group, -1)
+        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)
+        # get the final candidates for each group, and slice between the first sampled token and EOT
+        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
+        tokens: List[List[Tensor]] = [
+            [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens
+        ]
+        # select the top-ranked sample in each group
+        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
+        tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
+        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]
+        sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
+        avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)]
+        fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs)
+        if len(set(map(len, fields))) != 1:
+            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")
+        return [
+            DecodingResult(
+                audio_features=features,
+                language=language,
+                tokens=tokens,
+                text=text,
+                avg_logprob=avg_logprob,
+                no_speech_prob=no_speech_prob,
+                temperature=self.options.temperature,
+                compression_ratio=compression_ratio(text),
+            )
+            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields)
+        ]
+@torch.no_grad()
+def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]:
+    """
+    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
+    Parameters
+    ----------
+    model: Whisper
+        the Whisper model instance
+    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
+        A tensor containing the Mel spectrogram(s)
+    options: DecodingOptions
+        A dataclass that contains all necessary options for decoding 30-second segments
+    Returns
+    -------
+    result: Union[DecodingResult, List[DecodingResult]]
+        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
+    """
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+    result = DecodingTask(model, options).run(mel)
+    if single:
+        result = result[0]
+    return result

vencoder/whisper/model.py ADDED Viewed

	@@ -0,0 +1,269 @@

+from dataclasses import dataclass
+from typing import Dict
+from typing import Iterable, Optional
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch import nn
+from .decoding import detect_language as detect_language_function, decode as decode_function
+@dataclass
+class ModelDimensions:
+    n_mels: int
+    n_audio_ctx: int
+    n_audio_state: int
+    n_audio_head: int
+    n_audio_layer: int
+    n_vocab: int
+    n_text_ctx: int
+    n_text_state: int
+    n_text_head: int
+    n_text_layer: int
+class LayerNorm(nn.LayerNorm):
+    def forward(self, x: Tensor) -> Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class Linear(nn.Linear):
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(
+            x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype)
+        )
+class Conv1d(nn.Conv1d):
+    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        q = self.query(x)
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+        wv, qk = self.qkv_attention(q, k, v, mask)
+        return self.out(wv), qk
+    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
+        n_batch, n_ctx, n_state = q.shape
+        scale = (n_state // self.n_head) ** -0.25
+        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+        qk = q @ k
+        if mask is not None:
+            qk = qk + mask[:n_ctx, :n_ctx]
+        qk = qk.float()
+        w = F.softmax(qk, dim=-1).to(q.dtype)
+        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__()
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.attn_ln = LayerNorm(n_state)
+        self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None
+        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
+        self.mlp_ln = LayerNorm(n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
+        if self.cross_attn:
+            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+class AudioEncoder(nn.Module):
+    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+        )
+        self.ln_post = LayerNorm(n_state)
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+        x = x.permute(0, 2, 1)
+        len_x = x.shape[1]
+        len_e = self.positional_embedding.shape[0]
+        assert len_x <= len_e, "incorrect audio shape"
+        pos_e = self.positional_embedding[:len_x, :]
+        x = (x + pos_e).to(x.dtype)
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_post(x)
+        return x
+class TextDecoder(nn.Module):
+    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__()
+        self.token_embedding = nn.Embedding(n_vocab, n_state)
+        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
+        )
+        self.ln = LayerNorm(n_state)
+        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
+        self.register_buffer("mask", mask, persistent=False)
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+            the encoded audio features to be attended on
+        """
+        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
+        x = x.to(xa.dtype)
+        for block in self.blocks:
+            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
+        x = self.ln(x)
+        logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float()
+        return logits
+class Whisper(nn.Module):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__()
+        self.dims = dims
+        self.encoder = AudioEncoder(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        )
+        self.decoder = TextDecoder(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        )
+    def embed_audio(self, mel: torch.Tensor):
+        return self.encoder(mel)
+    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
+        return self.decoder(tokens, audio_features)
+    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    @property
+    def is_multilingual(self):
+        return self.dims.n_vocab == 51865
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        """
+        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+        tensors calculated for the previous positions. This method returns a dictionary that stores
+        all caches, and the necessary hooks for the key and value projection modules that save the
+        intermediate tensors to be reused during later calculations.
+        Returns
+        -------
+        cache : Dict[nn.Module, torch.Tensor]
+            A dictionary object mapping the key/value projection modules to its cache
+        hooks : List[RemovableHandle]
+            List of PyTorch RemovableHandle objects to stop the hooks to be called
+        """
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]:
+                cache[module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = torch.cat([cache[module], output], dim=1).detach()
+            return cache[module]
+        def install_hooks(layer: nn.Module):
+            if isinstance(layer, MultiHeadAttention):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+    detect_language = detect_language_function
+    decode = decode_function

vencoder/whisper/tokenizer.py ADDED Viewed

	@@ -0,0 +1,331 @@

+import os
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from transformers import GPT2TokenizerFast
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+@dataclass(frozen=True)
+class Tokenizer:
+    """A thin wrapper around `GPT2TokenizerFast` providing quick access to special tokens"""
+    tokenizer: "GPT2TokenizerFast"
+    language: Optional[str]
+    sot_sequence: Tuple[int]
+    def encode(self, text, **kwargs):
+        return self.tokenizer.encode(text, **kwargs)
+    def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs):
+        return self.tokenizer.decode(token_ids, **kwargs)
+    def decode_with_timestamps(self, tokens) -> str:
+        """
+        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
+        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
+        """
+        outputs = [[]]
+        for token in tokens:
+            if token >= self.timestamp_begin:
+                timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
+                outputs.append(timestamp)
+                outputs.append([])
+            else:
+                outputs[-1].append(token)
+        outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
+        return "".join(outputs)
+    @property
+    @lru_cache()
+    def eot(self) -> int:
+        return self.tokenizer.eos_token_id
+    @property
+    @lru_cache()
+    def sot(self) -> int:
+        return self._get_single_token_id("<|startoftranscript|>")
+    @property
+    @lru_cache()
+    def sot_lm(self) -> int:
+        return self._get_single_token_id("<|startoflm|>")
+    @property
+    @lru_cache()
+    def sot_prev(self) -> int:
+        return self._get_single_token_id("<|startofprev|>")
+    @property
+    @lru_cache()
+    def no_speech(self) -> int:
+        return self._get_single_token_id("<|nospeech|>")
+    @property
+    @lru_cache()
+    def no_timestamps(self) -> int:
+        return self._get_single_token_id("<|notimestamps|>")
+    @property
+    @lru_cache()
+    def timestamp_begin(self) -> int:
+        return self.tokenizer.all_special_ids[-1] + 1
+    @property
+    @lru_cache()
+    def language_token(self) -> int:
+        """Returns the token id corresponding to the value of the `language` field"""
+        if self.language is None:
+            raise ValueError(f"This tokenizer does not have language token configured")
+        additional_tokens = dict(
+            zip(
+                self.tokenizer.additional_special_tokens,
+                self.tokenizer.additional_special_tokens_ids,
+            )
+        )
+        candidate = f"<|{self.language}|>"
+        if candidate in additional_tokens:
+            return additional_tokens[candidate]
+        raise KeyError(f"Language {self.language} not found in tokenizer.")
+    @property
+    @lru_cache()
+    def all_language_tokens(self) -> Tuple[int]:
+        result = []
+        for token, token_id in zip(
+            self.tokenizer.additional_special_tokens,
+            self.tokenizer.additional_special_tokens_ids,
+        ):
+            if token.strip("<|>") in LANGUAGES:
+                result.append(token_id)
+        return tuple(result)
+    @property
+    @lru_cache()
+    def all_language_codes(self) -> Tuple[str]:
+        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
+    @property
+    @lru_cache()
+    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
+        return tuple(list(self.sot_sequence) + [self.no_timestamps])
+    @property
+    @lru_cache()
+    def non_speech_tokens(self) -> Tuple[int]:
+        """
+        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
+        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+        - ♪♪♪
+        - ( SPEAKING FOREIGN LANGUAGE )
+        - [DAVID] Hey there,
+        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
+        """
+        symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
+        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+        # symbols that may be a single token or multiple tokens depending on the tokenizer.
+        # In case they're multiple tokens, suppress the first token, which is safe because:
+        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
+        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
+        miscellaneous = set("♩♪♫♬♭♮♯")
+        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
+        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
+        result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
+        for symbol in symbols + list(miscellaneous):
+            for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
+                if len(tokens) == 1 or symbol in miscellaneous:
+                    result.add(tokens[0])
+        return tuple(sorted(result))
+    def _get_single_token_id(self, text) -> int:
+        tokens = self.tokenizer.encode(text)
+        assert len(tokens) == 1, f"{text} is not encoded as a single token"
+        return tokens[0]
+@lru_cache(maxsize=None)
+def build_tokenizer(name: str = "gpt2"):
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    path = os.path.join(os.path.dirname(__file__), "assets", name)
+    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+    specials = [
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+    ]
+    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+    return tokenizer
+@lru_cache(maxsize=None)
+def get_tokenizer(
+    multilingual: bool,
+    *,
+    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
+    language: Optional[str] = None,
+) -> Tokenizer:
+    if language is not None:
+        language = language.lower()
+        if language not in LANGUAGES:
+            if language in TO_LANGUAGE_CODE:
+                language = TO_LANGUAGE_CODE[language]
+            else:
+                raise ValueError(f"Unsupported language: {language}")
+    if multilingual:
+        tokenizer_name = "multilingual"
+        task = task or "transcribe"
+        language = language or "en"
+    else:
+        tokenizer_name = "gpt2"
+        task = None
+        language = None
+    tokenizer = build_tokenizer(name=tokenizer_name)
+    all_special_ids: List[int] = tokenizer.all_special_ids
+    sot: int = all_special_ids[1]
+    translate: int = all_special_ids[-6]
+    transcribe: int = all_special_ids[-5]
+    langs = tuple(LANGUAGES.keys())
+    sot_sequence = [sot]
+    if language is not None:
+        sot_sequence.append(sot + 1 + langs.index(language))
+    if task is not None:
+        sot_sequence.append(transcribe if task == "transcribe" else translate)
+    return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(sot_sequence))

vencoder/whisper/utils.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import json
+import os
+import sys
+import zlib
+from typing import Callable, TextIO
+system_encoding = sys.getdefaultencoding()
+if system_encoding != "utf-8":
+    def make_safe(string):
+        # replaces any character not representable using the system default encoding with an '?',
+        # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
+        return string.encode(system_encoding, errors="replace").decode(system_encoding)
+else:
+    def make_safe(string):
+        # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
+        return string
+def exact_div(x, y):
+    assert x % y == 0
+    return x // y
+def str2bool(string):
+    str2val = {"True": True, "False": False}
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
+def optional_int(string):
+    return None if string == "None" else int(string)
+def optional_float(string):
+    return None if string == "None" else float(string)
+def compression_ratio(text) -> float:
+    text_bytes = text.encode("utf-8")
+    return len(text_bytes) / len(zlib.compress(text_bytes))
+def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+class ResultWriter:
+    extension: str
+    def __init__(self, output_dir: str):
+        self.output_dir = output_dir
+    def __call__(self, result: dict, audio_path: str):
+        audio_basename = os.path.basename(audio_path)
+        output_path = os.path.join(self.output_dir, audio_basename + "." + self.extension)
+        with open(output_path, "w", encoding="utf-8") as f:
+            self.write_result(result, file=f)
+    def write_result(self, result: dict, file: TextIO):
+        raise NotImplementedError
+class WriteTXT(ResultWriter):
+    extension: str = "txt"
+    def write_result(self, result: dict, file: TextIO):
+        for segment in result["segments"]:
+            print(segment['text'].strip(), file=file, flush=True)
+class WriteVTT(ResultWriter):
+    extension: str = "vtt"
+    def write_result(self, result: dict, file: TextIO):
+        print("WEBVTT\n", file=file)
+        for segment in result["segments"]:
+            print(
+                f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
+                f"{segment['text'].strip().replace('-->', '->')}\n",
+                file=file,
+                flush=True,
+            )
+class WriteSRT(ResultWriter):
+    extension: str = "srt"
+    def write_result(self, result: dict, file: TextIO):
+        for i, segment in enumerate(result["segments"], start=1):
+            # write srt lines
+            print(
+                f"{i}\n"
+                f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
+                f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
+                f"{segment['text'].strip().replace('-->', '->')}\n",
+                file=file,
+                flush=True,
+            )
+class WriteTSV(ResultWriter):
+    """
+    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
+    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
+    Using integer milliseconds as start and end times means there's no chance of interference from
+    an environment setting a language encoding that causes the decimal in a floating point number
+    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
+    """
+    extension: str = "tsv"
+    def write_result(self, result: dict, file: TextIO):
+        print("start", "end", "text", sep="\t", file=file)
+        for segment in result["segments"]:
+            print(round(1000 * segment['start']), file=file, end="\t")
+            print(round(1000 * segment['end']), file=file, end="\t")
+            print(segment['text'].strip().replace("\t", " "), file=file, flush=True)
+class WriteJSON(ResultWriter):
+    extension: str = "json"
+    def write_result(self, result: dict, file: TextIO):
+        json.dump(result, file)
+def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]:
+    writers = {
+        "txt": WriteTXT,
+        "vtt": WriteVTT,
+        "srt": WriteSRT,
+        "tsv": WriteTSV,
+        "json": WriteJSON,
+    }
+    if output_format == "all":
+        all_writers = [writer(output_dir) for writer in writers.values()]
+        def write_all(result: dict, file: TextIO):
+            for writer in all_writers:
+                writer(result, file)
+        return write_all
+    return writers[output_format](output_dir)