jlondonobo committed
Commit 75d8ce0 · 1 Parent(s): 1be8515

🌟 convert to whisper

Files changed (3)
  1. app.py +9 -11
  2. hf_to_whisper.py +70 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -1,22 +1,20 @@
 import gradio as gr
 import pytube as pt
 import torch
-from huggingface_hub import model_info
-from transformers import pipeline
+import whisper
+from hf_to_whisper import write_whisper_model_to_memory
+import os
 
 MODEL_NAME = "jlondonobo/whisper-medium-pt" #this always needs to stay in line 8 :D sorry for the hackiness
 lang = "pt"
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
+local_model_path = "whisper-pt.pt"
+if not os.path.exists(local_model_path):
+    write_whisper_model_to_memory(MODEL_NAME, local_model_path)
 
-pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+model = whisper.load_model(local_model_path)
 
 def transcribe(microphone, file_upload):
     warn_output = ""
@@ -31,7 +29,7 @@ def transcribe(microphone, file_upload):
 
     file = microphone if microphone is not None else file_upload
 
-    text = pipe(file)["text"]
+    text = model.transcribe(file)["text"]
 
     return warn_output + text
 
@@ -51,7 +49,7 @@ def yt_transcribe(yt_url):
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")
 
-    text = pipe("audio.mp3")["text"]
+    text = model.transcribe("audio.mp3", language=lang)["text"]
 
     return html_embed_str, text

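In the last hunk, note that openai-whisper's transcribe() gathers unknown keyword arguments into **decode_options and forwards them to DecodingOptions, so decoding options such as the language are passed as plain keywords rather than as a dict. A minimal sketch of exercising the converted checkpoint outside Gradio, assuming the step above has already written whisper-pt.pt and that audio.mp3 is any local audio file (both names are taken from app.py):

import whisper

# load_model() accepts a filesystem path as well as an official model name,
# so the converted checkpoint loads like a built-in one.
model = whisper.load_model("whisper-pt.pt")

# Extra keyword arguments flow into DecodingOptions; language and task are
# therefore plain kwargs, not a decode_options dict.
result = model.transcribe("audio.mp3", language="pt", task="transcribe")
print(result["text"])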
hf_to_whisper.py ADDED
@@ -0,0 +1,70 @@
+# Original script: bayartsogt-ya/whisper-multiple-hf-datasets
+from copy import deepcopy
+import torch
+from transformers import WhisperForConditionalGeneration
+
+
+WHISPER_MAPPING = {
+    "layers": "blocks",
+    "fc1": "mlp.0",
+    "fc2": "mlp.2",
+    "final_layer_norm": "mlp_ln",
+    ".self_attn.q_proj": ".attn.query",
+    ".self_attn.k_proj": ".attn.key",
+    ".self_attn.v_proj": ".attn.value",
+    ".self_attn_layer_norm": ".attn_ln",
+    ".self_attn.out_proj": ".attn.out",
+    ".encoder_attn.q_proj": ".cross_attn.query",
+    ".encoder_attn.k_proj": ".cross_attn.key",
+    ".encoder_attn.v_proj": ".cross_attn.value",
+    ".encoder_attn_layer_norm": ".cross_attn_ln",
+    ".encoder_attn.out_proj": ".cross_attn.out",
+    "decoder.layer_norm.": "decoder.ln.",
+    "encoder.layer_norm.": "encoder.ln_post.",
+    "embed_tokens": "token_embedding",
+    "encoder.embed_positions.weight": "encoder.positional_embedding",
+    "decoder.embed_positions.weight": "decoder.positional_embedding",
+    "layer_norm": "ln_post",
+}
+
+
+def rename_keys(s_dict):
+    # Rewrite every Hugging Face key into its openai-whisper counterpart.
+    keys = list(s_dict.keys())
+    for key in keys:
+        new_key = key
+        for k, v in WHISPER_MAPPING.items():
+            if k in key:
+                new_key = new_key.replace(k, v)
+
+        print(f"{key} -> {new_key}")
+
+        s_dict[new_key] = s_dict.pop(key)
+    return s_dict
+
+
+def write_whisper_model_to_memory(
+    hf_model_name_or_path: str,
+    whisper_state_path: str
+):
+    transformer_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)
+    config = transformer_model.config
+
+    # Build the dims dict that whisper's ModelDimensions expects
+    dims = {
+        'n_mels': config.num_mel_bins,
+        'n_vocab': config.vocab_size,
+        'n_audio_ctx': config.max_source_positions,
+        'n_audio_state': config.d_model,
+        'n_audio_head': config.encoder_attention_heads,
+        'n_audio_layer': config.encoder_layers,
+        'n_text_ctx': config.max_target_positions,
+        'n_text_state': config.d_model,
+        'n_text_head': config.decoder_attention_heads,
+        'n_text_layer': config.decoder_layers
+    }
+
+    state_dict = deepcopy(transformer_model.model.state_dict())
+    state_dict = rename_keys(state_dict)
+
+    torch.save({"dims": dims, "model_state_dict": state_dict}, whisper_state_path)
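
The converter only renames state-dict keys and records model dimensions; the tensors themselves are copied unchanged, so the written file should load through whisper.load_model() exactly like an official checkpoint. A quick sanity-check sketch, assuming openai-whisper is installed; openai/whisper-tiny is chosen only to keep the download small, and any Hugging Face Whisper checkpoint should convert the same way:

import torch
import whisper
from hf_to_whisper import write_whisper_model_to_memory

# Convert a small Hugging Face checkpoint and write it to disk.
write_whisper_model_to_memory("openai/whisper-tiny", "whisper-tiny.pt")

# The saved file carries exactly the two keys whisper.load_model() reads.
checkpoint = torch.load("whisper-tiny.pt", map_location="cpu")
assert set(checkpoint) == {"dims", "model_state_dict"}

# load_model() rebuilds the architecture from "dims" and then does a strict
# state-dict load, so any fragment missing from WHISPER_MAPPING surfaces
# here as a key error rather than as silently wrong transcriptions.
model = whisper.load_model("whisper-tiny.pt")
print(model.dims)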
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 git+https://github.com/huggingface/transformers
+git+https://github.com/openai/whisper.git
 torch
 pytube