Irpan committed on
Commit
bef8623
·
1 Parent(s): ef107e3
Files changed (3) hide show
  1. app.py +16 -21
  2. asr.py +2 -2
  3. tts.py +23 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import asr
 
3
  # from tts import synthesize
4
 
5
 
@@ -24,31 +25,25 @@ mms_transcribe = gr.Interface(
24
  allow_flagging="never",
25
  )
26
 
27
- # mms_synthesize = gr.Interface(
28
- # fn=synthesize,
29
- # inputs=[
30
- # gr.Text(label="Input text"),
31
- # ],
32
- # outputs=[
33
- # gr.Audio(label="Generated Audio", type="numpy"),
34
- # gr.Text(label="Filtered text after removing OOVs"),
35
- # ],
36
- # #examples=TTS_EXAMPLES,
37
- # title="Text-to-speech",
38
- # description=("Generate audio from input text."),
39
- # allow_flagging="never",
40
- # )
41
 
42
  tabbed_interface = gr.TabbedInterface(
43
- [mms_transcribe],
44
- ["Speech-to-text"],
45
  )
46
 
47
- # tabbed_interface = gr.TabbedInterface(
48
- # [mms_transcribe, mms_synthesize],
49
- # ["Speech-to-text", "Text-to-speech"],
50
- # )
51
-
52
  with gr.Blocks() as demo:
53
  tabbed_interface.render()
54
 
 
1
  import gradio as gr
2
  import asr
3
+ import tts
4
  # from tts import synthesize
5
 
6
 
 
25
  allow_flagging="never",
26
  )
27
 
28
# Text-to-speech tab. NOTE: tts.synthesize takes (text, model_id), but this
# interface exposes only a text input — bind the single registered model key
# here so the call does not raise TypeError for the missing argument.
mms_synthesize = gr.Interface(
    fn=lambda text: tts.synthesize(text, "Meta-MMS"),
    inputs=[
        gr.Text(label="Input text"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    # examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description=("Generate audio from input text."),
    allow_flagging="never",
)
 
41
 
42
# Combine the ASR and TTS demos into a single tabbed app.
tabbed_interface = gr.TabbedInterface(
    interface_list=[mms_transcribe, mms_synthesize],
    tab_names=["Speech-to-text", "Text-to-speech"],
)

with gr.Blocks() as demo:
    tabbed_interface.render()
49
 
asr.py CHANGED
@@ -12,7 +12,7 @@ import numpy as np
12
 
13
  # Load processor and model
14
  models_info = {
15
- "openai/whisper-small-uzbek": {
16
  "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
17
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
18
  "ctc_model": False
@@ -27,7 +27,7 @@ models_info = {
27
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
28
  "ctc_model": False
29
  },
30
- "facebook/mms-1b-all": {
31
  "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
32
  "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
33
  "ctc_model": True
 
12
 
13
  # Load processor and model
14
  models_info = {
15
+ "OpenAI-Whisper-Uzbek": {
16
  "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
17
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
18
  "ctc_model": False
 
27
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
28
  "ctc_model": False
29
  },
30
+ "Meta-MMS": {
31
  "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
32
  "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
33
  "ctc_model": True
tts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import VitsModel, AutoTokenizer
import torch

# Load tokenizer and model once at import time so every request reuses them.
models_info = {
    "Meta-MMS": {
        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
    },
}


def synthesize(text, model_id="Meta-MMS"):
    """Synthesize speech for *text* with the VITS model registered under *model_id*.

    Args:
        text: Input string to vocalize.
        model_id: Key into ``models_info``. Defaults to "Meta-MMS" (the only
            registered model) so callers that pass just the text still work.

    Returns:
        ``(sampling_rate, waveform)`` in the order ``gr.Audio(type="numpy")``
        expects: an int sample rate followed by a 1-D float numpy array.

    Raises:
        KeyError: if *model_id* is not a key of ``models_info``.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    processor = models_info[model_id]["processor"]
    model = models_info[model_id]["model"].to(device)
    inputs = processor(text, return_tensors="pt").to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Use the model's own rate (16 kHz for MMS-TTS) rather than a hard-coded
    # 22050, which would make playback noticeably too fast.
    sampling_rate = model.config.sampling_rate

    # Gradio's numpy audio format is (sample_rate, np.ndarray), not a tensor;
    # squeeze drops the leading batch dimension of the (1, T) waveform.
    return (sampling_rate, output.squeeze().cpu().numpy())