Spaces:

CVMX-jaca-tonos
/

Identificar-lenguas-y-frases

Sleeping

App Files Files Community

lucio committed on May 8, 2022

Commit

a1f131a

•

1 Parent(s): 8997b10

try using tabs

Browse files

Files changed (1) hide show

app.py +139 -60

app.py CHANGED Viewed

@@ -14,16 +14,100 @@ import librosa
 import torchaudio
 from speechbrain.pretrained import EncoderClassifier
 # initialize language ID model
 lang_classifier = EncoderClassifier.from_hparams(
     source="speechbrain/lang-id-commonlanguage_ecapa",
     savedir="pretrained_models/lang-id-commonlanguage_ecapa"
 )
-def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
-    return pipeline("automatic-speech-recognition", model=model_path)
-# download STT model
 model_info = {
     "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
     "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
@@ -32,7 +116,28 @@ model_info = {
     "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
 }
 STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}
 def client(audio_data: np.array, sample_rate: int, default_lang: str):
@@ -52,37 +157,19 @@ def client(audio_data: np.array, sample_rate: int, default_lang: str):
     print(default_lang, text_lab)
     if text_lab == 'Spanish':
-        text_lab = 'español'
         asr_pipeline = STT_MODELS['español']
         result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
     else:
-        text_lab = default_lang
         ds = STT_MODELS[default_lang]
         result = ds.stt(coqui_audio)
     return f"{text_lab}: {result}"
-def load_coqui_models(language):
-    model_path, file_name = model_info.get(language, ("", ""))
-    if not exists(file_name):
-        print(f"Downloading {model_path}")
-        r = requests.get(model_path, allow_redirects=True)
-        with open(file_name, 'wb') as file:
-            file.write(r.content)
-    else:
-        print(f"Found {file_name}. Skipping download...")
-    return Model(file_name)
-for lang in ('mixteco', 'chatino', 'totonaco'):
-    STT_MODELS[lang] = load_coqui_models(lang)
 def stt(default_lang: str, audio: Tuple[int, np.array]):
     sample_rate, audio = audio
     use_scorer = False
@@ -107,43 +194,35 @@ def _convert_audio(audio_data: np.array, sample_rate: int):
     output_audio.seek(0)
     return output_audio
-iface = gr.Interface(
-    fn=stt,
-    inputs=[
-        gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
-        gr.inputs.Audio(type="numpy", label="Audio", optional=False),
-    ],
-    outputs=gr.outputs.Textbox(label="Output"),
-    title="Coqui STT de Chatino, Mixteco, y Totonaco",
-    theme="huggingface",
-    description="Prueba de identificar frases del español en grabaciones de una lengua indígena, y prover el texto de cada una",
-    examples=[["mixteco", "ejemplos/espanol1.wav"],
-            ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
-            ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
-            ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
-            ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
-            ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
-    article="La identificación de lenguas usa el modelo"
-                " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
-                " y aquí se supone que si la lengua no es español, debe ser la lengua principal del contexto."
-                "\n\n"
-                "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
-                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
-                " con [los datos recopilados por Hilaria Cruz y sys colaboradores](https://gorilla.linguistlist.org/code/ctp/)"
-                "\n\n"
-                "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
-                " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
-                " con [los datos recopilados por Rey Castillo, Jonathan Amith y sus colaboradores](https://www.openslr.org/89)."
-                " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
-                " \n\n"
-                "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
-                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
-                " con [los datos recopilados por Osbel López Francisco y Jonathan Amith](https://www.openslr.org/107)."
-                " \n\n"
-                "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/). "
-                " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
-)
-iface.launch()

 import torchaudio
 from speechbrain.pretrained import EncoderClassifier
+UI_STRINGS = {
+    "title": {
+        "es": "Reconocimiento de Dictado en Chatino, Mixteco, Totonaco y Español",
+        "en": "Speech recognition in Chatino, Mixtec, Totonac and Spanish",
+    },
+    "description": {
+        "es": "Una demo de identificar frases del español y de tres lenguas indígenas de México, y proveer el texto de cada una",
+        "en": "A demo of identifying phrases in Spanish and three Mexican indigenous languages, and providing transcripts of each",
+    },
+    "article": {
+        "es":  "La identificación de lenguas usa el modelo"
+                " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
+                " y aquí se supone que si la lengua no es español, debe ser la lengua indígena del contexto."
+                "\n\n"
+                "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
+                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
+                " con [los datos recopilados por Hilaria Cruz y sus colaboradores](https://gorilla.linguistlist.org/code/ctp/)."
+                "\n\n"
+                "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
+                " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
+                " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
+                " \n\n"
+                "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
+                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
+                " con [los datos recopilados por Osbel López Francisco y sus colaboradores](https://www.openslr.org/107)."
+                " \n\n"
+                "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/) de Jonathan Amith. "
+                " Esta demo está basada en la de [ucraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
+        "en": "The language identification uses the model"
+                " [lang-id-commonlanguage-ecapa from Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
+                " and here it is assumed that if the language is not Spanish, it must be the indigenous language of the context."
+                "\n\n"
+                "Chatino: Test of speech-to-text for Highland Chatino (Quiahije) "
+                " using [the model trained by Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
+                " with [the data compiled by Hilaria Cruz and collaborators](https://gorilla.linguistlist.org/code/ctp/)."
+                "\n\n"
+                "Mixtec: Test of speech-to-text for Yoloxochitl Mixtec,"
+                " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
+                " with [the data compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
+                "\n\n"
+                "Totonac: Test of speech-to-text for Highland Totonac,"
+                " using [the model trained by Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
+                " with [the data compiled by Osbel López Francisco and collaborators](https://www.openslr.org/107)."
+                "\n\n"
+                "The examples come from Jonathan Amith's [DEMCA](https://demca.mesolex.org/) project. "
+                " This demo is based on the one for [Ukrainian](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
+    },
+    "languages": {
+        "mixteco": {
+            "es": "mixteco",
+            "en": "Mixtec",
+        },
+        "chatino": {
+            "es": "chatino",
+            "en": "Chatino",
+        },
+        "totonaco": {
+            "es": "totonaco",
+            "en": "Totonac",
+        },
+        "español": {
+            "es": "español",
+            "en": "Spanish",
+        },
+        "inglés": {
+            "es": "inglés",
+            "en": "English",
+        }
+    },
+    "labels": {
+        "target": {
+            "es": "Lengua principal",
+            "en": "Primary language",
+        },
+        "input": {
+            "es": "Audio",
+            "en": "Audio",
+        },
+        "output": {
+            "es": "Resultado",
+            "en": "Result",
+        }
+    }
+}
 # initialize language ID model
 lang_classifier = EncoderClassifier.from_hparams(
     source="speechbrain/lang-id-commonlanguage_ecapa",
     savedir="pretrained_models/lang-id-commonlanguage_ecapa"
 )
+# download STT models
 model_info = {
     "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
     "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
     "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
 }
+def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
+    return pipeline("automatic-speech-recognition", model=model_path)
+def load_coqui_models(language):
+    model_path, file_name = model_info.get(language, ("", ""))
+    if not exists(file_name):
+        print(f"Downloading {model_path}")
+        r = requests.get(model_path, allow_redirects=True)
+        with open(file_name, 'wb') as file:
+            file.write(r.content)
+    else:
+        print(f"Found {file_name}. Skipping download...")
+    return Model(file_name)
 STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}
+for lang in ('mixteco', 'chatino', 'totonaco'):
+    STT_MODELS[lang] = load_coqui_models(lang)
 def client(audio_data: np.array, sample_rate: int, default_lang: str):
     print(default_lang, text_lab)
     if text_lab == 'Spanish':
+        text_lab = UI_STRINGS["languages"]['español'][ui_language]
         asr_pipeline = STT_MODELS['español']
         result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
     else:
+        text_lab = UI_STRINGS["languages"][default_lang][ui_language]
         ds = STT_MODELS[default_lang]
         result = ds.stt(coqui_audio)
     return f"{text_lab}: {result}"
 def stt(default_lang: str, audio: Tuple[int, np.array]):
     sample_rate, audio = audio
     use_scorer = False
     output_audio.seek(0)
     return output_audio
+def iface(ui_language):
+    return gr.Interface(
+        fn=stt,
+        inputs=[
+            gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label=UI_STRINGS["labels"]["target"][ui_language]),
+            gr.inputs.Audio(type="numpy", label=UI_STRINGS["labels"]["input"][ui_language], source="microphone", optional=False),
+        ],
+        outputs=gr.outputs.Textbox(label=UI_STRINGS["labels"]["output"][ui_language]),
+        title=UI_STRINGS["title"][ui_language],
+        theme="huggingface",
+        description=UI_STRINGS["description"][ui_language],
+        examples=[["mixteco", "ejemplos/espanol1.wav"],
+                ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
+                ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
+                ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
+                ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
+                ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
+        article=UI_STRINGS["title"][ui_language],
+    )
+es_iface = iface('es')
+en_iface = iface('en')
+with gr.Blocks() as demo:
+    gr.Markdown('Select language of interface | Escoja lengua de la interfaz')
+    with gr.Tabs():
+        with gr.TabItem("Español"):
+            es_iface.render()
+        with gr.TabItem("English"):
+            en_iface.render()
+demo.launch()