lucio's picture
newest release doesn't support tabs yet
c1358b8
raw history blame
No virus
9.26 kB
from io import BytesIO
from typing import Tuple
import wave
import gradio as gr
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model
import torch
from transformers import pipeline
import librosa
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# All user-facing text, keyed first by UI element and then by interface
# language code ("es" or "en"). The "article" entries are Markdown built from
# adjacent string literals, one literal per source line.
UI_STRINGS = {
    "title": {
        "es": "Reconocimiento de Dictado en Chatino, Mixteco, Totonaco y Español",
        "en": "Speech recognition in Chatino, Mixtec, Totonac and Spanish",
    },
    "description": {
        "es": "Una demo de identificar frases del español y de tres lenguas indígenas de México, y proveer el texto de cada una",
        "en": "A demo of identifying phrases in Spanish and three Mexican indigenous languages, and providing transcripts of each",
    },
    "article": {
        "es": "La identificación de lenguas usa el modelo"
              " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
              " y aquí se supone que si la lengua no es español, debe ser la lengua indígena del contexto."
              "\n\n"
              "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
              " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
              " con [los datos recopilados por Hilaria Cruz y sus colaboradores](https://gorilla.linguistlist.org/code/ctp/)."
              "\n\n"
              "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
              " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
              " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
              " \n\n"
              "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
              " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
              " con [los datos recopilados por Osbel López Francisco y sus colaboradores](https://www.openslr.org/107)."
              " \n\n"
              "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/) de Jonathan Amith. "
              " Esta demo es basada en la de [Ucraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
        "en": "The language identification uses the model"
              " [lang-id-commonlanguage-ecapa from Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
              " and here it is assumed that if the language is not Spanish, it must be the indigenous language of the context."
              "\n\n"
              "Chatino: Test of speech-to-text for Highland Chatino (Quiahije) "
              " using [the model trained by Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
              " with [the data compiled by Hilaria Cruz and collaborators](https://gorilla.linguistlist.org/code/ctp/)."
              "\n\n"
              "Mixtec: Test of speech-to-text for Yoloxochitl Mixtec,"
              " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
              " with [the data compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
              "\n\n"
              "Totonac: Test of speech-to-text for Highland Totonac,"
              # Same Totonac model URL as the Spanish text above.
              " using [the model trained by Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
              " with [the data compiled by Osbel López Francisco and collaborators](https://www.openslr.org/107)."
              "\n\n"
              "The examples come from Jonathan Amith's [DEMCA](https://demca.mesolex.org/) project. "
              " This demo is based on the one for [Ukrainian](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
    },
    # Display names of the languages; the outer keys are the internal
    # language identifiers used elsewhere in this file.
    "languages": {
        "mixteco": {
            "es": "mixteco",
            "en": "Mixtec",
        },
        "chatino": {
            "es": "chatino",
            "en": "Chatino",
        },
        "totonaco": {
            "es": "totonaco",
            "en": "Totonac",
        },
        "español": {
            "es": "español",
            "en": "Spanish",
        },
        "inglés": {
            "es": "inglés",
            "en": "English",
        },
    },
    # Labels of the individual interface components.
    "labels": {
        "target": {
            "es": "Lengua principal",
            "en": "Primary language",
        },
        "input": {
            "es": "Audio",
            "en": "Audio",
        },
        "output": {
            "es": "Resultado",
            "en": "Result",
        },
    },
}
# Language-identification model, created once when the module is loaded.
# NOTE(review): ``savedir`` is presumably where SpeechBrain keeps the fetched
# checkpoint files - confirm against the SpeechBrain documentation.
lang_classifier = EncoderClassifier.from_hparams(
    savedir="pretrained_models/lang-id-commonlanguage_ecapa",
    source="speechbrain/lang-id-commonlanguage_ecapa",
)
# Speech-to-text model registry: language identifier -> (location, local name).
# For the three Coqui entries the location is a download URL and the local
# name is the file the model is stored in; for the two wav2vec2 entries the
# location is a Hugging Face model identifier.
model_info = {
    "mixteco": (
        "https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite",
        "mixtec.tflite",
    ),
    "chatino": (
        "https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite",
        "chatino.tflite",
    ),
    "totonaco": (
        "https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite",
        "totonac.tflite",
    ),
    "español": (
        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
        "spanish_xlsr",
    ),
    "inglés": (
        "facebook/wav2vec2-large-robust-ft-swbd-300h",
        "english_xlsr",
    ),
}
def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
    """Create a Hugging Face speech-recognition pipeline.

    Args:
        model_path: Model identifier passed to ``pipeline`` as ``model``.

    Returns:
        The ``"automatic-speech-recognition"`` pipeline for that model.
    """
    task = "automatic-speech-recognition"
    asr = pipeline(task, model=model_path)
    return asr
def load_coqui_models(language):
    """Return the Coqui STT model for ``language``, downloading it if needed.

    The download URL and the local file name are looked up in ``model_info``.
    A model file that already exists locally is reused without any network
    access.

    Args:
        language: Key into ``model_info`` (for example ``"mixteco"``).

    Returns:
        An ``stt.Model`` constructed from the local model file.

    Raises:
        ValueError: If ``language`` has no entry in ``model_info``.
        requests.HTTPError: If the server answers the download request with
            an error status.
    """
    if language not in model_info:
        # Fail with a clear message instead of issuing a request for an
        # empty URL.
        raise ValueError(f"No STT model registered for language {language!r}")
    model_path, file_name = model_info[language]
    if exists(file_name):
        print(f"Found {file_name}. Skipping download...")
    else:
        print(f"Downloading {model_path}")
        r = requests.get(model_path, allow_redirects=True)
        # Check the status before touching the disk: otherwise the body of an
        # error response would be stored as the model file and then be picked
        # up by the "already exists" branch on every later run.
        r.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(r.content)
    return Model(file_name)
# Transcription backends keyed by language identifier, all loaded at import
# time: Spanish goes through the Hugging Face pipeline, the three indigenous
# languages through their Coqui models.
STT_MODELS = {"español": load_hf_model(model_info["español"][0])}
for lang in ("mixteco", "chatino", "totonaco"):
    STT_MODELS[lang] = load_coqui_models(lang)
def client(audio_data: np.ndarray, sample_rate: int, default_lang: str, ui_language: str = "es"):
    """Identify the language of a recording and transcribe it.

    The recording is first converted by ``_convert_audio`` and classified by
    ``lang_classifier``. If the predicted label is ``'Spanish'`` the Spanish
    pipeline in ``STT_MODELS`` transcribes it; any other label is treated as
    ``default_lang`` and transcribed by that language's Coqui model.

    Args:
        audio_data: Raw samples of the recording.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        default_lang: Language identifier assumed when the recording is not
            classified as Spanish; must be a key of ``STT_MODELS``.
        ui_language: Interface language code (``"es"`` or ``"en"``) used to
            pick the display name of the detected language.

    Returns:
        A string of the form ``"<language name>: <transcript>"``.
    """
    output_audio = _convert_audio(audio_data, sample_rate)
    waveform, _ = torchaudio.load(output_audio)
    # classify_batch returns four values; only the text labels are used here.
    _, _, _, labels = lang_classifier.classify_batch(waveform)
    detected = labels[0]
    print(default_lang, detected)

    # torchaudio has consumed the buffer, so rewind before decoding again.
    output_audio.seek(0)
    if detected == 'Spanish':
        language_name = UI_STRINGS["languages"]['español'][ui_language]
        # Keep the 16 kHz rate produced by _convert_audio: librosa would
        # otherwise resample to its own default rate, and the pipeline takes
        # a raw array without being told its sampling rate.
        hf_audio, _ = librosa.load(output_audio, sr=16000)
        asr_pipeline = STT_MODELS['español']
        result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
    else:
        language_name = UI_STRINGS["languages"][default_lang][ui_language]
        # The context manager closes the wave reader even if reading fails.
        with wave.open(output_audio, 'rb') as fin:
            coqui_audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        result = STT_MODELS[default_lang].stt(coqui_audio)
    return f"{language_name}: {result}"
def stt(default_lang: str, audio: Tuple[int, np.ndarray]):
    """Transcribe a recording; this is the callback handed to the interface.

    Args:
        default_lang: Language identifier to assume when the recording is
            not classified as Spanish.
        audio: ``(sample_rate, samples)`` pair describing the recording.

    Returns:
        The string produced by ``client`` for this recording.
    """
    sample_rate, samples = audio
    return client(samples, sample_rate, default_lang)
def _convert_audio(audio_data: np.ndarray, sample_rate: int):
    """Convert raw samples to an in-memory 16 kHz mono 16-bit PCM WAV file.

    Args:
        audio_data: Samples of the recording, either one-dimensional (mono)
            or shaped ``(frames, channels)``. The values are assumed to be
            integer PCM samples - TODO confirm for non-integer dtypes.
        sample_rate: Sampling rate of ``audio_data`` in Hz.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the WAV data.
    """
    # A contiguous copy makes ``tobytes`` emit the frames in order, with the
    # channels of each frame interleaved.
    samples = np.ascontiguousarray(audio_data)
    # Describe the raw bytes as they really are instead of assuming mono
    # 16-bit input; for a one-dimensional int16 array this yields the same
    # channels=1 / sample_width=2 as before.
    channels = samples.shape[1] if samples.ndim > 1 else 1
    segment = AudioSegment.from_raw(
        BytesIO(samples.tobytes()),
        channels=channels,
        sample_width=samples.dtype.itemsize,
        frame_rate=sample_rate,
    )
    output_audio = BytesIO()
    resampled = segment.set_frame_rate(16000).set_channels(1)
    resampled.export(output_audio, "wav", codec="pcm_s16le")
    # Rewind so the caller can read the WAV data from the start.
    output_audio.seek(0)
    return output_audio
def iface(ui_language):
    """Build the Gradio interface with its texts in ``ui_language``.

    Args:
        ui_language: Interface language code, ``"es"`` or ``"en"``; selects
            the variant of every entry read from ``UI_STRINGS``.

    Returns:
        A ``gr.Interface`` that calls ``stt`` with the chosen primary
        language and the recorded audio.
    """
    labels = UI_STRINGS["labels"]
    return gr.Interface(
        fn=stt,
        # One component per ``stt`` parameter, in order: (default_lang, audio).
        inputs=[
            gr.inputs.Radio(
                choices=("chatino", "mixteco", "totonaco"),
                default="mixteco",
                label=labels["target"][ui_language],
            ),
            gr.inputs.Audio(
                type="numpy",
                label=labels["input"][ui_language],
                source="microphone",
                optional=False,
            ),
        ],
        outputs=gr.outputs.Textbox(label=labels["output"][ui_language]),
        title=UI_STRINGS["title"][ui_language],
        theme="huggingface",
        description=UI_STRINGS["description"][ui_language],
        # Every example row supplies one value per input component.
        examples=[
            ["mixteco", "ejemplos/espanol1.wav"],
            ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
            ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
        ],
        # The long-form credits text, not a repeat of the title.
        article=UI_STRINGS["article"][ui_language],
    )
# Build the interface with Spanish texts and start serving it.
es_iface = iface("es")
es_iface.launch()