File size: 6,232 Bytes
9439387
dbf2fc2
 
d0bbc40
dbf2fc2
 
 
 
f0380ff
dbf2fc2
f4b3d1b
f200d27
028ff01
f09c038
 
dbf2fc2
f09c038
f4b3d1b
 
 
 
f09c038
2915c9d
ac40f21
f09c038
 
f4b3d1b
 
 
 
 
 
 
 
4639cf2
2915c9d
dbf2fc2
f4b3d1b
dbf2fc2
bb12448
 
ff08b05
f4b3d1b
8ee61a8
dbf2fc2
028ff01
 
 
 
dbf2fc2
 
5915225
dbf2fc2
f4b3d1b
ff08b05
028ff01
ac40f21
028ff01
dbf2fc2
f4b3d1b
ff08b05
f4b3d1b
028ff01
dbf2fc2
f09c038
dbf2fc2
 
2915c9d
f4b3d1b
d27ee9b
dbf2fc2
d47cc89
 
 
 
 
 
 
 
f4b3d1b
2915c9d
 
f4b3d1b
 
 
 
dbf2fc2
 
 
f4b3d1b
dbf2fc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4b3d1b
dbf2fc2
 
 
 
 
 
 
f4b3d1b
 
dbf2fc2
 
ff08b05
dbf2fc2
652611b
ccf8b98
652611b
 
 
 
 
e05a9ec
 
 
 
 
ff08b05
 
 
 
f09c038
ff08b05
f09c038
 
ff08b05
 
 
 
4639cf2
ff08b05
dbf2fc2
 
f4b3d1b
dbf2fc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from io import BytesIO   
from typing import Tuple
import wave
import gradio as gr 
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model  

import torch
from transformers import pipeline
import librosa
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Initialize the spoken-language identification model once, at import time.
# `client` calls `lang_classifier.classify_batch` on every request and uses
# the predicted label to decide whether a clip is Spanish.
# `source` names the SpeechBrain checkpoint; `savedir` is the local directory
# handed to SpeechBrain for it (presumably where the fetched files are kept).
lang_classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa", 
    savedir="pretrained_models/lang-id-commonlanguage_ecapa"
)

def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
    """Create a Hugging Face speech-recognition pipeline.

    Args:
        model_path: Model identifier handed to ``transformers.pipeline``.
            The default is the checkpoint listed under "inglés" in
            ``model_info``.

    Returns:
        The object returned by
        ``pipeline("automatic-speech-recognition", model=model_path)``.
    """
    task_name = "automatic-speech-recognition"
    asr_pipeline = pipeline(task_name, model=model_path)
    return asr_pipeline

# Registry of speech-to-text models, keyed by the language name used in the UI.
# Each value is a (source, local_name) pair:
#   - "mixteco", "chatino", "totonaco": source is the URL of a .tflite model
#     and local_name is the file it is saved to by `load_coqui_models`.
#   - "español", "inglés": source is a Hugging Face model identifier that can
#     be passed to `load_hf_model`; local_name is only a label here.
model_info = {
    "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
    "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
    "totonaco": ("https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite", "totonac.tflite"),
    "español": ("jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "spanish_xlsr"),
    "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
}

# Loaded models keyed by language. Only the Spanish Hugging Face pipeline is
# created here (the "inglés" entry above is not loaded); the .tflite models
# are added further down in this file once `load_coqui_models` is defined.
STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}


def client(audio_data: np.array, sample_rate: int, default_lang: str):
    """Identify the spoken language of a clip and transcribe it.

    The clip is first re-encoded by ``_convert_audio`` and classified with
    ``lang_classifier``. When the predicted label is ``'Spanish'`` the
    Spanish Hugging Face pipeline in ``STT_MODELS`` transcribes it. Any other
    label is treated as ``default_lang`` and the clip is transcribed with the
    ``stt`` method of ``STT_MODELS[default_lang]``.

    Args:
        audio_data: Raw audio samples, forwarded to ``_convert_audio``.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        default_lang: Key into ``STT_MODELS`` used when the clip is not
            classified as Spanish.

    Returns:
        A string of the form ``"<language>: <transcript>"``.

    Raises:
        KeyError: If ``default_lang`` is needed but missing from
            ``STT_MODELS``.
    """
    # _convert_audio resamples every clip to this rate (its set_frame_rate
    # call), so all decoders below must keep the samples at the same rate.
    target_rate = 16000

    output_audio = _convert_audio(audio_data, sample_rate)
    waveform, _ = torchaudio.load(output_audio)
    _, _, _, text_lab = lang_classifier.classify_batch(waveform)
    text_lab = text_lab[0]
    print(default_lang, text_lab)

    # Rewind before every re-read: each decoder consumes the same buffer.
    output_audio.seek(0)

    if text_lab == 'Spanish':
        text_lab = 'español'

        # librosa resamples to 22050 Hz unless `sr` is given, while the
        # pipeline assumes a bare ndarray is already at the model's own rate.
        # Pin the rate so the audio is not fed at the wrong speed.
        hf_audio, _ = librosa.load(output_audio, sr=target_rate)

        asr_pipeline = STT_MODELS['español']
        result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']

    else:
        text_lab = default_lang

        # The context manager releases the wave reader even if decoding
        # fails. It does not close the BytesIO, which wave did not open.
        with wave.open(output_audio, 'rb') as fin:
            coqui_audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        ds = STT_MODELS[default_lang]
        result = ds.stt(coqui_audio)

    return f"{text_lab}: {result}"


def load_coqui_models(language):
    """Return a Coqui STT ``Model`` for ``language``, downloading it if needed.

    The download URL and local file name come from ``model_info``. The file
    is fetched only when it does not already exist in the working directory.

    Args:
        language: Key into ``model_info`` (e.g. ``"mixteco"``).

    Returns:
        A ``Model`` constructed from the local model file.

    Raises:
        ValueError: If ``language`` has no entry in ``model_info``.
        requests.HTTPError: If the server answers with an error status.
    """
    try:
        model_path, file_name = model_info[language]
    except KeyError:
        # Fail with a clear message instead of attempting to download "".
        raise ValueError(
            f"No STT model configured for language {language!r}"
        ) from None

    if exists(file_name):
        print(f"Found {file_name}. Skipping download...")
    else:
        print(f"Downloading {model_path}")
        # The timeout keeps a stalled connection from hanging start-up forever.
        r = requests.get(model_path, allow_redirects=True, timeout=60)
        # Check the status before touching disk: otherwise an error page would
        # be saved as the model and treated as a valid cached file from then on.
        r.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(r.content)
    return Model(file_name)

# Load the Coqui model for each indigenous language (downloading the file when
# it is not already present) and register it in STT_MODELS next to the Spanish
# pipeline. This runs at import time, before the interface is built.
for lang in ('mixteco', 'chatino', 'totonaco'):
    STT_MODELS[lang] = load_coqui_models(lang)



def stt(default_lang: str, audio: Tuple[int, np.array]):
    """Gradio callback: transcribe one recording.

    Args:
        default_lang: Language assumed when the clip is not classified as
            Spanish; forwarded to ``client``.
        audio: ``(sample_rate, samples)`` pair as supplied by the audio input.

    Returns:
        The string returned by ``client`` for this clip.
    """
    # Unpack into fresh names rather than rebinding the `audio` parameter.
    sample_rate, samples = audio
    return client(samples, sample_rate, default_lang)


def _convert_audio(audio_data: np.array, sample_rate: int):
    """Re-encode raw PCM samples as an in-memory 16 kHz mono 16-bit WAV.

    The channel count and sample width are taken from the array itself, so
    stereo input and integer widths other than 16-bit are described correctly
    to pydub instead of being misread as mono int16.

    Args:
        audio_data: PCM samples shaped ``(frames,)`` or ``(frames, channels)``
            (assumed to be the interleaved layout the UI delivers; confirm if
            other callers exist). Floating-point samples are assumed to be
            normalized to ``[-1.0, 1.0]`` and are converted to int16.
        sample_rate: Sampling rate of ``audio_data`` in Hz.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the WAV file.
    """
    samples = np.ascontiguousarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        # pydub's raw loader only understands integer PCM.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # A C-contiguous (frames, channels) array serializes as interleaved PCM,
    # which is the raw layout the channel count below describes.
    channels = 1 if samples.ndim < 2 else int(samples.shape[1])

    source_audio = BytesIO(samples.tobytes())
    output_audio = BytesIO()
    wav_file = AudioSegment.from_raw(
        source_audio,
        channels=channels,
        sample_width=samples.dtype.itemsize,
        frame_rate=sample_rate,
    )
    # Normalize to the format the downstream decoders read: 16 kHz, mono, s16le.
    wav_file.set_frame_rate(16000).set_channels(1).export(output_audio, "wav", codec="pcm_s16le")
    output_audio.seek(0)
    return output_audio


# Build the Gradio interface around `stt`.
# Inputs arrive in the order `stt` expects: the radio choice becomes
# `default_lang` and the audio widget (type="numpy") becomes `audio`.
iface = gr.Interface(
    fn=stt,
    inputs=[
        # Main (fallback) language, used when a clip is not classified as Spanish.
        gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
        gr.inputs.Audio(type="numpy", label="Audio", optional=False),
    ],
    outputs=gr.outputs.Textbox(label="Output"),
    title="Coqui STT de Chatino, Mixteco, y Totonaco",
    theme="huggingface",
    description="Prueba de identificar frases del español en grabaciones de una lengua indígena, y prover el texto de cada una",
    # Each example is [main language, path to a sample recording].
    examples=[["mixteco", "ejemplos/espanol1.wav"],
            ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
            ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
            ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
    # Markdown credits for the models and datasets (adjacent string literals
    # are concatenated into one article body).
    article="La identificación de lenguas usa el modelo"
                " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
                " y aquí se supone que si la lengua no es español, debe ser la lengua principal del contexto."
                "\n\n"
                "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
                " con [los datos recopilados por Hilaria Cruz y sys colaboradores](https://gorilla.linguistlist.org/code/ctp/)"
                "\n\n"
                "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
                " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
                " con [los datos recopilados por Rey Castillo, Jonathan Amith y sus colaboradores](https://www.openslr.org/89)."
                " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
                " \n\n"
                "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
                " con [los datos recopilados por Osbel López Francisco y Jonathan Amith](https://www.openslr.org/107)."
                " \n\n"
                "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/). "
                " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
)


# Start serving the interface; this runs whenever the module is executed.
iface.launch()