lucio committed on
Commit
a1f131a
1 Parent(s): 8997b10

try using tabs

Browse files
Files changed (1) hide show
  1. app.py +139 -60
app.py CHANGED
@@ -14,16 +14,100 @@ import librosa
14
  import torchaudio
15
  from speechbrain.pretrained import EncoderClassifier
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # initialize language ID model
18
  lang_classifier = EncoderClassifier.from_hparams(
19
  source="speechbrain/lang-id-commonlanguage_ecapa",
20
  savedir="pretrained_models/lang-id-commonlanguage_ecapa"
21
  )
22
 
23
- def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
24
- return pipeline("automatic-speech-recognition", model=model_path)
25
 
26
- # download STT model
27
  model_info = {
28
  "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
29
  "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
@@ -32,7 +116,28 @@ model_info = {
32
  "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
33
  }
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}
 
 
36
 
37
 
38
  def client(audio_data: np.array, sample_rate: int, default_lang: str):
@@ -52,37 +157,19 @@ def client(audio_data: np.array, sample_rate: int, default_lang: str):
52
  print(default_lang, text_lab)
53
 
54
  if text_lab == 'Spanish':
55
- text_lab = 'español'
56
 
57
  asr_pipeline = STT_MODELS['español']
58
  result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
59
 
60
  else:
61
- text_lab = default_lang
62
  ds = STT_MODELS[default_lang]
63
  result = ds.stt(coqui_audio)
64
 
65
  return f"{text_lab}: {result}"
66
 
67
 
68
- def load_coqui_models(language):
69
-
70
- model_path, file_name = model_info.get(language, ("", ""))
71
-
72
- if not exists(file_name):
73
- print(f"Downloading {model_path}")
74
- r = requests.get(model_path, allow_redirects=True)
75
- with open(file_name, 'wb') as file:
76
- file.write(r.content)
77
- else:
78
- print(f"Found {file_name}. Skipping download...")
79
- return Model(file_name)
80
-
81
- for lang in ('mixteco', 'chatino', 'totonaco'):
82
- STT_MODELS[lang] = load_coqui_models(lang)
83
-
84
-
85
-
86
  def stt(default_lang: str, audio: Tuple[int, np.array]):
87
  sample_rate, audio = audio
88
  use_scorer = False
@@ -107,43 +194,35 @@ def _convert_audio(audio_data: np.array, sample_rate: int):
107
  output_audio.seek(0)
108
  return output_audio
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- iface = gr.Interface(
112
- fn=stt,
113
- inputs=[
114
- gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
115
- gr.inputs.Audio(type="numpy", label="Audio", optional=False),
116
- ],
117
- outputs=gr.outputs.Textbox(label="Output"),
118
- title="Coqui STT de Chatino, Mixteco, y Totonaco",
119
- theme="huggingface",
120
- description="Prueba de identificar frases del español en grabaciones de una lengua indígena, y prover el texto de cada una",
121
- examples=[["mixteco", "ejemplos/espanol1.wav"],
122
- ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
123
- ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
124
- ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
125
- ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
126
- ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
127
- article="La identificación de lenguas usa el modelo"
128
- " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
129
- " y aquí se supone que si la lengua no es español, debe ser la lengua principal del contexto."
130
- "\n\n"
131
- "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
132
- " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
133
- " con [los datos recopilados por Hilaria Cruz y sys colaboradores](https://gorilla.linguistlist.org/code/ctp/)"
134
- "\n\n"
135
- "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
136
- " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
137
- " con [los datos recopilados por Rey Castillo, Jonathan Amith y sus colaboradores](https://www.openslr.org/89)."
138
- " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
139
- " \n\n"
140
- "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
141
- " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
142
- " con [los datos recopilados por Osbel López Francisco y Jonathan Amith](https://www.openslr.org/107)."
143
- " \n\n"
144
- "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/). "
145
- " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
146
- )
147
 
 
 
 
 
 
 
 
148
 
149
- iface.launch()
 
14
  import torchaudio
15
  from speechbrain.pretrained import EncoderClassifier
16
 
17
+ UI_STRINGS = {
18
+ "title": {
19
+ "es": "Reconocimiento de Dictado en Chatino, Mixteco, Totonaco y Español",
20
+ "en": "Speech recognition in Chatino, Mixtec, Totonac and Spanish",
21
+ },
22
+ "description": {
23
+ "es": "Una demo de identificar frases del español y de tres lenguas indígenas de México, y proveer el texto de cada una",
24
+ "en": "A demo of identifying phrases in Spanish and three Mexican indigenous languages, and providing transcripts of each",
25
+ },
26
+ "article": {
27
+ "es": "La identificación de lenguas usa el modelo"
28
+ " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
29
+ " y aquí se supone que si la lengua no es español, debe ser la lengua indígena del contexto."
30
+ "\n\n"
31
+ "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
32
+ " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
33
+ " con [los datos recopilados por Hilaria Cruz y sus colaboradores](https://gorilla.linguistlist.org/code/ctp/)."
34
+ "\n\n"
35
+ "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
36
+ " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
37
+ " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
38
+ " \n\n"
39
+ "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
40
+ " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
41
+ " con [los datos recopilados por Osbel López Francisco y sus colaboradores](https://www.openslr.org/107)."
42
+ " \n\n"
43
+ "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/) de Jonathan Amith. "
44
+ " Esta demo está basada en la de [ucraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
45
+ "en": "The language identification uses the model"
46
+ " [lang-id-commonlanguage-ecapa from Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
47
+ " and here it is assumed that if the language is not Spanish, it must be the indigenous language of the context."
48
+ "\n\n"
49
+ "Chatino: Test of speech-to-text for Highland Chatino (Quiahije) "
50
+ " using [the model trained by Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
51
+ " with [the data compiled by Hilaria Cruz and collaborators](https://gorilla.linguistlist.org/code/ctp/)."
52
+ "\n\n"
53
+ "Mixtec: Test of speech-to-text for Yoloxochitl Mixtec,"
54
+ " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
55
+ " with [the data compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
56
+ "\n\n"
57
+ "Totonac: Test of speech-to-text for Highland Totonac,"
58
+ " using [the model trained by Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
59
+ " with [the data compiled by Osbel López Francisco and collaborators](https://www.openslr.org/107)."
60
+ "\n\n"
61
+ "The examples come from Jonathan Amith's [DEMCA](https://demca.mesolex.org/) project. "
62
+ " This demo is based on the one for [Ukrainian](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
63
+ },
64
+ "languages": {
65
+ "mixteco": {
66
+ "es": "mixteco",
67
+ "en": "Mixtec",
68
+ },
69
+ "chatino": {
70
+ "es": "chatino",
71
+ "en": "Chatino",
72
+ },
73
+ "totonaco": {
74
+ "es": "totonaco",
75
+ "en": "Totonac",
76
+ },
77
+ "español": {
78
+ "es": "español",
79
+ "en": "Spanish",
80
+ },
81
+ "inglés": {
82
+ "es": "inglés",
83
+ "en": "English",
84
+ }
85
+ },
86
+ "labels": {
87
+ "target": {
88
+ "es": "Lengua principal",
89
+ "en": "Primary language",
90
+ },
91
+ "input": {
92
+ "es": "Audio",
93
+ "en": "Audio",
94
+ },
95
+ "output": {
96
+ "es": "Resultado",
97
+ "en": "Result",
98
+ }
99
+ }
100
+ }
101
+
102
+
103
  # initialize language ID model
104
  lang_classifier = EncoderClassifier.from_hparams(
105
  source="speechbrain/lang-id-commonlanguage_ecapa",
106
  savedir="pretrained_models/lang-id-commonlanguage_ecapa"
107
  )
108
 
 
 
109
 
110
+ # download STT models
111
  model_info = {
112
  "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
113
  "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
 
116
  "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
117
  }
118
 
119
+
120
+ def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
121
+ return pipeline("automatic-speech-recognition", model=model_path)
122
+
123
+
124
+ def load_coqui_models(language):
125
+
126
+ model_path, file_name = model_info.get(language, ("", ""))
127
+
128
+ if not exists(file_name):
129
+ print(f"Downloading {model_path}")
130
+ r = requests.get(model_path, allow_redirects=True)
131
+ with open(file_name, 'wb') as file:
132
+ file.write(r.content)
133
+ else:
134
+ print(f"Found {file_name}. Skipping download...")
135
+ return Model(file_name)
136
+
137
+
138
  STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}
139
+ for lang in ('mixteco', 'chatino', 'totonaco'):
140
+ STT_MODELS[lang] = load_coqui_models(lang)
141
 
142
 
143
  def client(audio_data: np.array, sample_rate: int, default_lang: str):
 
157
  print(default_lang, text_lab)
158
 
159
  if text_lab == 'Spanish':
160
+ text_lab = UI_STRINGS["languages"]['español'][ui_language]
161
 
162
  asr_pipeline = STT_MODELS['español']
163
  result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
164
 
165
  else:
166
+ text_lab = UI_STRINGS["languages"][default_lang][ui_language]
167
  ds = STT_MODELS[default_lang]
168
  result = ds.stt(coqui_audio)
169
 
170
  return f"{text_lab}: {result}"
171
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  def stt(default_lang: str, audio: Tuple[int, np.array]):
174
  sample_rate, audio = audio
175
  use_scorer = False
 
194
  output_audio.seek(0)
195
  return output_audio
196
 
197
+ def iface(ui_language):
198
+ return gr.Interface(
199
+ fn=stt,
200
+ inputs=[
201
+ gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label=UI_STRINGS["labels"]["target"][ui_language]),
202
+ gr.inputs.Audio(type="numpy", label=UI_STRINGS["labels"]["input"][ui_language], source="microphone", optional=False),
203
+ ],
204
+ outputs=gr.outputs.Textbox(label=UI_STRINGS["labels"]["output"][ui_language]),
205
+ title=UI_STRINGS["title"][ui_language],
206
+ theme="huggingface",
207
+ description=UI_STRINGS["description"][ui_language],
208
+ examples=[["mixteco", "ejemplos/espanol1.wav"],
209
+ ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
210
+ ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
211
+ ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
212
+ ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
213
+ ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
214
+ article=UI_STRINGS["title"][ui_language],
215
+ )
216
 
217
+ es_iface = iface('es')
218
+ en_iface = iface('en')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
+ with gr.Blocks() as demo:
221
+ gr.Markdown('Select language of interface | Escoja lengua de la interfaz')
222
+ with gr.Tabs():
223
+ with gr.TabItem("Español"):
224
+ es_iface.render()
225
+ with gr.TabItem("English"):
226
+ en_iface.render()
227
 
228
+ demo.launch()