mbarnig committed on
Commit
4a86b3f
1 Parent(s): 613a440

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import wave
4
+ from huggingface_hub import hf_hub_download
5
+ from stt import Model
6
+
7
+ state = gr.Variable()  # module-level Gradio state holder; NOTE(review): the interfaces below use the "state" string shortcut and nothing visible reads this name — confirm it is still needed
8
+
9
# Hugging Face Hub repository from which the model and scorer files are downloaded.
REPO_ID = "mbarnig/lb-de-fr-en-pt-coqui-stt-models"

# Title and Markdown description shared by both interface tabs.
my_title = "🇩🇪 🇫🇷 🇬🇧 🇵🇹 Mir verstinn och Lëtzebuergesch ! 🇱🇺"
my_description = (
    "Multilingual Speech-to-Text (STT) system understanding Lëtzebuergesch, "
    "Deutsch, Français, English and Português. My luxembourgish stt-model is "
    "based on [Coqui-STT version 1.3.0](https://github.com/coqui-ai/STT), the "
    "other models are downloaded from the [Coqui Model Zoo]"
    "(https://coqui.ai/models). Thanks to 🐸 [Coqui.ai](https://coqui.ai/)."
)

# Language names offered in the UI; `customization` dispatches on these exact strings.
STT_LANGUAGES = [
    "Deutsch",
    "English",
    "Français",
    "Lëtzebuergesch",
    "Português",
]

# One row per example, in the positional order of `upload_inputs`:
# [audio file path, language, use language model, speaker, ground-truth text]
EXAMPLES = [
    [
        "examples/german.wav",
        "Deutsch",
        True,
        "Thorsten",
        "wir setzen uns deshalb für eine zweistaaten lösung ein und hoffen auch dass hier fortschritte im friedensprozess gemacht werden",
    ],
    [
        "examples/english.wav",
        "English",
        True,
        "Linda",
        "every window and roof which could command a view of the horrible performance was occupied",
    ],
    [
        "examples/french.wav",
        "Français",
        True,
        "Bernard",
        "chacun avait sa part dans ces travaux suivant les prescriptions d'un règlement affiché dans la grande salle",
    ],
    [
        "examples/luxembourgish.wav",
        "Lëtzebuergesch",
        True,
        "Pit",
        "ma och den aarbechtsmaart muss weider wuessen fir datt de system funktionéiert déi faméis rentemauer steet schonn do ze wénken",
    ],
    [
        "examples/portuguese.wav",
        "Português",
        True,
        "Ed",
        "academicismo ou academismo designam originalmente o método de ensino artístico profissionalizante concebido formalizado e ministrado pelas academias de arte europeias",
    ],
]
29
+
30
def reformat_freq(sr, y):
    """Bring an audio buffer to the 16 kHz rate fed to the STT model.

    Args:
        sr: Sample rate of ``y`` in Hz; only 16000 and 48000 are accepted.
        y: NumPy array of audio samples.
            NOTE(review): assumed to be one-dimensional (mono) — confirm
            against what the recording widget delivers.

    Returns:
        A ``(sample_rate, samples)`` tuple. 16 kHz input is returned
        untouched. 48 kHz input is peak-normalised to the int16 range,
        averaged in groups of three samples and returned as an ``int16``
        array with ``sample_rate`` set to 16000.

    Raises:
        ValueError: If ``sr`` is neither 16000 nor 48000.
    """
    # Only 16 kHz is supported downstream; 48 kHz is converted to 16 kHz here.
    if sr not in (48000, 16000):
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        # Work in float64 so scaling and abs() cannot overflow the input dtype.
        samples = np.asarray(y, dtype=np.float64).reshape(-1)
        # Drop up to two trailing samples so the signal splits into whole
        # groups of three; otherwise the reshape below raises.
        samples = samples[: samples.size - samples.size % 3]
        if samples.size == 0:
            return 16000, samples.astype("int16")
        # Normalise by the peak magnitude (not the signed maximum) so that a
        # dominant negative peak cannot push values outside the int16 range.
        peak = max(np.max(np.abs(samples)), 1)
        y = (
            ((samples / peak) * 32767)
            .reshape((-1, 3))
            .mean(axis=1)
            .astype("int16")
        )
        sr = 16000
    return sr, y
45
+
46
def customization(language, scorer):
    """Download and load the acoustic model and scorer for a language.

    Args:
        language: One of the UI language names ("Lëtzebuergesch", "Deutsch",
            "Français", "English", "Português").
        scorer: Accepted for interface compatibility; it is not used here.
            Callers decide themselves whether to enable the scorer.

    Returns:
        A ``(model, scorer_path)`` tuple: a ``Model`` built from the
        downloaded ``model.tflite`` and the local path of the downloaded
        scorer file.

    Raises:
        ValueError: If ``language`` is not one of the supported names.
            Previously this case printed a hint and then failed with an
            ``UnboundLocalError`` on the return statement.
    """
    # (model file, scorer file) inside the Hub repository, keyed by language.
    files_by_language = {
        "Lëtzebuergesch": (
            "luxembourgish/model.tflite",
            "luxembourgish/kenlm-luxembourgish.scorer",
        ),
        "Deutsch": (
            "german/model.tflite",
            "german/de-aashishag-1-prune-kenlm.scorer",
        ),
        "Français": (
            "french/model.tflite",
            "french/kenlm.scorer",
        ),
        "English": (
            "english/model.tflite",
            "english/huge-vocabulary.scorer",
        ),
        "Português": (
            "portuguese/model.tflite",
            "portuguese/pt-itml-0-prune-kenlm.scorer",
        ),
    }
    try:
        model_file, scorer_file = files_by_language[language]
    except KeyError:
        raise ValueError(f"Please select a language ! (got {language!r})") from None

    # Same order as before: fetch the model, then the scorer, then load the model.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=model_file)
    scorer_path = hf_hub_download(repo_id=REPO_ID, filename=scorer_file)
    return Model(model_path), scorer_path
75
+
76
def stt_upload(audio_file_path, language, scorer, speaker, groundtruth):
    """Transcribe an uploaded WAV file.

    Args:
        audio_file_path: Path of the uploaded audio file; falsy when nothing
            was uploaded.
        language: UI language name, forwarded to ``customization``.
        scorer: Truthy to enable the external scorer (language model),
            falsy to disable it.
        speaker: Unused; present because the interface passes it positionally.
        groundtruth: Unused; present because the interface passes it positionally.

    Returns:
        The result of the model's ``stt`` call, or ``None`` (after printing a
        hint) when no file was supplied.
    """
    if not audio_file_path:
        print("Please upload an audio file with sample-rate 16000 Hz for transcription !")
        return None

    acoustic_model, scorer_path = customization(language, scorer)
    # The context manager closes the WAV file even if reading fails; the
    # frames are interpreted as 16-bit samples, as before.
    with wave.open(audio_file_path, "rb") as audio:
        audio_buffer = np.frombuffer(audio.readframes(audio.getnframes()), np.int16)

    if scorer:
        acoustic_model.enableExternalScorer(scorer_path)
    else:
        acoustic_model.disableExternalScorer()
    return acoustic_model.stt(audio_buffer)
90
+
91
def stt_record(language, scorer, audio_record_buffer, state=""):
    """Transcribe one recorded audio chunk and append it to the transcript.

    Args:
        language: UI language name, forwarded to ``customization``.
        scorer: Truthy to enable the external scorer (language model),
            falsy to disable it.
        audio_record_buffer: ``(sample_rate, samples)`` pair unpacked into
            ``reformat_freq``; falsy when nothing was recorded.
        state: Transcript accumulated by earlier calls.

    Returns:
        A ``(transcript, transcript)`` pair, one value for the textbox and
        one for the state output. When no audio was supplied the transcript
        is returned unchanged, so both outputs always receive a value and
        the accumulated text is not lost.
    """
    # Guard against a None state — presumably possible on a fresh session;
    # confirm against the Gradio version in use.
    transcript = state or ""
    if not audio_record_buffer:
        print("Please record your own speech in the selected language for transcription !")
        return transcript, transcript

    acoustic_model, scorer_path = customization(language, scorer)
    _, samples = reformat_freq(*audio_record_buffer)
    if scorer:
        acoustic_model.enableExternalScorer(scorer_path)
    else:
        acoustic_model.disableExternalScorer()
    result = acoustic_model.stt(samples)

    transcript = transcript + result + " "
    return transcript, transcript
105
+
106
# HTML user guide rendered below the file-upload tab.
upload_article = (
    "<h3>User guide</h3><p>"
    "1. Click one row from the examples and view the results. "
    "Compare the transcription with the ground-truth text. "
    "2. Clear the interface and upload your own audio-file in the selected language. "
    "The sampling-rate of the audio file must be 16000 Hz. "
    "3. Submit an audio-file with or without a language model and compare the results. "
    "4. Switch to the realtime-streaming STT tab and record your own speech. "
    "5. Have fun !</p>"
)

# HTML user guide rendered below the realtime-recording tab.
# Fixed: "additionalwords" typo and the missing closing </p> tag.
record_article = (
    "<h3>User guide</h3><p>"
    "1. Record your own speech in the selected language and view the automatic "
    "streamed transcription which is updated continuously with additional words. "
    "2. Stop the recording and compare the final transcription with your input. "
    "3. Switch to the file-upload STT tab to test the examples in the different languages. "
    "4. Have fun!</p>"
)
109
+
110
# Inputs of the file-upload tab, in the positional order of stt_upload's
# parameters (and of the columns in EXAMPLES).
upload_inputs = [
    gr.Audio(label="Upload Audio", type="filepath"),
    gr.Radio(choices=STT_LANGUAGES, value="Lëtzebuergesch", label="Language"),
    gr.Checkbox(value=True, label="use language model"),
    # Hidden fields: they exist so the extra example columns have a target.
    gr.Textbox(visible=False, label="Speaker"),
    gr.Textbox(visible=False, label="Groundtruth"),
]

# Inputs of the realtime tab, in the positional order of stt_record's
# parameters; the trailing "state" entry carries the running transcript.
record_inputs = [
    gr.Radio(choices=STT_LANGUAGES, value="Lëtzebuergesch", label="Language"),
    gr.Checkbox(value=True, label="use language model"),
    gr.Audio(
        label="Record Audio",
        source="microphone",
        type="numpy",
        streaming=True,
    ),
    "state",
]

# Single transcription box for the file-upload tab.
upload_outputs = gr.Textbox(label="Transcription", lines=5)

# Transcription box plus the updated state for the realtime tab.
record_outputs = [
    gr.Textbox(label="Transcription", lines=5),
    "state",
]
127
+
128
# File-upload tab: transcribes an uploaded WAV file via stt_upload.
upload_iface = gr.Interface(
    fn=stt_upload,
    inputs=upload_inputs,
    outputs=upload_outputs,
    title=my_title,
    description=my_description,
    article=upload_article,
    examples=EXAMPLES,
    # String form of the flag; the boolean form is deprecated in Gradio 3.
    allow_flagging="never",
)

# Realtime tab: live=True re-runs stt_record as streamed audio arrives.
record_iface = gr.Interface(
    fn=stt_record,
    inputs=record_inputs,
    outputs=record_outputs,
    title=my_title,
    description=my_description,
    article=record_article,
    # String form of the flag; the boolean form is deprecated in Gradio 3.
    allow_flagging="never",
    live=True,
)
149
+
150
# Combine both interfaces into one tabbed app. The tab labels previously read
# "Text-to-Speech", which is the opposite of what this app does.
iface = gr.TabbedInterface(
    [upload_iface, record_iface],
    ["Speech-to-Text with audio-file upload", "Realtime Speech-to-Text"],
)
iface.launch()