cahya committed on
Commit 047d9d7
1 Parent(s): dd94618

add whisper

app/data/JFK.mp3 ADDED
Binary file (223 kB)
 
app/data/Jokowi - 2022.mp3 ADDED
Binary file (590 kB)
 
app/data/Soekarno - 1963.mp3 ADDED
Binary file (573 kB)
 
app/whisper.py CHANGED
@@ -1,10 +1,165 @@
+ import torch
  import gradio as gr
+ from transformers import pipeline
+ import tempfile
+ from neon_tts_plugin_coqui import CoquiTTS
+ from datetime import datetime
+ import time
+ import psutil
+ from mtranslate import translate
+ from gpuinfo import GPUInfo


- def greet(name):
-     return "Hello " + name + "!"
-
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-
- demo.launch(server_port=7870)
+ MODEL_NAME = "cahya/whisper-medium-id" # this always needs to stay in line 8 :D sorry for the hackiness
+ whisper_models = {
+     "Indonesian Whisper Tiny": {
+         "name": "cahya/whisper-tiny-id",
+         "pipe": None,
+     },
+     "Indonesian Whisper Small": {
+         "name": "cahya/whisper-small-id",
+         "pipe": None,
+     },
+     "Indonesian Whisper Medium": {
+         "name": "cahya/whisper-medium-id",
+         "pipe": None,
+     },
+     "OpenAI Whisper Medium": {
+         "name": "openai/whisper-medium",
+         "pipe": None,
+     },
+ }
+ lang = "id"
+ title = "Indonesian Whisperer"
+ description = "Cross Language Speech to Speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
+ info = "This application uses [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
+ badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
+ visitors = "https://visitor-badge.glitch.me/badge?page_id=cahya-hf-indonesian-whisperer"
+
+ languages = {
+     'English': 'en',
+     'German': 'de',
+     'Spanish': 'es',
+     'French': 'fr',
+     'Portuguese': 'pt',
+     'Polish': 'pl',
+     'Dutch': 'nl',
+     'Swedish': 'sv',
+     'Italian': 'it',
+     'Finnish': 'fi',
+     'Ukrainian': 'uk',
+     'Greek': 'el',
+     'Czech': 'cs',
+     'Romanian': 'ro',
+     'Danish': 'da',
+     'Hungarian': 'hu',
+     'Croatian': 'hr',
+     'Bulgarian': 'bg',
+     'Lithuanian': 'lt',
+     'Slovak': 'sk',
+     'Latvian': 'lv',
+     'Slovenian': 'sl',
+     'Estonian': 'et',
+     'Maltese': 'mt'
+ }
+
+ device = 0 if torch.cuda.is_available() else "cpu"
+
+ for model in whisper_models:
+     whisper_models[model]["pipe"] = pipeline(
+         task="automatic-speech-recognition",
+         model=whisper_models[model]["name"],
+         chunk_length_s=30,
+         device=device,
+     )
+     whisper_models[model]["pipe"].model.config.forced_decoder_ids = \
+         whisper_models[model]["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+
+ def transcribe(pipe, microphone, file_upload):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload
+
+     text = pipe(file)["text"]
+
+     return warn_output + text
+
+
+ LANGUAGES = list(CoquiTTS.langs.keys())
+ default_lang = "en"
+
+ coquiTTS = CoquiTTS()
+
+
+ def process(language: str, model: str, audio_microphone: str, audio_file: str):
+     language = languages[language]
+     pipe = whisper_models[model]["pipe"]
+     time_start = time.time()
+     print(f"### {datetime.now()} TTS", language, audio_file)
+     transcription = transcribe(pipe, audio_microphone, audio_file)
+     print(f"### {datetime.now()} transcribed:", transcription)
+     translation = translate(transcription, language, "id")
+     # return output
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+         coquiTTS.get_tts(translation, fp, speaker={"language": language})
+     time_end = time.time()
+     time_diff = time_end - time_start
+     memory = psutil.virtual_memory()
+     gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+     gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+     gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+     system_info = f"""
+     *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+     *Processing time: {time_diff:.5} seconds.*
+     *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+     """
+     print(f"### {datetime.now()} fp.name:", fp.name)
+     return transcription, translation, fp.name, system_info
+
+
+ with gr.Blocks() as blocks:
+     gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
+                 + title
+                 + "</h1>")
+     gr.Markdown(description)
+     with gr.Row():  # equal_height=False
+         with gr.Column():  # variant="panel"
+             audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
+             audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
+             language = gr.Dropdown([lang for lang in languages.keys()], label="Target Language", value="English")
+             model = gr.Dropdown([model for model in whisper_models.keys()],
+                                 label="Whisper Model", value="Indonesian Whisper Medium")
+             with gr.Row():  # mobile_collapse=False
+                 submit = gr.Button("Submit", variant="primary")
+             examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
+                                    label="Examples", inputs=[audio_upload])
+         with gr.Column():
+             text_source = gr.Textbox(label="Source Language")
+             text_target = gr.Textbox(label="Target Language")
+             audio = gr.Audio(label="Target Audio", interactive=False)
+             memory = psutil.virtual_memory()
+             system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+
+     gr.Markdown(info)
+     gr.Markdown("<center>"
+                 + f'<a href="https://github.com/cahya-wirawan/indonesian-whisperer"><img src={badge} alt="visitors badge"/></a>'
+                 + f'<img src={visitors} alt="visitors badge"/>'
+                 + "</center>")
+
+     # actions
+     submit.click(
+         process,
+         [language, model, audio_microphone, audio_upload],
+         [text_source, text_target, audio, system_info],
+     )
+
+ blocks.launch()
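
For reference, the new app/whisper.py chains three stages: Whisper speech recognition through a transformers pipeline, text translation with mtranslate, and speech synthesis with Coqui TTS. The snippet below is a minimal sketch of that flow outside the Gradio UI; it assumes the same dependencies are installed, picks the smaller cahya/whisper-tiny-id checkpoint from this commit to keep the download light, and reads a hypothetical local input file speech.mp3.

import tempfile

import torch
from transformers import pipeline
from mtranslate import translate
from neon_tts_plugin_coqui import CoquiTTS

# Load one of the Whisper checkpoints registered in whisper_models.
device = 0 if torch.cuda.is_available() else "cpu"
asr = pipeline(
    task="automatic-speech-recognition",
    model="cahya/whisper-tiny-id",
    chunk_length_s=30,
    device=device,
)
# Force Indonesian transcription, as whisper.py does for every model.
asr.model.config.forced_decoder_ids = \
    asr.tokenizer.get_decoder_prompt_ids(language="id", task="transcribe")

# 1. Speech -> Indonesian text (speech.mp3 is a hypothetical input file).
transcription = asr("speech.mp3")["text"]

# 2. Indonesian text -> English text.
translation = translate(transcription, "en", "id")

# 3. English text -> speech, written to a temporary WAV file.
coqui_tts = CoquiTTS()
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
    coqui_tts.get_tts(translation, fp, speaker={"language": "en"})

print(transcription, translation, fp.name, sep="\n")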
requirements.txt CHANGED
@@ -2,4 +2,10 @@ gradio
  fastapi
  pydantic
  uvicorn
- websockets
+ websockets
+ git+https://github.com/huggingface/transformers
+ torch
+ neon-tts-plugin-coqui==0.6.0
+ psutil
+ mtranslate
+ gpuinfo
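
The added entries back the code above: transformers (installed from git) and torch drive the Whisper pipelines, neon-tts-plugin-coqui and mtranslate handle synthesis and translation, and psutil plus gpuinfo feed the resource readout shown under the outputs. A small sketch of that readout, reusing the same psutil and GPUInfo calls that app/whisper.py makes:

import psutil
from gpuinfo import GPUInfo

# Same probes whisper.py uses to build its system_info Markdown block.
memory = psutil.virtual_memory()
gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0

print(f"Memory: {memory.total / (1024 ** 3):.2f}GB, "
      f"used: {memory.percent}%, "
      f"available: {memory.available / (1024 ** 3):.2f}GB")
print(f"GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB")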