import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
from gpuinfo import GPUInfo
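
# Whisper checkpoints offered in the UI; each entry gets an ASR pipeline assigned at startup.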
MODEL_NAME = "cahya/whisper-medium-id"  # keep this assignment near the top of the file; external tooling may update it in place
whisper_models = {
    "Indonesian Whisper Medium": {
        "name": "cahya/whisper-medium-id",
        "pipe": None,
    }
}
lang = "id"
title = "Indonesian Whisperer"
description = "Cross-language speech-to-speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
info = "This application uses the [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
visitors = "https://visitor-badge.glitch.me/badge?page_id=cahya-hf-indonesian-whisperer"
languages = {
    'English': 'en',
    'German': 'de',
    'Spanish': 'es',
    'French': 'fr',
    'Portuguese': 'pt',
    'Polish': 'pl',
    'Dutch': 'nl',
    'Swedish': 'sv',
    'Italian': 'it',
    'Finnish': 'fi',
    'Ukrainian': 'uk',
    'Greek': 'el',
    'Czech': 'cs',
    'Romanian': 'ro',
    'Danish': 'da',
    'Hungarian': 'hu',
    'Croatian': 'hr',
    'Bulgarian': 'bg',
    'Lithuanian': 'lt',
    'Slovak': 'sk',
    'Latvian': 'lv',
    'Slovenian': 'sl',
    'Estonian': 'et',
    'Maltese': 'mt'
}
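
# Use the first GPU if one is available, otherwise fall back to the CPU.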
device = 0 if torch.cuda.is_available() else "cpu"
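
# Build a chunked speech-recognition pipeline for every model and force Indonesian transcription output.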
for model in whisper_models:
    whisper_models[model]["pipe"] = pipeline(
        task="automatic-speech-recognition",
        model=whisper_models[model]["name"],
        chunk_length_s=30,
        device=device,
    )
    whisper_models[model]["pipe"].model.config.forced_decoder_ids = \
        whisper_models[model]["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
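
# Transcribe the microphone recording when present, otherwise the uploaded file.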
def transcribe(pipe, microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    file = microphone if microphone is not None else file_upload
    text = pipe(file)["text"]
    return warn_output + text
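
# Coqui TTS handles speech synthesis for the supported target languages.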
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"
coquiTTS = CoquiTTS()
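
# End-to-end pipeline: Whisper ASR -> translation via mtranslate -> Coqui TTS, with basic runtime statistics.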
def process(language: str, model: str, audio_microphone: str, audio_file: str):
    language = languages[language]
    pipe = whisper_models[model]["pipe"]
    time_start = time.time()
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcription = transcribe(pipe, audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    translation = translate(transcription, language, "id")
    # synthesize the translated text into a temporary wav file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
    time_end = time.time()
    time_diff = time_end - time_start
    memory = psutil.virtual_memory()
    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
    system_info = f"""
*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
*Processing time: {time_diff:.5} seconds.*
*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
"""
    print(f"### {datetime.now()} fp.name:", fp.name)
    return transcription, translation, fp.name, system_info
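
# Gradio UI: audio inputs and model/language selectors on the left; transcription, translation and synthesized audio on the right.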
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            language = gr.Dropdown(list(languages.keys()), label="Target Language", value="English")
            model = gr.Dropdown(list(whisper_models.keys()),
                                label="Whisper Model", value="Indonesian Whisper Medium")
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
            examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
                                   label="Examples", inputs=[audio_upload])
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
    gr.Markdown(info)
    gr.Markdown("<center>"
                + f'<a href="https://github.com/cahya-wirawan/indonesian-whisperer"><img src="{badge}" alt="powered by Indonesian Whisperer badge"/></a>'
                + f'<img src="{visitors}" alt="visitors badge"/>'
                + "</center>")
    # actions
    submit.click(
        process,
        [language, model, audio_microphone, audio_upload],
        [text_source, text_target, audio, system_info],
    )
blocks.launch(server_port=7870)