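"""
Indonesian Whisperer: cross-language speech-to-speech (Indonesian/English to
25 other languages) using OpenAI Whisper for speech recognition, mtranslate
for translation, and Coqui TTS for speech synthesis. Served as a Gradio app.
"""
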
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
from gpuinfo import GPUInfo
MODEL_NAME = "cahya/whisper-medium-id" # this always needs to stay in line 8 :D sorry for the hackiness
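
# Whisper ASR models offered in the UI; each "pipe" is populated at startup below.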
whisper_models = {
    "Indonesian Whisper Tiny": {
        "name": "cahya/whisper-tiny-id",
        "pipe": None,
    },
}
lang = "id"
title = "Indonesian Whisperer"
description = "Cross-language speech-to-speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
info = "This application uses the [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
visitors = "https://visitor-badge.glitch.me/badge?page_id=cahya-hf-indonesian-whisperer"
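
# Translation target languages offered in the UI, mapped to their ISO 639-1 codes.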
languages = {
    'English': 'en',
    'German': 'de',
    'Spanish': 'es',
    'French': 'fr',
    'Portuguese': 'pt',
    'Polish': 'pl',
    'Dutch': 'nl',
    'Swedish': 'sv',
    'Italian': 'it',
    'Finnish': 'fi',
    'Ukrainian': 'uk',
    'Greek': 'el',
    'Czech': 'cs',
    'Romanian': 'ro',
    'Danish': 'da',
    'Hungarian': 'hu',
    'Croatian': 'hr',
    'Bulgarian': 'bg',
    'Lithuanian': 'lt',
    'Slovak': 'sk',
    'Latvian': 'lv',
    'Slovenian': 'sl',
    'Estonian': 'et',
    'Maltese': 'mt'
}
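
# Run on the first CUDA GPU if available, otherwise fall back to the CPU.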
device = 0 if torch.cuda.is_available() else "cpu"
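
# Load an ASR pipeline for each registered Whisper model at startup.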
for model in whisper_models:
    whisper_models[model]["pipe"] = pipeline(
        task="automatic-speech-recognition",
        model=whisper_models[model]["name"],
        chunk_length_s=30,
        device=device,
    )
    # Force Indonesian transcription instead of Whisper's language auto-detection.
    whisper_models[model]["pipe"].model.config.forced_decoder_ids = \
        whisper_models[model]["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")


def transcribe(pipe, microphone, file_upload):
    """Transcribe audio from the microphone or an uploaded file, preferring the microphone."""
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    file = microphone if microphone is not None else file_upload
    text = pipe(file)["text"]
    return warn_output + text
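
# Coqui TTS setup: the languages the plugin supports and the synthesis engine used by process().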
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"
coquiTTS = CoquiTTS()


def process(language: str, model: str, audio_microphone: str, audio_file: str):
    """Transcribe, translate, and synthesize the input audio; return both texts, the TTS audio path, and system stats."""
    language = languages[language]
    pipe = whisper_models[model]["pipe"]
    time_start = time.time()
    print(f"### {datetime.now()} process:", language, audio_file)
    transcription = transcribe(pipe, audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    # Translate the Indonesian transcription into the selected target language.
    translation = translate(transcription, language, "id")
    # Synthesize the translation into a temporary WAV file with Coqui TTS.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
    time_end = time.time()
    time_diff = time_end - time_start
    memory = psutil.virtual_memory()
    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
    system_info = f"""
*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
*Processing time: {time_diff:.5} seconds.*
*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
"""
    print(f"### {datetime.now()} fp.name:", fp.name)
    return transcription, translation, fp.name, system_info
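

# Build the Gradio UI: inputs (audio, language, model) on the left, outputs on the right.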
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            language = gr.Dropdown(list(languages.keys()), label="Target Language", value="English")
            model = gr.Dropdown(list(whisper_models.keys()),
                                label="Whisper Model", value="Indonesian Whisper Tiny")
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
            examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
                                   label="Examples", inputs=[audio_upload])
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
    gr.Markdown(info)
    gr.Markdown("<center>"
                + f'<a href="https://github.com/cahya-wirawan/indonesian-whisperer"><img src={badge} alt="powered by badge"/></a>'
                + f'<img src={visitors} alt="visitors badge"/>'
                + "</center>")

    # actions
    submit.click(
        process,
        [language, model, audio_microphone, audio_upload],
        [text_source, text_target, audio, system_info],
    )

blocks.launch(server_port=7870)