File size: 5,966 Bytes
047d9d7
41649f8
047d9d7
 
 
 
 
 
 
 
41649f8
 
047d9d7
 
 
 
 
 
 
 
 
 
 
 
 
41649f8
047d9d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41649f8
047d9d7
41649f8
047d9d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aba68ce
047d9d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d7b67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
from gpuinfo import GPUInfo


# Hugging Face model id advertised in the `info` markdown below.
# NOTE(review): the original inline comment claims this must stay on a fixed
# line — presumably a positional hack for an external tool; confirm before
# reformatting this file.
MODEL_NAME = "cahya/whisper-medium-id"  # this always needs to stay in line 8 :D sorry for the hackiness
# Whisper models selectable in the UI: display name -> {HF model id, pipeline}.
# "pipe" is populated at startup by the warm-up loop below.
whisper_models = {
    "Indonesian Whisper Tiny": {
        "name": "cahya/whisper-tiny-id",
        "pipe": None,
    },
}
# Source-language code forced on the Whisper decoder (Indonesian).
lang = "id"
# Static strings for the Gradio page (title, description, badges).
title = "Indonesian Whisperer"
description = "Cross Language Speech to Speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
info = "This application uses [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
visitors = "https://visitor-badge.glitch.me/badge?page_id=cahya-hf-indonesian-whisperer"

# Target languages offered in the dropdown: display name -> ISO 639-1 code
# (the code is what the translator and the TTS speaker selection consume).
_LANGUAGE_PAIRS = [
    ("English", "en"),
    ("German", "de"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Dutch", "nl"),
    ("Swedish", "sv"),
    ("Italian", "it"),
    ("Finnish", "fi"),
    ("Ukrainian", "uk"),
    ("Greek", "el"),
    ("Czech", "cs"),
    ("Romanian", "ro"),
    ("Danish", "da"),
    ("Hungarian", "hu"),
    ("Croatian", "hr"),
    ("Bulgarian", "bg"),
    ("Lithuanian", "lt"),
    ("Slovak", "sk"),
    ("Latvian", "lv"),
    ("Slovenian", "sl"),
    ("Estonian", "et"),
    ("Maltese", "mt"),
]
languages = dict(_LANGUAGE_PAIRS)

# Use GPU 0 when CUDA is available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Warm-up: build an ASR pipeline for every configured Whisper model and pin
# its decoder prompt to Indonesian transcription.
for config in whisper_models.values():
    asr = pipeline(
        task="automatic-speech-recognition",
        model=config["name"],
        chunk_length_s=30,
        device=device,
    )
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang, task="transcribe"
    )
    config["pipe"] = asr


def transcribe(pipe, microphone, file_upload):
    """Transcribe an audio file with the given ASR pipeline.

    The microphone recording wins when both inputs are supplied; in that case
    the transcript is prefixed with a warning. When neither input is given an
    error string is returned instead of a transcript.
    """
    has_mic = microphone is not None
    has_upload = file_upload is not None

    if not has_mic and not has_upload:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warning = ""
    if has_mic and has_upload:
        warning = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    source = microphone if has_mic else file_upload
    return warning + pipe(source)["text"]


# Language keys the Coqui TTS plugin supports.
# NOTE(review): LANGUAGES and default_lang appear unused in this file —
# possibly leftovers from an earlier UI; confirm before removing.
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"

# Shared TTS engine instance used by process() below.
coquiTTS = CoquiTTS()


def process(language: str, model: str, audio_microphone: str, audio_file: str):
    """Full speech-to-speech pipeline for the Submit button.

    Transcribes Indonesian audio with Whisper, translates the transcript to
    the chosen target language, and synthesizes the translation with Coqui TTS.

    Args:
        language: display name of the target language (key of ``languages``).
        model: display name of the Whisper model (key of ``whisper_models``).
        audio_microphone: filepath of the microphone recording, or None.
        audio_file: filepath of the uploaded audio, or None.

    Returns:
        Tuple of (transcription, translation, wav_filepath, system_info_markdown)
        matching the four output components wired up in ``submit.click``.
    """
    # Map the display name to its ISO 639-1 code for translate() and the TTS.
    language = languages[language]
    pipe = whisper_models[model]["pipe"]
    time_start = time.time()
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcription = transcribe(pipe, audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    # Translate from Indonesian ("id") into the target language.
    # NOTE(review): if transcribe() returned its ERROR string, that string is
    # translated and spoken too — confirm whether that is intended.
    translation = translate(transcription, language, "id")
    # delete=False so the wav file survives the `with` block; presumably Gradio
    # reads it from fp.name after this handler returns — TODO confirm cleanup.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        time_end = time.time()
        time_diff = time_end - time_start
        memory = psutil.virtual_memory()
        # GPUInfo returns per-GPU lists; report the first GPU or 0 when none.
        gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
        gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
        gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
        system_info = f"""
        *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.* 
        *Processing time: {time_diff:.5} seconds.*
        *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
        """
        print(f"### {datetime.now()} fp.name:", fp.name)
        return transcription, translation, fp.name, system_info


# Gradio UI: inputs (mic/upload, target language, model) on the left,
# transcript/translation/audio outputs on the right.
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            # list(d) iterates a dict's keys directly — the identity
            # comprehensions the original used here were redundant (ruff C416).
            language = gr.Dropdown(list(languages), label="Target Language", value="English")
            model = gr.Dropdown(list(whisper_models),
                                     label="Whisper Model", value="Indonesian Whisper Tiny")
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
            examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
                                   label="Examples", inputs=[audio_upload])
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
            # Snapshot of host memory shown at page build time; process()
            # refreshes this markdown on every submission.
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")

    gr.Markdown(info)
    gr.Markdown("<center>"
                + f'<a href="https://github.com/cahya-wirawan/indonesian-whisperer"><img src={badge} alt="visitors badge"/></a>'
                + f'<img src={visitors} alt="visitors badge"/>'
                + "</center>")

    # actions: wire the Submit button to the speech-to-speech pipeline.
    submit.click(
        process,
        [language, model, audio_microphone, audio_upload],
        [text_source, text_target, audio, system_info],
    )

blocks.launch(server_port=7870)