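# Indonesian Whisperer: a Gradio app that transcribes speech with an Indonesian
# Whisper model, translates the transcription into a selected target language,
# and synthesizes the result with Coqui TTS.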
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
from gpuinfo import GPUInfo


MODEL_NAME = "cahya/whisper-medium-id"  # this always needs to stay in line 8 :D sorry for the hackiness
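# Registry of selectable Whisper models; the ASR pipeline for each entry is
# created at startup below.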
whisper_models = {
    "Indonesian Whisper Medium": {
        "name": "cahya/whisper-medium-id",
        "pipe": None,
    }
}
lang = "id"
title = "Indonesian Whisperer"
description = "Cross Language Speech to Speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
info = "This application uses [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
visitors = "https://visitor-badge.glitch.me/badge?page_id=cahya-hf-indonesian-whisperer"

languages = {
    'English': 'en',
    'German': 'de',
    'Spanish': 'es',
    'French': 'fr',
    'Portuguese': 'pt',
    'Polish': 'pl',
    'Dutch': 'nl',
    'Swedish': 'sv',
    'Italian': 'it',
    'Finnish': 'fi',
    'Ukrainian': 'uk',
    'Greek': 'el',
    'Czech': 'cs',
    'Romanian': 'ro',
    'Danish': 'da',
    'Hungarian': 'hu',
    'Croatian': 'hr',
    'Bulgarian': 'bg',
    'Lithuanian': 'lt',
    'Slovak': 'sk',
    'Latvian': 'lv',
    'Slovenian': 'sl',
    'Estonian': 'et',
    'Maltese': 'mt'
}

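# Run on the first CUDA device if available, otherwise fall back to the CPU.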
device = 0 if torch.cuda.is_available() else "cpu"

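# Build an ASR pipeline per registered model and pin the decoder prompt to
# Indonesian transcription so Whisper does not auto-detect the language.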
for model in whisper_models:
    whisper_models[model]["pipe"] = pipeline(
        task="automatic-speech-recognition",
        model=whisper_models[model]["name"],
        chunk_length_s=30,
        device=device,
    )
    whisper_models[model]["pipe"].model.config.forced_decoder_ids = \
        whisper_models[model]["pipe"].tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")


def transcribe(pipe, microphone, file_upload):
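    """Transcribe the microphone recording if present, otherwise the uploaded
    file; warn when both are provided, return an error when neither is."""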
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload

    text = pipe(file)["text"]

    return warn_output + text


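# Coqui TTS setup: the languages it can synthesize and a shared synthesizer instance.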
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"

coquiTTS = CoquiTTS()


def process(language: str, model: str, audio_microphone: str, audio_file: str):
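    """Run the full pipeline: Whisper transcription, translation from Indonesian
    into the target language, Coqui TTS synthesis, and runtime statistics."""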
    language = languages[language]
    pipe = whisper_models[model]["pipe"]
    time_start = time.time()
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcription = transcribe(pipe, audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    translation = translate(transcription, language, "id")
    # Synthesize the translated text with Coqui TTS into a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        time_end = time.time()
        time_diff = time_end - time_start
        memory = psutil.virtual_memory()
        gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
        gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
        gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
        system_info = f"""
        *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.* 
        *Processing time: {time_diff:.5} seconds.*
        *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
        """
        print(f"### {datetime.now()} fp.name:", fp.name)
        return transcription, translation, fp.name, system_info


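# Gradio UI: audio inputs (microphone or upload), target language and model
# selection, and output panels for the texts, synthesized audio and system info.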
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            language = gr.Dropdown(list(languages.keys()), label="Target Language", value="English")
            model = gr.Dropdown(list(whisper_models.keys()),
                                label="Whisper Model", value="Indonesian Whisper Medium")
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
            examples = gr.Examples(examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
                                   label="Examples", inputs=[audio_upload])
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")

    gr.Markdown(info)
    gr.Markdown("<center>"
                + f'<a href="https://github.com/cahya-wirawan/indonesian-whisperer"><img src={badge} alt="powered by badge"/></a>'
                + f'<img src={visitors} alt="visitors badge"/>'
                + "</center>")

    # actions
    submit.click(
        process,
        [language, model, audio_microphone, audio_upload],
        [text_source, text_target, audio, system_info],
    )

blocks.launch(server_port=7870)
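
# A minimal sketch of exercising the pipeline without the UI (assuming one of the
# bundled example files, e.g. "data/JFK.mp3", is available); blocks.launch() above
# blocks, so this would need to run before it or in a separate process:
#
#   transcription, translation, wav_path, stats = process(
#       "English", "Indonesian Whisper Medium", None, "data/JFK.mp3")
#   print(transcription, translation, wav_path)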