# import gradio as gr

# gr.Interface.load("models/facebook/fastspeech2-en-ljspeech").launch()

# import gradio as gr

# gr.Interface.load("models/openai/whisper-large-v2").launch()

import gradio as gr
import torch.cuda
import whisper
from whisper.tokenizer import LANGUAGES

# Use fp16 inference only on GPU; Whisper falls back to fp32 on CPU.
gpu = torch.cuda.is_available()
model = None  # assigned by interface() so the checkpoint size stays configurable

# DESCRIPTION = """
# <div style="display:flex; gap: 1em; justify-content: center; align-items: center;">
#     <a target="_blank" href="https://github.com/dsymbol">
#         <img alt="GitHub" src="https://img.shields.io/github/followers/dsymbol?style=social">
#     </a>
#     <a target="_blank" href="https://colab.research.google.com/#fileId=https://huggingface.co/spaces/dsymbol/whisper-webui/blob/main/notebook.ipynb">
#         <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
#     </a>
#     <a target="_blank" href="https://huggingface.co/spaces/dsymbol/whisper-webui" rel="noopener noreferrer"><img
#         src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Hugging Face Spaces">
#     </a>
# </div>
# """


def transcribe(recording, file, language, task):
    """Run Whisper on the recorded or uploaded audio and return the text."""
    if recording and file:
        text = "Please use only one input: record or upload, not both."
    elif not recording and not file:
        text = "Please record or upload an audio file."
    else:
        language = None if language == "Detect" else language
        filepath = file if file else recording
        text = model.transcribe(
            filepath, task=task.lower(), language=language, fp16=gpu
        )["text"].strip()
    return text
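
# Example call (hypothetical file path), after interface() has loaded a model:
#   transcribe(None, "sample.wav", "Detect", "Transcribe")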


def interface(model_name="small"):
    """Build the Gradio app around a Whisper checkpoint (e.g. tiny/base/small/medium/large)."""
    global model
    model = whisper.load_model(model_name)

    return gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(label="Record", source="microphone", type="filepath"),
            gr.Audio(label="Upload", source="upload", type="filepath"),
            gr.Dropdown(
                label="Language",
                choices=["Detect"] + sorted([i.title() for i in LANGUAGES.values()]),
                value="Detect",
            ),
            gr.Dropdown(
                label="Task",
                choices=["Transcribe", "Translate"],
                value="Transcribe",
                info="Whether to perform X->X speech recognition or X->English translation",
            ),
        ],
        outputs=gr.Textbox(label="Transcription", lines=26),
        # theme=gr.themes.Default(),
        theme=gr.themes.Glass(
            primary_hue=gr.themes.colors.orange,
            secondary_hue=gr.themes.colors.purple,
        ),
        title="Whisper is listening to you",
        #description=DESCRIPTION,
        allow_flagging="never",
    )


if __name__ == "__main__":
    demo = interface()
    demo.queue().launch(debug=True)  # queue() keeps long transcriptions from timing out
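
# To run locally (script name assumed): python app.py
# then open the printed local URL in a browser.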