import json
import logging
import subprocess
import sys

import gradio as gr
import vosk

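# Log to stdout so messages are visible in the hosting platform's console.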
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

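# Languages offered in the UI, mapped to the Vosk model identifiers used to load them.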
LARGE_MODEL_BY_LANGUAGE = {
    "Russian": {"model_id": "vosk-model-ru-0.42"},
    "Chinese": {"model_id": "vosk-model-cn-0.22"},
    "English": {"model_id": "vosk-model-en-us-0.22"},
    "French": {"model_id": "vosk-model-fr-0.22"},
    "German": {"model_id": "vosk-model-de-0.22"},
    "Italian": {"model_id": "vosk-model-it-0.22"},
    "Japanese": {"model_id": "vosk-model-ja-0.22"},
    "Hindi": {"model_id": "vosk-model-hi-0.22"},
    "Persian": {"model_id": "vosk-model-fa-0.5"},
    "Uzbek": {"model_id": "vosk-model-small-uz-0.22"},
}

LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
CACHED_MODELS_BY_ID = {}

def asr(model, input_file):
    """Transcribe input_file with the given Vosk model and return the recognized text."""
    rec = vosk.KaldiRecognizer(model, 16000)
    results = []

    # Decode the input to 16 kHz mono signed 16-bit PCM and stream it over stdout.
    # Passing the ffmpeg arguments as a list keeps file paths with spaces intact.
    process = subprocess.Popen(
        ["ffmpeg", "-loglevel", "quiet", "-i", input_file,
         "-ar", "16000", "-ac", "1", "-f", "s16le", "-"],
        stdout=subprocess.PIPE,
    )

    # Feed the audio to the recognizer in chunks and collect each finalized segment.
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            jres = json.loads(rec.Result())
            results.append(jres['text'])

    jres = json.loads(rec.FinalResult())
    results.append(jres['text'])

    return " ".join(results)


def run(input_file, language, history):
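    """Transcribe input_file in the selected language and append the result to history."""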

    logger.info(f"Running ASR for {language} for {input_file}")

    history = history or []

    model = LARGE_MODEL_BY_LANGUAGE.get(language, None)

    if model is None:
        history.append({
            "error_message": f"Failed to find a model for the {language} language :("
        })
    elif input_file is None:
        history.append({
            "error_message": "Record input audio first"
        })
    else:
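        # Load the Vosk model on first use and cache it for subsequent requests.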
        model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
        if model_instance is None:
            model_instance = vosk.Model(model_name=model["model_id"])
            CACHED_MODELS_BY_ID[model["model_id"]] = model_instance

        transcription = asr(model_instance, input_file)

        logger.info(f"Transcription for {input_file}: {transcription}")

        history.append({
            "model_id": model["model_id"],
            "language": language,
            "transcription": transcription,
            "error_message": None
        })

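    # Render the accumulated history as simple HTML cards (errors and transcriptions).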
    html_output = "<div class='result'>"
    for item in history:
        if item["error_message"] is not None:
            html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
        else:
            html_output += "<div class='result_item result_item_success'>"
            html_output += f'{item["transcription"]}<br/>'
            html_output += "</div>"
    html_output += "</div>"

    return html_output, history


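# Assemble the web UI: a microphone recording, a language selector, and an HTML
# output pane, with the "state" components carrying the per-session history.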
gr.Interface(
    run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        "state"
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state"
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """,
    allow_flagging="never",
    theme="default"
).launch(enable_queue=True)