File size: 3,956 Bytes
75be352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7fc9e2d
75be352
8d66594
7fc9e2d
48838d7
7fc9e2d
48838d7
7fc9e2d
 
 
75be352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40881eb
 
 
 
 
 
75be352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import logging
import sys
import gradio as gr
import vosk
import json
import subprocess

# Log to stdout so messages appear in the hosting platform's console log.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# UI language name -> vosk model configuration. Only Chinese and English are
# currently enabled; the rest are kept commented out (presumably to limit
# model download size and memory use — TODO confirm with the author).
LARGE_MODEL_BY_LANGUAGE = {
    #"Russian": {"model_id": "vosk-model-ru-0.42"},
    "Chinese": {"model_id": "vosk-model-cn-0.22"},
    "English": {"model_id": "vosk-model-en-us-0.22"},
    #"French": {"model_id": "vosk-model-fr-0.22"},
    #"German": {"model_id": "vosk-model-de-0.21"},
    #"Italian": {"model_id": "vosk-model-it-0.22"},
    #"Japanese": {"model_id": "vosk-model-ja-0.22"},
    #"Hindi": {"model_id": "vosk-model-hi-0.22"},
    #"Persian": {"model_id": "vosk-model-fa-0.5"},
    #"Uzbek": {"model_id": "vosk-model-small-uz-0.22"},
}

# Sorted language names presented in the UI radio control.
LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
# model_id -> loaded vosk.Model instance, filled lazily by run().
CACHED_MODELS_BY_ID = {}

def asr(model, input_file):
    """Transcribe an audio file with a loaded vosk model.

    ffmpeg decodes *input_file* to 16 kHz mono signed 16-bit little-endian
    PCM on stdout, which is streamed into a KaldiRecognizer in 4000-byte
    chunks. Partial utterance results are collected as they finalize.

    Args:
        model: a loaded ``vosk.Model`` instance.
        input_file: path to any audio file ffmpeg can read.

    Returns:
        The full transcription as a single space-joined string.
    """
    rec = vosk.KaldiRecognizer(model, 16000.0)
    results = []

    # Pass argv as a list (shell=False): the previous f-string + .split()
    # approach broke on any path containing whitespace.
    cmd = [
        "ffmpeg", "-loglevel", "quiet",
        "-i", input_file,
        "-ar", "16000", "-ac", "1",
        "-f", "s16le", "-",
    ]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        while True:
            data = process.stdout.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                jres = json.loads(rec.Result())
                results.append(jres['text'])
    finally:
        # Close the pipe and reap the child so repeated calls do not leak
        # file descriptors or accumulate zombie processes.
        process.stdout.close()
        process.wait()

    jres = json.loads(rec.FinalResult())
    results.append(jres['text'])

    return " ".join(results)


def run(input_file, language, history):
    """Gradio callback: transcribe *input_file* in *language*.

    Appends one entry to *history* — either a transcription record or an
    error record — then renders the whole history as HTML.

    Returns:
        (html_output, history): the rendered results div and the updated
        history list (fed back through Gradio state).
    """
    logger.info(f"Running ASR for {language} for {input_file}")

    history = history or []
    model = LARGE_MODEL_BY_LANGUAGE.get(language, None)

    if model is None:
        # No configuration exists for the requested language.
        history.append({
            "error_message": f"Failed to find a model for {language} language :("
        })
    elif input_file is None:
        # The user hit submit without recording anything.
        history.append({
            "error_message": f"Record input audio first"
        })
    else:
        model_id = model["model_id"]
        # Lazily instantiate and cache the heavyweight vosk model.
        model_instance = CACHED_MODELS_BY_ID.get(model_id, None)
        if model_instance is None:
            model_instance = vosk.Model(model_name=model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        transcription = asr(model_instance, input_file)
        logger.info(f"Transcription for {input_file}: {transcription}")

        history.append({
            "model_id": model_id,
            "language": language,
            "transcription": transcription,
            "error_message": None,
        })

    # Render every history entry into one results container.
    pieces = ["<div class='result'>"]
    for item in history:
        if item["error_message"] is not None:
            pieces.append(f"<div class='result_item result_item_error'>{item['error_message']}</div>")
        else:
            pieces.append("<div class='result_item result_item_success'>")
            pieces.append(f'{item["transcription"]}<br/>')
            pieces.append("</div>")
    pieces.append("</div>")
    html_output = "".join(pieces)

    return html_output, history

def loadModels():
    """Instantiate every configured vosk model (triggers download if absent)."""
    for cfg in LARGE_MODEL_BY_LANGUAGE.values():
        vosk.Model(model_name=cfg["model_id"])

# Warm up: load all models at startup so the first request isn't delayed
# by a model download/load.
loadModels()

# NOTE(review): gr.inputs / gr.outputs and the launch(enable_queue=...)
# flag belong to the legacy Gradio (pre-4.x) API — confirm the pinned
# gradio version before upgrading this call site.
gr.Interface(
    run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        # Gradio session state: carries the per-user history list between calls.
        "state"
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state"
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """,
    allow_flagging="never",
    theme="default"
).launch(enable_queue=True)