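"""Gradio demo for automatic speech recognition.

Two pre-trained recognizers are loaded from model.py: a Chinese model
(WenetSpeech) and an English model (GigaSpeech). Audio can be uploaded from
disk or recorded from the microphone, and the recognized text is shown in
the UI.
"""
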
import os
import time
from datetime import datetime

import gradio as gr
import torchaudio

from model import (
    get_gigaspeech_pre_trained_model,
    get_wenetspeech_pre_trained_model,
    sample_rate,
)
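

# One pre-trained recognizer per language offered in the UI (see model.py).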
models = {
    "Chinese": get_wenetspeech_pre_trained_model(),
    "English": get_gigaspeech_pre_trained_model(),
}


def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a wav file with ffmpeg.

    The output is written next to the input file with a ``.wav`` suffix.
    """
    out_filename = in_filename + ".wav"
    print(f"Converting '{in_filename}' to '{out_filename}'")
    # ffmpeg must be installed and available on PATH.
    _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' '{out_filename}'")
    return out_filename
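
# Note: the os.system() call above relies on shell quoting of the file names.
# A shell-free sketch of the same conversion (not used by this demo) would be:
#
#   import subprocess
#   subprocess.run(
#       ["ffmpeg", "-hide_banner", "-i", in_filename, out_filename],
#       check=True,
#   )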


demo = gr.Blocks()
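

# Recognition callback wired to both "Submit" buttons below: it converts the
# input to wav, resamples to the model's expected sample rate if needed, and
# decodes it with the model selected via the Language radio button.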
def process(in_filename: str, language: str) -> str:
    print("in_filename", in_filename)
    print("language", language)
    filename = convert_to_wav(in_filename)

    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
    print(f"Started at {date_time}")

    start = time.time()
    wave, wave_sample_rate = torchaudio.load(filename)

    if wave_sample_rate != sample_rate:
        print(
            f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
            f"Resampling to {sample_rate}."
        )

        wave = torchaudio.functional.resample(
            wave,
            orig_freq=wave_sample_rate,
            new_freq=sample_rate,
        )
    wave = wave[0]  # use only the first channel

    hyp = models[language].decode_waves([wave])[0]

    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    end = time.time()

    duration = wave.shape[0] / sample_rate
    # Real-time factor: processing time divided by the audio duration.
    rtf = (end - start) / duration

    print(f"Finished at {date_time}. Elapsed: {end - start:.3f} s")
    print(f"Duration: {duration:.3f} s")
    print(f"RTF: {rtf:.3f}")
    print("hyp")
    print(hyp)

    return hyp


with demo:
    gr.Markdown("Upload audio from disk or record from the microphone for recognition")
    language_choices = list(models.keys())

    # Built with the Gradio 3.x component API (gr.Radio / gr.Audio / gr.Textbox).
    language = gr.Radio(
        label="Language",
        choices=language_choices,
        value=language_choices[0],
    )

    with gr.Tabs():
        with gr.TabItem("Upload from disk"):
            uploaded_file = gr.Audio(
                source="upload",
                type="filepath",
                label="Upload from disk",
            )
            upload_button = gr.Button("Submit for recognition")
            uploaded_output = gr.Textbox(
                label="Recognized speech from the uploaded file"
            )

        with gr.TabItem("Record from microphone"):
            microphone = gr.Audio(
                source="microphone",
                type="filepath",
                label="Record from microphone",
            )
            recorded_output = gr.Textbox(
                label="Recognized speech from the recording"
            )

            record_button = gr.Button("Submit for recognition")
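
    # Each button passes the chosen audio file and the Language selection to
    # process() and routes the transcript to the matching output Textbox.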
    upload_button.click(
        process,
        inputs=[uploaded_file, language],
        outputs=uploaded_output,
    )
    record_button.click(
        process,
        inputs=[microphone, language],
        outputs=recorded_output,
    )


if __name__ == "__main__":
    demo.launch()