# Gradio demo: speech-to-text (ASR) with malaya-speech transducer models.
# CUDA is disabled via env vars BEFORE any TF-dependent import so the demo
# runs CPU-only.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import json
import logging
from glob import glob

import gradio as gr
import kenlm
import numpy as np

import malaya_speech
from malaya_speech.utils.astype import float_to_int
from malaya_speech.utils.char import CTC_VOCAB
from pyctcdecode import build_ctcdecoder

logging.basicConfig(level=logging.INFO)

SR = 16000  # sample rate the transducer models expect
MODELS = {}  # lazy cache: model name -> loaded deep_transducer instance
AVAILABLE_MODELS = malaya_speech.stt.available_transducer().index.tolist()
wavs = glob('audio/*.wav')


def load_audio_wav(filename):
    """Load a WAV file and return ``(samples, sample_rate)``."""
    logging.info('loading %s', filename)
    y, sr = malaya_speech.load(filename)
    return y, sr


def transcribe(upload, record, model):
    """Transcribe uploaded or recorded audio with the selected model.

    Parameters
    ----------
    upload : str or None
        Path to an uploaded WAV file (gradio ``type='filepath'``).
    record : tuple or None
        ``(sample_rate, samples)`` from the gradio microphone component;
        takes precedence over ``upload`` when present.
    model : str
        One of ``AVAILABLE_MODELS``.

    Returns
    -------
    str
        Greedy-decoded transcript.
    """
    if record:
        rate, y = record
        # Gradio's microphone component yields int16 PCM, while malaya_speech
        # expects a float waveform in [-1, 1] — normalize integer dtypes.
        # NOTE(review): exact dtype depends on the gradio version; verify.
        y = np.asarray(y)
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y, rate = load_audio_wav(upload)

    if len(y.shape) == 2:
        # Stereo input: keep the first channel only (matches original code).
        y = y.T[0]

    logging.info('transcribing %d samples', len(y))
    y_16k = malaya_speech.resample(y, rate, SR)

    # Lazy-load each model on first use. MODELS is mutated in place, never
    # rebound, so no `global` statement is needed.
    if model not in MODELS:
        logging.info('%s not in MODELS', model)
        MODELS[model] = malaya_speech.stt.deep_transducer(model=model)
    return MODELS[model].greedy_decoder([y_16k])[0]


# Backward-compatible alias: the old name said TTS but the function does STT.
tts = transcribe

examples = [[f, None, 'conformer'] for f in wavs]

demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source='upload', label='upload WAV file', type='filepath'),
        gr.Audio(source='microphone', label='or record using microphone'),
        gr.components.Dropdown(
            label='Available models',
            choices=AVAILABLE_MODELS,
            value='conformer',
        ),
    ],
    outputs=['text'],
    examples=examples,
    cache_examples=False,
    title='ASR TRANSDUCER - TNB VOICE',
    description='Fastest'
)

if __name__ == '__main__':
    # Bind to all interfaces so the demo is reachable from outside the host.
    demo.launch(server_name='0.0.0.0')