# NOTE: the lines previously here were web-page extraction artifacts
# ("Spaces:", build-status text, commit hashes, and a run of line numbers)
# that made this file invalid Python; they carried no program content.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
import malaya_speech
from malaya_speech.utils.astype import float_to_int
from pyctcdecode import build_ctcdecoder
from malaya_speech.utils.char import CTC_VOCAB
from glob import glob
import kenlm
import gradio as gr
import logging
import json
logging.basicConfig(level=logging.INFO)
# Target sample rate expected by the STT models (Hz).
SR = 16000
# Lazy cache of loaded transducer models, keyed by model name.
MODELS = {}
# Model names offered in the dropdown, taken from malaya_speech's registry.
AVAILABLE_MODELS = malaya_speech.stt.available_transducer().index.tolist()
# Bundled example WAV files shown as clickable examples in the UI.
wavs = glob('audio/*.wav')
def load_audio_wav(filename):
    """Load an audio file via malaya_speech.

    Parameters
    ----------
    filename : str
        Path to the audio file to load.

    Returns
    -------
    tuple
        ``(y, sr)`` as returned by ``malaya_speech.load`` — presumably
        samples and sample rate; confirm against malaya_speech docs.
    """
    # Use the configured logger instead of a bare print().
    logging.info('loading %s', filename)
    y, sr = malaya_speech.load(filename)
    return y, sr
def tts(upload, record, model):
    """Transcribe speech from an uploaded WAV file or a microphone recording.

    NOTE(review): despite the name, this is speech-to-text, not
    text-to-speech. The name is kept because gr.Interface is wired to it.

    Parameters
    ----------
    upload : str or None
        Path to an uploaded WAV file (gradio ``type='filepath'``).
    record : tuple or None
        ``(sample_rate, samples)`` from the gradio microphone component.
    model : str
        Name of the transducer model to decode with.

    Returns
    -------
    str
        Greedy-decoded transcription.

    Raises
    ------
    ValueError
        If neither an upload nor a recording is provided (previously this
        crashed with an opaque error inside the loader).
    """
    if record:
        rate, y = record
    elif upload:
        y, rate = load_audio_wav(upload)
    else:
        raise ValueError('provide an uploaded WAV file or a microphone recording')
    # Stereo audio arrives as (n_samples, 2); keep the first channel only.
    if len(y.shape) == 2:
        y = y.T[0]
    logging.info('decoding %d samples with %s', len(y), model)
    # The models expect 16 kHz input.
    y_16k = malaya_speech.resample(y, rate, SR)
    # Lazily load and cache the requested model. Mutating the MODELS dict
    # does not require a `global` declaration.
    if model not in MODELS:
        logging.info('%s not in MODELS', model)
        MODELS[model] = malaya_speech.stt.deep_transducer(model=model)
    return MODELS[model].greedy_decoder([y_16k])[0]
# One example row per bundled audio file: (upload path, no recording,
# default 'conformer' model) — matches the Interface's input order.
examples = [[wav_path, None, 'conformer'] for wav_path in wavs]
# Wire the transcription function into a gradio UI: two audio inputs
# (file upload or microphone) plus a model dropdown, text output.
demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Audio(source='upload', label = 'upload WAV file', type='filepath'),
        gr.Audio(source='microphone', label = 'or record using microphone'),
        gr.components.Dropdown(label='Available models', choices=AVAILABLE_MODELS, value = 'conformer'),
    ],
    outputs=['text'],
    examples=examples,
    # Do not pre-run the examples at startup; transcription is expensive.
    cache_examples=False,
    title='ASR TRANSDUCER - TNB VOICE',
    description='Fastest'
)
# Bind to all interfaces so the app is reachable inside a container.
demo.launch(server_name='0.0.0.0')