# startharik's picture
# Update app.py
# 44b82fc
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
import malaya_speech
from malaya_speech.utils.astype import float_to_int
from pyctcdecode import build_ctcdecoder
from malaya_speech.utils.char import CTC_VOCAB
from glob import glob
import kenlm
import gradio as gr
import logging
import json
# Emit INFO-level records so the model-loading message logged by tts() is visible.
logging.basicConfig(level=logging.INFO)

# Sample rate (Hz) that audio is resampled to before it is handed to a model.
SR = 16000

# Cache of loaded models, keyed by model name; filled lazily by tts().
MODELS = {}

# Model names offered in the dropdown, taken from the index of the table
# returned by malaya_speech.stt.available_transducer().
AVAILABLE_MODELS = malaya_speech.stt.available_transducer().index.tolist()

# Example clips shown in the UI. glob() returns paths in a filesystem-dependent
# order, so sort them to keep the example list stable across hosts and runs.
wavs = sorted(glob('audio/*.wav'))
def load_audio_wav(filename):
    """Read an audio file through ``malaya_speech.load``.

    The filename is echoed to stdout before loading.

    Args:
        filename: Path of the audio file to read.

    Returns:
        A ``(samples, sample_rate)`` tuple, in the same order that
        ``malaya_speech.load`` returned them.
    """
    print(filename)
    loaded = malaya_speech.load(filename)
    samples, sample_rate = loaded
    return samples, sample_rate
def tts(upload, record, model):
    """Transcribe audio from a microphone recording or an uploaded file.

    Note: despite the name, this runs a speech-to-text transducer model; the
    name is kept because the Gradio interface refers to it.

    Args:
        upload: Path of the uploaded audio file, or ``None``/empty when
            nothing was uploaded.
        record: ``(rate, samples)`` tuple from the microphone input, or
            ``None``. Takes precedence over ``upload`` when both are given.
        model: Model name passed to ``malaya_speech.stt.deep_transducer``.

    Returns:
        The first element of the model's ``greedy_decoder`` output for the
        resampled audio.

    Raises:
        ValueError: If neither a recording nor an upload was provided.
    """
    if record is not None:
        rate, y = record
    elif upload:
        y, rate = load_audio_wav(upload)
    else:
        # Fail early with a readable message instead of passing None to the
        # audio loader and surfacing an opaque error from deep inside it.
        raise ValueError(
            'No audio provided: upload a WAV file or record using the microphone.'
        )

    # 2-D input: keep only the first column (presumably the first channel,
    # assuming a (samples, channels) layout -- TODO confirm).
    if len(y.shape) == 2:
        y = y.T[0]
    logging.info('received %s samples at %s Hz', len(y), rate)

    y_16k = malaya_speech.resample(y, rate, SR)

    # Load each model on first use and keep it for later requests. MODELS is
    # mutated in place, never rebound, so no ``global`` statement is needed.
    if model not in MODELS:
        logging.info(f'{model} not in MODELS')
        MODELS[model] = malaya_speech.stt.deep_transducer(model=model)

    return MODELS[model].greedy_decoder([y_16k])[0]
# One example row per bundled WAV file: [upload path, no recording, model name].
examples = [[wav_path, None, 'conformer'] for wav_path in wavs]

# Input widgets, in the same order as the parameters of tts(upload, record, model).
input_components = [
    gr.Audio(source='upload', label='upload WAV file', type='filepath'),
    gr.Audio(source='microphone', label='or record using microphone'),
    gr.components.Dropdown(
        label='Available models',
        choices=AVAILABLE_MODELS,
        value='conformer',
    ),
]

demo = gr.Interface(
    fn=tts,
    inputs=input_components,
    outputs=['text'],
    examples=examples,
    cache_examples=False,
    title='ASR TRANSDUCER - TNB VOICE',
    description='Fastest',
)

# Bind to all interfaces so the app is reachable from outside the host/container.
demo.launch(server_name='0.0.0.0')