import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
from huggingface_hub import from_pretrained_keras

model = from_pretrained_keras("keras-io/ctc_asr", compile=False)

# The character vocabulary the model was trained on
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
# Each STFT frame then yields fft_length // 2 + 1 = 193 frequency bins.
fft_length = 384

# The sample rate the model expects (LJSpeech audio is recorded at 22,050 Hz)
SAMPLE_RATE = 22050


def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For more complex tasks, you can use beam search
    # (a sketch follows the helper functions below).
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and map the integer sequences back to text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text


def load_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)
    # Decode the wav data to a mono float tensor in [-1, 1]
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Resample to the model's sample rate
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    return audio_wav


def mic_to_tensor(recorded_audio_file):
    # Gradio's microphone input is a (sample_rate, numpy array) pair
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    # Mix a stereo recording down to mono
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    # Peak-normalize to [-1, 1]
    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))

    return audio_wav


def tensor_to_predictions(audio_tensor):
    # 1. Make sure the waveform is float32
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    # 2. Get the spectrogram
    spectrogram = tf.signal.stft(
        audio_tensor,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # 3. We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 4. Normalize each frame by its mean and standard deviation over the frequency bins
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # 5. Add a batch dimension, run the model, and decode the output
    spectrogram = tf.expand_dims(spectrogram, axis=0)
    batch_predictions = model.predict(spectrogram)
    batch_predictions = decode_batch_predictions(batch_predictions)
    return batch_predictions


def clear_inputs_and_outputs():
    return [None, None, None]
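# A hedged sketch of beam-search decoding as an alternative to the greedy
# decode in decode_batch_predictions (beam_width=100 is illustrative, not a
# tuned value):
#
#   results = keras.backend.ctc_decode(
#       pred, input_length=input_len, greedy=False, beam_width=100
#   )[0][0]
#
# The text-reconstruction loop stays the same.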
def predict(recorded_audio_file, uploaded_audio_file):
    # Read the recorded or uploaded audio into a waveform tensor
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_audio_wav(uploaded_audio_file)
    prediction = tensor_to_predictions(audio_tensor)[0]
    return prediction
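# A minimal sketch for exercising the pipeline without the UI
# (assumption: "sample.wav" is a hypothetical local wav file):
#
#   wav = load_audio_wav("sample.wav")
#   print(tensor_to_predictions(wav)[0])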

# Main function
if __name__ == "__main__":
    demo = gr.Blocks()
    with demo:
        gr.Markdown(
            """
            # Automatic Speech Recognition using CTC

            This Space is a demo of Automatic Speech Recognition using a Keras model trained on the LJSpeech dataset.
            Record your voice or upload a wav file, and the model will predict the words spoken in English.
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )
                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")
            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Text")
        # Credits
        with gr.Row():
            gr.Markdown(
                """
                ### Credits

                Author: Anurag Singh.
                Based on the Keras example [Automatic Speech Recognition using CTC](https://keras.io/examples/audio/ctc_asr/) by Mohamed Reda Bouadjenek and Ngoc Dung Huynh.
                Check out the model [here](https://huggingface.co/keras-io/ctc_asr).
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)
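# A hedged sketch for handling concurrent requests: enable Gradio's request
# queue before launching (assumes a Gradio version that provides Blocks.queue()):
#
#   demo.queue().launch(debug=True)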