# -*- coding: utf-8 -*-
"""CondensedModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""
# IMPORTANT -- install these packages before running:
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# pip install librosa
# Note: ffmpeg is a system tool, not a Python package; install it with your
# OS package manager (e.g. apt-get install ffmpeg) rather than pip, so that
# pydub/librosa can decode audio files.
# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
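# Shape contract for CTCLoss (a descriptive note, not in the original file):
#   y_true: (batch, max_label_len) integer character ids
#   y_pred: (batch, time_steps, vocab_size + 1) per-frame softmax outputs
# ctc_batch_cost reserves the last class index for the CTC blank token, which
# is why build_model below emits output_dim + 1 units.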
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer (+1 output unit for the CTC blank token)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
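# Quick shape sanity check (a minimal sketch, not in the original; the dummy
# batch is illustrative). conv_1's time stride of 2 halves the number of
# spectrogram frames, so 100 input frames yield 50 output steps:
#   m = build_model(input_dim=193, output_dim=31)
#   print(m(tf.zeros((1, 100, 193))).shape)  # -> (1, 50, 32)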
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
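# Round-trip example (illustrative, not in the original): encode a transcript
# to integer ids and decode it back.
#   ids = char_to_num(tf.strings.unicode_split("hello", "UTF-8"))
#   text = tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")  # "hello"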
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
def loadWeights():
    # PATH TO CKPT
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'
    # Set Output
    output = "AudioToTextCKPT.hdf5"
    # Download only if the checkpoint is not already on disk
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)
    # Load CKPT to Model
    model.load_weights(output)
def load_wav(filename):
    # Load the waveform as 22.05 kHz mono float samples
    wav, _ = librosa.load(filename, sr=22050)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Add a trailing channel dimension: (num_samples,) -> (num_samples, 1)
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])
    return audio
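# Note (an assumption, not stated in the original): the 22050 Hz sample rate
# should match the rate the checkpoint was trained on; resampling on load
# keeps the STFT frame timing consistent with training.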
# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
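# Note (descriptive, not in the original): ctc_decode collapses repeated
# characters and strips blank tokens, so a frame sequence like
# "hh-e-ll-llo" ("-" = blank) decodes to "hello".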
def getSpectro(wav_file):
    ###########################################
    ## Process the Audio
    ###########################################
    # Load the waveform and drop the channel dimension
    audio = load_wav(wav_file)
    audio = tf.squeeze(audio, axis=-1)
    # Change type to float
    audio = tf.cast(audio, tf.float32)
    # Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalization
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension for model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)
    return spectrogram
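# The returned array has shape (1, num_frames, fft_length // 2 + 1), i.e. a
# single-example batch matching the model's (None, 193) spectrogram input.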
# Load Weights
loadWeights()
# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get Spectrogram
    spectro = getSpectro(wav_file)
    # Get Prediction
    pred = model.predict(spectro)
    # Get Output
    output_text = decode_prediction(pred)
    # Return Output
    return output_text
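# Example usage (illustrative; "sample.wav" is a placeholder path, not a file
# shipped with this repo):
#   print(AudioToTextUsingModel("sample.wav"))  # -> ["predicted transcript"]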
def AudioToTextUsingAPI(audio_file):
    # use the audio file as the audio source
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        # reads the audio file; here we use record instead of listen
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')
    except sr.RequestError as e:
        print('Could not request results from Google Speech Recognition service; {0}'.format(e))
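# Example usage (illustrative; "sample.wav" is a placeholder path). This path
# sends the audio to Google's free web API via the SpeechRecognition package,
# so it needs network access:
#   print(AudioToTextUsingAPI("sample.wav"))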