# -*- coding: utf-8 -*-
"""CondensedModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""

# IMPORTANT -- install these dependencies first:
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# ffmpeg is a system binary, not a pip package (e.g. apt-get install ffmpeg)
# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
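
# Shape sketch for CTCLoss (illustrative, not in the original): y_true holds
# integer character ids, y_pred holds per-frame softmax probabilities over
# the vocabulary plus the CTC blank class.
#
#   y_true = tf.random.uniform((2, 10), maxval=30, dtype=tf.int64)  # (batch, label_len)
#   y_pred = tf.nn.softmax(tf.random.normal((2, 50, 31)))           # (batch, frames, classes)
#   CTCLoss(y_true, y_pred)                                         # -> (2, 1) loss tensor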
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer (output_dim characters + 1 CTC blank token)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
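
# Quick shape sanity check (illustrative, not in the original): with
# padding="same", conv_1 halves the time axis (stride 2) and conv_2 keeps it
# (stride 1), so the model emits roughly one softmax frame per two
# spectrogram frames.
#
#   m = build_model(input_dim=193, output_dim=31)
#   m.predict(np.zeros((1, 100, 193), dtype=np.float32)).shape  # -> (1, 50, 32)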
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384

# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
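
# Illustrative round trip through the lookup layers (not in the original).
# Index 0 is reserved for the OOV/blank token, so 'a' maps to 1, 'b' to 2, etc.
#
#   ids = char_to_num(tf.strings.unicode_split("cat", input_encoding="UTF-8"))
#   tf.strings.reduce_join(num_to_char(ids)).numpy()  # -> b'cat'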
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
def loadWeights():
    # Path to the checkpoint on Google Drive
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'
    # Local output file
    output = "AudioToTextCKPT.hdf5"
    # Download the checkpoint if it is not already present
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)
    # Load the checkpoint into the model
    model.load_weights(output)
def load_wav(filename):
    # Load the audio resampled to 22,050 Hz (librosa's default rate)
    wav, _ = librosa.load(filename, sr=22050)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Reshape to (num_samples, 1) so downstream code sees a channel axis
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])
    return audio
# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
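
# Note (not in the original): input_len assumes every batch item spans the
# full time axis, which holds here since inference runs on one unpadded clip.
# Illustrative call:
#
#   pred = model.predict(spectro)   # (1, frames, vocab_size + 1)
#   decode_prediction(pred)         # -> ["predicted transcription"]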
def getSpectro(wav_file):
    ###########################################
    ## Process the audio into a spectrogram
    ###########################################
    # 1. Load the audio file and drop the channel axis
    audio = load_wav(wav_file)
    audio = tf.squeeze(audio, axis=-1)
    # 2. Change type to float
    audio = tf.cast(audio, tf.float32)
    # 3. Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # 4. We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 5. Normalisation
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # 6. Add a batch dimension for model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)
    return spectrogram
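
# Shape sanity check (illustrative, not in the original): tf.signal.stft
# yields fft_length // 2 + 1 = 193 frequency bins, matching the model's
# input_dim, and about 1 + (num_samples - frame_length) // frame_step frames.
#
#   getSpectro("sample.wav").shape  # hypothetical file -> (1, num_frames, 193)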
# Load weights
loadWeights()

# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get spectrogram
    spectro = getSpectro(wav_file)
    # Get prediction
    pred = model.predict(spectro)
    # Decode the prediction into text
    output_text = decode_prediction(pred)
    # Return output
    return output_text
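
# Example usage (hypothetical file name, not in the original):
#
#   text = AudioToTextUsingModel("recording.wav")
#   print(text[0])  # decode_prediction returns one string per batch item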
def AudioToTextUsingAPI(audio_file):
    # Use the audio file as the audio source
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        # Read the whole audio file; record is used here instead of listen
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')
    except sr.RequestError as e:
        print(f'Could not request results from Google Speech Recognition service; {e}')
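
# Minimal driver sketch (assumptions not in the original: a local PCM WAV
# file named "recording.wav" and network access for the Google API).
if __name__ == "__main__":
    wav = "recording.wav"  # hypothetical input file
    print("Model:", AudioToTextUsingModel(wav)[0])
    print("API  :", AudioToTextUsingAPI(wav))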