# -*- coding: utf-8 -*-
# IMPORTS
import os
import numpy as np
import requests
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# MODEL CONFIGURATION
# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers.
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to the original characters.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
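# Illustrative round trip through the two lookup layers (the sample string
# below is just an example; index 0 is reserved for the OOV token):
#   ids = char_to_num(tf.strings.unicode_split("hello", input_encoding="UTF-8"))
#   tf.strings.reduce_join(num_to_char(ids))  # -> b'hello'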
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
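# With fft_length = 384, tf.signal.stft yields fft_length // 2 + 1 = 193
# frequency bins per frame; this is the input_dim passed to build_model below.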
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value.
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
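# Shape sketch (illustrative): y_pred is (batch, time, classes) from the
# softmax layer and y_true is (batch, max_label_len). ctc_batch_cost returns
# one loss value per batch element, assuming every sample uses the full
# padded time and label lengths.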
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return.
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
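# Shape walk-through (illustrative, for input_dim = 193): each Conv2D with
# padding="same" and stride 2 on the frequency axis gives ceil(193 / 2) = 97,
# then ceil(97 / 2) = 49 bins; with 32 filters, the Reshape feeds the RNNs
# 49 * 32 = 1568 features per time step, while conv_1's time stride of 2
# roughly halves the number of time steps.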
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
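# Note: vocabulary_size() is len(characters) + 1 (index 0 is the OOV token),
# and the classification layer above adds one more unit for the CTC blank.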
# GET TEXT FROM MODEL PREDICTION
# A utility function to decode the output of the network.
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text.
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
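# To try beam search instead of greedy decoding (beam_width here is an
# illustrative value, not a tuned one):
#   keras.backend.ctc_decode(pred, input_length=input_len, greedy=False, beam_width=100)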
# PATH TO CKPT
# Google Drive share link. Note: fetching the 'view' URL returns an HTML
# page rather than the file itself; the uc?export=download form below serves
# the raw bytes (large files may still hit Drive's virus-scan confirmation
# page, in which case a helper such as the gdown package is more robust).
file_id = '14mT_wJMuIqUEJSS12aAc6bnPCjYuLWGf'
ckpt_link = f'https://drive.google.com/uc?export=download&id={file_id}'
# Define the local filename to save data.
local_file = 'AudioToTextCKPT.hdf5'
# Make an HTTP request for the remote file data.
data = requests.get(ckpt_link)
# Save the file data to a local copy.
with open(local_file, 'wb') as file:
    file.write(data.content)
ckpt = local_file
# LOAD CKPT TO MODEL
model.load_weights(ckpt)
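# load_weights expects the architecture built above (layer names, rnn_units,
# input_dim) to match the checkpoint exactly; any mismatch raises an error here.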
# CONVERT AUDIO TO TEXT
def AudioToText(wav_file):
    ###########################################
    ## Process the audio
    ###########################################
    # 1. Read the wav file.
    file = tf.io.read_file(wav_file)
    # 2. Decode the wav file.
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    # 3. Change the type to float.
    audio = tf.cast(audio, tf.float32)
    # 4. Get the spectrogram.
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # 5. We only need the magnitude, which can be derived by applying tf.abs.
    spectrogram = tf.abs(spectrogram)
    # Taking the square root compresses the dynamic range of the magnitudes.
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 6. Per-frame mean/variance normalisation.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # 7. Add a batch dimension: the model expects (batch, time, features).
    spectrogram = tf.expand_dims(spectrogram, axis=0)
    pred = model.predict(spectrogram)
    output_text = decode_batch_predictions(pred)
    return output_text
# Test the model on a sample file.
print(AudioToText('testWav.wav'))