# -*- coding: utf-8 -*-
"""CondensedModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""
# IMPORTANT: environment setup
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# Note: pydub also needs the ffmpeg binary itself (e.g. apt-get install ffmpeg);
# the PyPI "ffmpeg" package does not provide it.
# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
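# Note (added for clarity): ctc_batch_cost expects per-sample lengths of shape
# (batch, 1); every padded sample is treated as full length here, so the scalar
# time/label lengths are broadcast across the batch via tf.ones.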
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer (output_dim + 1 classes: vocabulary plus CTC blank)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
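# Quick shape check (added for illustration, not part of the pipeline): with
# input_dim=193 and output_dim=31 as used below, a 50-frame input yields 25
# output steps (conv_1 halves the time axis, conv_2 leaves it unchanged) over
# 32 classes (output_dim + 1 for the CTC blank):
#   demo = build_model(input_dim=193, output_dim=31)
#   print(demo(tf.zeros((1, 50, 193))).shape)  # expected: (1, 25, 32)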
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
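# At the 22,050 Hz sample rate used below, these settings correspond to
# roughly 11.6 ms windows (256 / 22050) with a 7.3 ms hop (160 / 22050).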
# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
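# For reference, the lookups round-trip like this (index 0 is reserved for the
# OOV token, so 'a' maps to 1):
#   char_to_num(tf.strings.unicode_split("hi", "UTF-8"))  # -> [8, 9]
#   num_to_char([8, 9])                                   # -> [b'h', b'i']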
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
def loadWeights():
    # PATH TO CKPT
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'
    # Set Output
    output = "AudioToTextCKPT.hdf5"
    # Download only if the checkpoint is not already present
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)
    # Load CKPT into Model
    model.load_weights(output)
def load_wav(filename):
    # Load the file and resample it to 22,050 Hz
    wav, _ = librosa.load(filename, sr=22050)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Add a channel dimension: (samples,) -> (samples, 1)
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])
    return audio
# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
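# If greedy decoding proves too inaccurate, ctc_decode also supports beam
# search; a minimal variant (beam_width here is an illustrative choice):
#   results = keras.backend.ctc_decode(
#       pred, input_length=input_len, greedy=False, beam_width=100
#   )[0][0]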
def getSpectro(wav_file):
    ###########################################
    ## Process the Audio
    ###########################################
    audio = load_wav(wav_file)
    # Drop the channel dimension added in load_wav
    audio = tf.squeeze(audio, axis=-1)
    # Change type to float32
    audio = tf.cast(audio, tf.float32)
    # Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalise each frame across its frequency bins
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension for model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)
    return spectrogram
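# For reference: getSpectro returns shape (1, num_frames, fft_length // 2 + 1)
# = (1, num_frames, 193), matching the input_dim the model was built with.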
# Load Weights
loadWeights()

# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get Spectrogram
    spectro = getSpectro(wav_file)
    # Get Prediction
    pred = model.predict(spectro)
    # Decode the prediction into text
    output_text = decode_prediction(pred)
    # Return Output
    return output_text
def AudioToTextUsingAPI(audio_file):
    # Use the audio file as the audio source
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        # Read the entire audio file. Here we use record instead of listen.
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')
    except sr.RequestError as e:
        print('Could not request results from Google Speech Recognition service; {0}'.format(e))
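# Minimal usage sketch (added for illustration; "sample.wav" is a placeholder
# path, not part of the original script):
if __name__ == "__main__":
    print(AudioToTextUsingModel("sample.wav"))  # local DeepSpeech2-style model
    print(AudioToTextUsingAPI("sample.wav"))    # Google Web Speech API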