# -*- coding: utf-8 -*-
"""CondensedModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""
# IMPORTANT: install the runtime dependencies first
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# pip install librosa tensorflow numpy
# Note: pydub also needs the ffmpeg binary itself (e.g. apt-get install
# ffmpeg); the "ffmpeg" package on PyPI does not provide the binary.

# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists

# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
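
# Hedged sanity check (added; not part of the original notebook): CTCLoss
# expects integer label batches of shape (batch, max_label_len) and softmax
# outputs of shape (batch, time_steps, num_classes), and returns one loss
# value per batch element. All _demo values below are made up.
def _demo_ctc_loss():
    dummy_labels = tf.random.uniform((2, 5), maxval=30, dtype=tf.int64)
    dummy_probs = tf.nn.softmax(tf.random.normal((2, 10, 31)), axis=-1)
    print(CTCLoss(dummy_labels, dummy_probs).shape)  # -> (2, 1)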

# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model

# Window length in samples for each STFT frame.
frame_length = 256
# Number of samples to step between successive frames.
frame_step = 160
# Size of the FFT to apply. If not provided, tf.signal.stft uses the
# smallest power of 2 enclosing frame_length.
fft_length = 384
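# With these settings each STFT frame carries fft_length // 2 + 1 = 193
# frequency bins, which is exactly the input_dim used to build the model below.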

# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
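
# Hedged illustration (added; not in the original notebook): the two lookup
# layers are inverses, so a transcript can be round-tripped through them.
def _demo_vocab_roundtrip():
    ids = char_to_num(tf.strings.unicode_split("hello world", "UTF-8"))
    text = tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")
    print(ids.numpy(), "->", text)  # prints the ids and "hello world"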

# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
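
# Hedged shape check (added; not in the original notebook): conv_1 halves the
# time axis, so a 50-frame dummy spectrogram should yield about 25 output
# steps, each a distribution over output_dim + 1 classes (the extra class is
# the CTC blank).
def _demo_model_shape():
    dummy = np.zeros((1, 50, fft_length // 2 + 1), dtype="float32")
    print(model(dummy).shape)  # expected: (1, 25, 32)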

def loadWeights():
    # Google Drive link to the pretrained checkpoint
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'

    # Local filename for the downloaded checkpoint
    output = "AudioToTextCKPT.hdf5"

    # Download once; skip if the checkpoint is already present
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)

    # Load the checkpoint weights into the model
    model.load_weights(output)
    
def load_wav(filename):
    # Load the audio file, resampling to 22.05 kHz mono
    wav, _ = librosa.load(filename, sr=22050)

    # Convert to a float32 tensor shaped (num_samples, 1)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])

    return audio

# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

def getSpectro(wav_file):
    ############################################
    ##  Process the audio into a spectrogram
    ############################################
    # 1. Load the waveform and drop the channel axis
    audio = load_wav(wav_file)
    audio = tf.squeeze(audio, axis=-1)
    # 2. Make sure the samples are float32
    audio = tf.cast(audio, tf.float32)
    # 3. Compute the STFT spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # 4. Keep only the magnitude (tf.abs), compressed with a square root
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 5. Normalise each frame to zero mean and unit variance
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    # Add a batch axis so the result can be fed to model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)

    return spectrogram

# Load Weights (downloads the checkpoint on first run)
loadWeights()

# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get Spectrogram
    spectro = getSpectro(wav_file)

    # Get Prediction
    pred = model.predict(spectro)

    # Decode the prediction to text (a list with one transcription per item)
    output_text = decode_prediction(pred)

    # Return Output
    return output_text

def AudioToTextUsingAPI(audio_file):
    r = sr.Recognizer()

    # Use the audio file as the audio source; record() reads the whole file
    # at once, unlike listen(), which waits for a phrase
    with sr.AudioFile(audio_file) as source:
        audio = r.record(source)

    try:
        return r.recognize_google(audio)

    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')

    except sr.RequestError as e:
        print('Could not request results from Google Speech Recognition service; {0}'.format(e))

    # Fall through with None if recognition failed
    return None
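
# Example usage (a hedged sketch; "sample.wav" is a hypothetical WAV file,
# not shipped with this notebook):
if __name__ == "__main__":
    wav_path = "sample.wav"
    if exists(wav_path):
        print("Model:", AudioToTextUsingModel(wav_path))
        print("API  :", AudioToTextUsingAPI(wav_path))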