# -*- coding: utf-8 -*-
"""CondensedModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""
# IMPORTANT: environment setup
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# Note: pydub also needs the ffmpeg binary itself (e.g. apt-get install ffmpeg);
# the PyPI "ffmpeg" package does not provide it.
# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
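# Note (added for clarity): ctc_batch_cost expects per-sample lengths of shape
# (batch, 1); every padded sample is treated as full length here, so the scalar
# time/label lengths are broadcast across the batch via tf.ones.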
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer (output_dim + 1 classes: vocabulary plus CTC blank)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
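# Quick shape check (added for illustration, not part of the pipeline): with
# input_dim=193 and output_dim=31 as used below, a 50-frame input yields 25
# output steps (conv_1 halves the time axis, conv_2 leaves it unchanged) over
# 32 classes (output_dim + 1 for the CTC blank):
#   demo = build_model(input_dim=193, output_dim=31)
#   print(demo(tf.zeros((1, 50, 193))).shape)  # expected: (1, 25, 32)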
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
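# At the 22,050 Hz sample rate used below, these settings correspond to
# roughly 11.6 ms windows (256 / 22050) with a 7.3 ms hop (160 / 22050).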
# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
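# For reference, the lookups round-trip like this (index 0 is reserved for the
# OOV token, so 'a' maps to 1):
#   char_to_num(tf.strings.unicode_split("hi", "UTF-8"))  # -> [8, 9]
#   num_to_char([8, 9])                                   # -> [b'h', b'i']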
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
def loadWeights():
    # PATH TO CKPT
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'
    # Set Output
    output = "AudioToTextCKPT.hdf5"
    # Download only if the checkpoint is not already present
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)
    # Load CKPT into Model
    model.load_weights(output)
def load_wav(filename):
    # Load the file and resample it to 22,050 Hz
    wav, _ = librosa.load(filename, sr=22050)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Add a channel dimension: (samples,) -> (samples, 1)
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])
    return audio
# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
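# If greedy decoding proves too inaccurate, ctc_decode also supports beam
# search; a minimal variant (beam_width here is an illustrative choice):
#   results = keras.backend.ctc_decode(
#       pred, input_length=input_len, greedy=False, beam_width=100
#   )[0][0]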
def getSpectro(wav_file):
    ###########################################
    ## Process the Audio
    ###########################################
    audio = load_wav(wav_file)
    # Drop the channel dimension added in load_wav
    audio = tf.squeeze(audio, axis=-1)
    # Change type to float32
    audio = tf.cast(audio, tf.float32)
    # Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalise each frame across its frequency bins
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension for model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)
    return spectrogram
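# For reference: getSpectro returns shape (1, num_frames, fft_length // 2 + 1)
# = (1, num_frames, 193), matching the input_dim the model was built with.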
# Load Weights
loadWeights()

# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get Spectrogram
    spectro = getSpectro(wav_file)
    # Get Prediction
    pred = model.predict(spectro)
    # Decode the prediction into text
    output_text = decode_prediction(pred)
    # Return Output
    return output_text
def AudioToTextUsingAPI(audio_file):
    # Use the audio file as the audio source
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        # Read the entire audio file. Here we use record instead of listen.
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')
    except sr.RequestError as e:
        print('Could not request results from Google Speech Recognition service; {0}'.format(e))
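# Minimal usage sketch (added for illustration; "sample.wav" is a placeholder
# path, not part of the original script):
if __name__ == "__main__":
    print(AudioToTextUsingModel("sample.wav"))  # local DeepSpeech2-style model
    print(AudioToTextUsingAPI("sample.wav"))    # Google Web Speech API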