# -*- coding: utf-8 -*-
"""CondensedModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1d8zn9Gvp8xlKS2GRer_xzLWtm3InIBE_
"""
# IMPORTANT -- install these packages before running:
# pip install pydub
# pip install SpeechRecognition
# pip install gdown
# pip install librosa
# Note: ffmpeg is a system tool, not a Python package; install it with your
# OS package manager (e.g. apt-get install ffmpeg) rather than pip, so that
# pydub/librosa can decode audio files.
# IMPORTS
import gdown
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import speech_recognition as sr
from os.path import exists
# MODEL LOSS
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
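# Shape contract for CTCLoss (a descriptive note, not in the original file):
#   y_true: (batch, max_label_len) integer character ids
#   y_pred: (batch, time_steps, vocab_size + 1) per-frame softmax outputs
# ctc_batch_cost reserves the last class index for the CTC blank token, which
# is why build_model below emits output_dim + 1 units.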
# BUILD MODEL
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer (+1 output unit for the CTC blank token)
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
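# Quick shape sanity check (a minimal sketch, not in the original; the dummy
# batch is illustrative). conv_1's time stride of 2 halves the number of
# spectrogram frames, so 100 input frames yield 50 output steps:
#   m = build_model(input_dim=193, output_dim=31)
#   print(m(tf.zeros((1, 100, 193))).shape)  # -> (1, 50, 32)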
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
# The set of characters accepted in the transcription.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
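# Round-trip example (illustrative, not in the original): encode a transcript
# to integer ids and decode it back.
#   ids = char_to_num(tf.strings.unicode_split("hello", "UTF-8"))
#   text = tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")  # "hello"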
# GET AND INSTANTIATE MODEL
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
def loadWeights():
    # PATH TO CKPT
    ckpt_link = 'https://drive.google.com/file/d/1-300ZyFUvBh1VYWyUTXhrJ9hxAJAQQcy/view?usp=sharing'
    # Set Output
    output = "AudioToTextCKPT.hdf5"
    # Download only if the checkpoint is not already on disk
    if not exists(output):
        gdown.download(url=ckpt_link, output=output, quiet=False, fuzzy=True)
    # Load CKPT to Model
    model.load_weights(output)
def load_wav(filename):
    # Load the waveform as 22.05 kHz mono float samples
    wav, _ = librosa.load(filename, sr=22050)
    audio = tf.convert_to_tensor(wav, dtype=tf.float32)
    # Add a trailing channel dimension: (num_samples,) -> (num_samples, 1)
    audio = tf.reshape(audio, shape=[audio.shape[0], 1])
    return audio
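# Note (an assumption, not stated in the original): the 22050 Hz sample rate
# should match the rate the checkpoint was trained on; resampling on load
# keeps the STFT frame timing consistent with training.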
# A utility function to decode the output of the network
def decode_prediction(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
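# Note (descriptive, not in the original): ctc_decode collapses repeated
# characters and strips blank tokens, so a frame sequence like
# "hh-e-ll-llo" ("-" = blank) decodes to "hello".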
def getSpectro(wav_file):
    ###########################################
    ## Process the Audio
    ###########################################
    # Load the waveform and drop the channel dimension
    audio = load_wav(wav_file)
    audio = tf.squeeze(audio, axis=-1)
    # Change type to float
    audio = tf.cast(audio, tf.float32)
    # Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalization
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension for model.predict
    spectrogram = np.expand_dims(spectrogram, axis=0)
    return spectrogram
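# The returned array has shape (1, num_frames, fft_length // 2 + 1), i.e. a
# single-example batch matching the model's (None, 193) spectrogram input.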
# Load Weights
loadWeights()
# CONVERT AUDIO TO TEXT
def AudioToTextUsingModel(wav_file):
    # Get Spectrogram
    spectro = getSpectro(wav_file)
    # Get Prediction
    pred = model.predict(spectro)
    # Get Output
    output_text = decode_prediction(pred)
    # Return Output
    return output_text
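# Example usage (illustrative; "sample.wav" is a placeholder path, not a file
# shipped with this repo):
#   print(AudioToTextUsingModel("sample.wav"))  # -> ["predicted transcript"]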
def AudioToTextUsingAPI(audio_file):
    # use the audio file as the audio source
    r = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        # reads the audio file; here we use record instead of listen
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand audio')
    except sr.RequestError as e:
        print('Could not request results from Google Speech Recognition service; {0}'.format(e))
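# Example usage (illustrative; "sample.wav" is a placeholder path). This path
# sends the audio to Google's free web API via the SpeechRecognition package,
# so it needs network access:
#   print(AudioToTextUsingAPI("sample.wav"))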