Spaces:

szhang99
/

fire-coml-summer-2022

Runtime error

fire-coml-summer-2022 / AudioToText /audiotospeech.py

Steven Zhang

reset commit

a210e7f over 2 years ago

5.78 kB

	# -- coding: utf-8 --

	# IMPORTS
	import os
	import numpy as np
	import requests
	import tensorflow as tf
	from tensorflow import keras
	from tensorflow.keras import layers

	# MODEL STUFF
	# Transcription alphabet: lowercase letters, apostrophe, question mark,
	# exclamation mark and space, one list entry per symbol.
	characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
	# Forward lookup layer (character -> integer id); "" is the out-of-vocabulary token.
	char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
	# Inverse lookup layer (integer id -> character) sharing the forward vocabulary.
	num_to_char = keras.layers.StringLookup(
	    invert=True,
	    oov_token="",
	    vocabulary=char_to_num.get_vocabulary(),
	)


	# Short-time Fourier transform settings, passed to tf.signal.stft in AudioToText.
	# They are plain Python ints:
	#   frame_length - window length in samples
	#   frame_step   - number of samples between successive windows
	#   fft_length   - FFT size applied to each window
	frame_length, frame_step, fft_length = 256, 160, 384

	# MODEL LOSS
	def CTCLoss(y_true, y_pred):
	    """Return the per-sample CTC loss used while training the model.

	    Every sample in the batch is given the full time dimension of
	    ``y_pred`` as its input length and the full second dimension of
	    ``y_true`` as its label length.

	    Args:
	        y_true: Batch of integer label sequences.
	        y_pred: Batch of per-timestep class probabilities from the model.

	    Returns:
	        The tensor produced by ``keras.backend.ctc_batch_cost``.
	    """
	    batch_size = tf.cast(tf.shape(y_true)[0], dtype="int64")
	    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
	    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

	    # Broadcast the scalar lengths into (batch, 1) columns, as ctc_batch_cost expects.
	    column_shape = (batch_size, 1)
	    per_sample_input_len = time_steps * tf.ones(shape=column_shape, dtype="int64")
	    per_sample_label_len = label_steps * tf.ones(shape=column_shape, dtype="int64")

	    return keras.backend.ctc_batch_cost(
	        y_true, y_pred, per_sample_input_len, per_sample_label_len
	    )

	# BUILD MODEL
	def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
	    """Build and compile a DeepSpeech2-style acoustic model.

	    The network is two strided 2D convolutions (each followed by batch
	    normalisation and ReLU), a stack of bidirectional GRUs, one dense ReLU
	    layer and a softmax classifier. It is compiled with Adam and ``CTCLoss``.

	    Args:
	        input_dim: Number of features per spectrogram frame.
	        output_dim: Vocabulary size; the classifier has ``output_dim + 1`` units.
	        rnn_layers: Number of bidirectional GRU layers.
	        rnn_units: Units per GRU direction; the dense layer uses twice this.

	    Returns:
	        The compiled ``keras.Model`` named ``"DeepSpeech_2"``.
	    """
	    input_spectrogram = layers.Input((None, input_dim), name="input")
	    # Add a trailing channel axis so the spectrogram can go through Conv2D.
	    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)

	    # (layer name, kernel size, strides) for each convolution stage, in order.
	    conv_specs = (
	        ("conv_1", [11, 41], [2, 2]),
	        ("conv_2", [11, 21], [1, 2]),
	    )
	    for conv_name, kernel, stride in conv_specs:
	        x = layers.Conv2D(
	            filters=32,
	            kernel_size=kernel,
	            strides=stride,
	            padding="same",
	            use_bias=False,
	            name=conv_name,
	        )(x)
	        x = layers.BatchNormalization(name=f"{conv_name}_bn")(x)
	        x = layers.ReLU(name=f"{conv_name}_relu")(x)

	    # Merge the feature and channel axes so each time step is one vector.
	    flattened_features = x.shape[-2] * x.shape[-1]
	    x = layers.Reshape((-1, flattened_features))(x)

	    # Recurrent stack; dropout goes between GRU layers but not after the last.
	    for layer_idx in range(1, rnn_layers + 1):
	        gru = layers.GRU(
	            units=rnn_units,
	            activation="tanh",
	            recurrent_activation="sigmoid",
	            use_bias=True,
	            return_sequences=True,
	            reset_after=True,
	            name=f"gru_{layer_idx}",
	        )
	        x = layers.Bidirectional(
	            gru, name=f"bidirectional_{layer_idx}", merge_mode="concat"
	        )(x)
	        if layer_idx < rnn_layers:
	            x = layers.Dropout(rate=0.5)(x)

	    # Fully connected head.
	    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
	    x = layers.ReLU(name="dense_1_relu")(x)
	    x = layers.Dropout(rate=0.5)(x)
	    # One class more than the vocabulary (presumably the CTC blank symbol).
	    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

	    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
	    model.compile(
	        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
	        loss=CTCLoss,
	    )
	    return model

	# GET AND INSTANTIATE MODEL
	# The model consumes one feature per STFT bin (fft_length // 2 + 1) and
	# predicts over the character vocabulary, with 512 units per GRU direction.
	model = build_model(
	    rnn_units=512,
	    output_dim=char_to_num.vocabulary_size(),
	    input_dim=fft_length // 2 + 1,
	)


	# GET TEXT FROM MODEL PREDICTION
	def decode_batch_predictions(pred):
	    """Turn a batch of network outputs into a list of transcription strings.

	    Args:
	        pred: Array of model outputs whose first axis is the batch and whose
	            second axis is time.

	    Returns:
	        A list with one decoded UTF-8 string per batch item.
	    """
	    # Every item is decoded over the full time axis.
	    sequence_lengths = np.full(pred.shape[0], pred.shape[1], dtype=np.float64)
	    # Greedy CTC decoding; beam search is the alternative for harder tasks.
	    decoded_paths, _ = keras.backend.ctc_decode(
	        pred, input_length=sequence_lengths, greedy=True
	    )
	    best_path = decoded_paths[0]
	    # Map ids back to characters and join them into one string per item.
	    return [
	        tf.strings.reduce_join(num_to_char(ids)).numpy().decode("utf-8")
	        for ids in best_path
	    ]


	# PATH TO CKPT
	# Google Drive share link for the trained weights. This "/view" URL serves
	# an HTML viewer page, not the file, so it is converted to a direct-download
	# URL before fetching.
	ckpt_link = 'https://drive.google.com/file/d/14mT_wJMuIqUEJSS12aAc6bnPCjYuLWGf/view?usp=sharing'

	# Local filename the weights are cached under.
	local_file = 'AudioToTextCKPT.hdf5'

	# 8-byte signature at the start of an HDF5 file.
	# NOTE(review): assumes the checkpoint has no HDF5 user block, which is the
	# usual layout for Keras-written files - confirm.
	_HDF5_SIGNATURE = b'\x89HDF\r\n\x1a\n'


	def _drive_download_url(share_link):
	    """Convert a Drive ``/file/d/<id>/view`` share link to a download URL."""
	    marker = '/file/d/'
	    if marker not in share_link:
	        raise ValueError(f'Not a Google Drive file share link: {share_link!r}')
	    file_id = share_link.split(marker, 1)[1].split('/', 1)[0]
	    # NOTE(review): "confirm=t" is meant to skip Drive's large-file
	    # interstitial. Drive may still answer with HTML, which the signature
	    # check in _fetch_checkpoint reports as an error.
	    return f'https://drive.google.com/uc?export=download&confirm=t&id={file_id}'


	def _is_hdf5(path):
	    """Return True when *path* is readable and starts with the HDF5 signature."""
	    try:
	        with open(path, 'rb') as handle:
	            return handle.read(len(_HDF5_SIGNATURE)) == _HDF5_SIGNATURE
	    except OSError:
	        return False


	def _fetch_checkpoint(share_link, destination):
	    """Download the checkpoint to *destination*, validating it on the way.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        RuntimeError: If the downloaded payload is not an HDF5 file.
	    """
	    partial = destination + '.part'
	    # Stream to a temporary file so a failed download never replaces a good
	    # checkpoint and the whole file is never held in memory.
	    with requests.get(
	        _drive_download_url(share_link), stream=True, timeout=60
	    ) as response:
	        response.raise_for_status()
	        with open(partial, 'wb') as handle:
	            for chunk in response.iter_content(chunk_size=1 << 20):
	                handle.write(chunk)
	    if not _is_hdf5(partial):
	        os.remove(partial)
	        raise RuntimeError(
	            'Checkpoint download did not return an HDF5 file; check that the '
	            'Google Drive link is public and points at the weights file.'
	        )
	    os.replace(partial, destination)


	# Reuse a valid cached copy; otherwise fetch it (this also repairs a
	# previously saved HTML page).
	if not _is_hdf5(local_file):
	    _fetch_checkpoint(ckpt_link, local_file)

	ckpt = local_file


	# LOAD CKPT TO MODEL
	model.load_weights(ckpt)

	# CONVERT AUDIO TO TEXT
	def AudioToText(wav_file):
	    """Transcribe a WAV file with the loaded speech model.

	    The audio is decoded to a single channel, converted to a normalised
	    magnitude spectrogram, run through ``model`` as a batch of one and
	    decoded with ``decode_batch_predictions``.

	    Args:
	        wav_file: Path of the WAV file, as accepted by ``tf.io.read_file``.

	    Returns:
	        A list of transcription strings, one per batch item (a single
	        element, because one file is processed per call).
	    """
	    ###########################################
	    ## Process the Audio
	    ##########################################
	    # 1. Read and decode the wav file. Requesting one channel keeps the
	    #    squeeze below valid for multi-channel recordings as well.
	    file = tf.io.read_file(wav_file)
	    # NOTE(review): the sample rate is ignored and no resampling is done -
	    # confirm inputs match the rate the checkpoint was trained on.
	    audio, _ = tf.audio.decode_wav(file, desired_channels=1)
	    audio = tf.squeeze(audio, axis=-1)
	    # 2. Change type to float
	    audio = tf.cast(audio, tf.float32)
	    # 3. Get the spectrogram
	    spectrogram = tf.signal.stft(
	        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
	    )
	    # 4. Keep only the magnitude, compressed with a square root
	    spectrogram = tf.abs(spectrogram)
	    spectrogram = tf.math.pow(spectrogram, 0.5)
	    # 5. Normalise each frame over its frequency bins (axis 1)
	    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
	    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
	    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

	    # 6. The model input is (batch, time, features). Add the batch axis so
	    #    the frames are not taken to be separate samples.
	    batch = tf.expand_dims(spectrogram, axis=0)
	    pred = model.predict(batch)

	    return decode_batch_predictions(pred)


	# testing model
	# Smoke test: runs only when this file is executed directly, so importing
	# AudioToText from another module neither needs testWav.wav nor triggers a
	# prediction.
	if __name__ == "__main__":
	    print(AudioToText('testWav.wav'))