Spaces:

kobrasoft
/

kobraspeech-rnn-cs

Sleeping

App Files Files Community

kobraspeech-rnn-cs / app.py

kozak-vaclav

Update app.py

d471c64 verified 18 days ago

raw

history blame contribute delete

No virus

3.48 kB

	import gradio as gr
	import tensorflow as tf
	import librosa
	import numpy as np
	from huggingface_hub import hf_hub_download, from_pretrained_keras

	# --- Mel-spectrogram feature configuration ---
	# Sample rate used throughout this module.
	sampling_rate = 16000
	# FFT window length.
	n_fft = 512
	# Number of samples between successive frames.
	hop_length = 160
	# Number of Mel bands.
	n_mels = 80
	# Minimum and maximum frequency of the Mel filter bank.
	fmin, fmax = 0.0, 8000.0

	def extract_mel_spectrogram(audio) -> np.ndarray:
	    """Convert a waveform into a decibel-scaled Mel spectrogram.

	    Args:
	        audio: Waveform samples, handed to librosa as ``y``. Presumably
	            sampled at ``sampling_rate`` -- TODO confirm at call sites.

	    Returns:
	        The power Mel spectrogram converted to dB, using the spectrogram's
	        own maximum as the reference value.
	    """
	    mel_options = dict(
	        sr=sampling_rate,
	        hop_length=hop_length,
	        n_fft=n_fft,
	        n_mels=n_mels,
	        fmin=fmin,
	        fmax=fmax,
	        power=2.0,
	    )
	    mel_power = librosa.feature.melspectrogram(y=audio, **mel_options)
	    # No channel dimension is appended; the array is returned as librosa
	    # produced it.
	    return librosa.power_to_db(mel_power, ref=np.max)

	def CTCLoss(y_true, y_pred):
	    """Compute the training-time CTC loss value.

	    Args:
	        y_true: Label tensor. Entries equal to -1 are not counted when the
	            label lengths are derived (they act as padding).
	        y_pred: Model output. It is reduced over axis 2, so it is assumed
	            to be rank 3, presumably (batch, time, classes) -- TODO confirm.

	    Returns:
	        Whatever ``tf.keras.backend.ctc_batch_cost`` returns for the batch.
	    """
	    # A time step counts towards the input length only when its largest
	    # output value is non-zero.
	    step_is_active = tf.not_equal(tf.reduce_max(y_pred, axis=2), 0)
	    input_length = tf.math.reduce_sum(
	        tf.cast(step_is_active, dtype="int64"), axis=1, keepdims=True
	    )

	    # Count the label entries that are not the -1 padding value.
	    label_is_real = tf.not_equal(y_true, -1)
	    label_length = tf.math.reduce_sum(
	        tf.cast(label_is_real, dtype="int64"), axis=1, keepdims=True
	    )

	    return tf.keras.backend.ctc_batch_cost(
	        y_true, y_pred, input_length, label_length
	    )

	# Load the Keras model directly from the Hugging Face Hub.
	# A disabled alternative used hf_hub_download to fetch "saved_model.pb" and
	# tf.keras.models.load_model inside a custom_object_scope that registered
	# CTCLoss; from_pretrained_keras is what actually runs.
	model = from_pretrained_keras(
	    "kobrasoft/kobraspeech-rnn-cs",
	)

	import pickle as pkl

	# Fetch the serialized vocabulary from the same Hub repository as the model.
	# NOTE(review): the file is named "num_to_char.json" but is read with
	# pickle. Unpickling can execute arbitrary code, so this is only safe while
	# the Hub repository is trusted -- consider shipping real JSON instead.
	num_to_char_path = hf_hub_download(
	    repo_id="kobrasoft/kobraspeech-rnn-cs",
	    filename="num_to_char.json",
	)
	with open(num_to_char_path, "rb") as vocab_file:
	    vocabulary = pkl.load(vocab_file)

	# Inverted lookup layer: turns integer indices back into vocabulary strings,
	# with "" configured as the out-of-vocabulary token.
	num_to_char = tf.keras.layers.StringLookup(
	    vocabulary=vocabulary,
	    oov_token="",
	    invert=True,
	)

	def label_to_string(label):
	    """Turn a sequence of class indices into a Python string.

	    Each index is mapped through the module-level ``num_to_char`` lookup,
	    the pieces are joined into one string tensor, and the resulting bytes
	    are decoded.
	    """
	    characters = num_to_char(label)
	    joined = tf.strings.reduce_join(characters)
	    return joined.numpy().decode()

	def decode_batch_predictions(pred):
	    """Greedy-decode a batch of CTC predictions into strings.

	    Args:
	        pred: Batched model output. ``pred.shape[0]`` is used as the batch
	            size and ``pred.shape[1]`` as the length of every sequence.

	    Returns:
	        A list with one decoded string per item in the batch.
	    """
	    batch_size, sequence_length = pred.shape[0], pred.shape[1]
	    # Every item is decoded over the full sequence length.
	    input_len = np.ones(batch_size) * sequence_length

	    # Greedy search; beam search is an option for harder tasks.
	    decoded_paths = tf.keras.backend.ctc_decode(
	        pred, input_length=input_len, greedy=True
	    )[0]
	    best_path = decoded_paths[0]

	    # Map each decoded label sequence back to text.
	    return [label_to_string(sequence) for sequence in best_path]

	def transcribe(audio_path):
	    """Transcribe the audio file at ``audio_path`` into text.

	    Args:
	        audio_path: Filesystem path of the audio to transcribe. May be
	            ``None`` or empty when no audio was supplied (for example when
	            the form is submitted without a recording or upload).

	    Returns:
	        The decoded text for the clip, or ``""`` when no audio was given.
	    """
	    # Guard: without a path there is nothing to load, and librosa.load
	    # would fail with an opaque error.
	    if not audio_path:
	        return ""

	    # Load the audio, resampled to the rate the features are computed at.
	    audio, _ = librosa.load(audio_path, sr=sampling_rate)

	    # Extract features.
	    # NOTE(review): librosa returns the spectrogram as (n_mels, frames);
	    # confirm the model expects this layout rather than (frames, n_mels).
	    features = extract_mel_spectrogram(audio)

	    # The model expects a leading batch dimension.
	    features = np.expand_dims(features, axis=0)

	    prediction = model.predict(features)

	    # Decode the batch of one and return its only transcription.
	    transcription = decode_batch_predictions(prediction)
	    return transcription[0]

	# Build the UI: one tab for microphone input and one for uploaded files.
	# Both tabs call the same transcribe() function and show its text output.
	demo = gr.Blocks()

	mic_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=gr.Audio(sources="microphone", type="filepath"),
	    outputs=gr.Textbox(),
	)

	file_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=gr.Audio(sources="upload", type="filepath"),
	    outputs=gr.Textbox(),
	)

	with demo:
	    gr.TabbedInterface(
	        [mic_transcribe, file_transcribe],
	        ["Transcribe Microphone", "Transcribe Audio File"],
	    )

	# Launch exactly once, and only when run as a script. `demo` is the only
	# app object defined in this file, so it is the one that must be launched;
	# launching at import time as well would start the server twice.
	if __name__ == "__main__":
	    demo.launch(debug=True)