Spaces:
Sleeping
Sleeping
import gradio as gr | |
import tensorflow as tf | |
import librosa | |
import numpy as np | |
from huggingface_hub import hf_hub_download | |
# Front-end settings shared by audio loading and Mel feature extraction.
sampling_rate = 16000  # sample rate requested when audio is loaded
n_fft = 512  # FFT window length
hop_length = 160  # number of samples between successive frames
n_mels = 80  # number of Mel bands
fmin = 0.0  # minimum frequency
fmax = 8000.0  # maximum frequency
def extract_mel_spectrogram(audio) -> np.ndarray:
    """Return the decibel-scaled Mel spectrogram of an audio signal.

    A power (``power=2.0``) Mel spectrogram is computed with the module-level
    front-end settings and then converted to decibels relative to its own
    maximum value. No channel axis is appended to the result.

    Args:
        audio: Audio samples handed to librosa as ``y``.
            NOTE(review): presumably a mono float array sampled at
            ``sampling_rate`` -- confirm against callers.

    Returns:
        The decibel-scaled Mel spectrogram produced by librosa.
    """
    mel_settings = dict(
        sr=sampling_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        power=2.0,
    )
    mel_power = librosa.feature.melspectrogram(y=audio, **mel_settings)
    return librosa.power_to_db(mel_power, ref=np.max)
# Fetch the trained Keras checkpoint from the Hugging Face Hub, then load it
# once at import time so every transcription request reuses the same model.
model_path = hf_hub_download(
    filename="kobraspeech.17-40.19.keras",
    repo_id="kobrasoft/kobraspeech-rnn-cs",
)
model = tf.keras.models.load_model(model_path)
def decode_batch_predictions(pred):
    """Decode a batch of CTC model outputs into text with greedy search.

    Args:
        pred: Model output for a batch. Its first axis is used as the batch
            size and its second axis as the number of time steps.
            NOTE(review): presumably (batch, time, classes) softmax scores --
            confirm against the model's output layer.

    Returns:
        A list with one decoded string per batch item.
    """
    # Every item is decoded over the full time axis (no per-item lengths).
    input_len = np.ones(pred.shape[0]) * pred.shape[1]

    # Use the backend of the already-imported ``tf`` module: a bare ``keras``
    # name is never imported in this file and would raise NameError.
    # Greedy search is used; beam search is an option for harder tasks.
    decoded, _ = tf.keras.backend.ctc_decode(
        pred, input_length=input_len, greedy=True
    )
    results = decoded[0]

    # NOTE(review): ``label_to_string`` is not defined in the visible source;
    # it must map one decoded label sequence back to text -- confirm it exists.
    return [label_to_string(result) for result in results]
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* to text.

    Args:
        audio_path: Path of the audio file to transcribe. A missing value
            (``None`` or an empty string), e.g. when the form is submitted
            without any recording, yields an empty transcription.

    Returns:
        The decoded text for the single utterance, or ``""`` when no audio
        was provided.
    """
    # Guard: with nothing to load, return early instead of failing inside
    # ``librosa.load`` with an opaque error.
    if not audio_path:
        return ""

    # Resample on load so the signal matches the configured sampling rate.
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    features = extract_mel_spectrogram(audio)

    # The model is fed a batch, so wrap the single utterance in a batch of one.
    batch = np.expand_dims(features, axis=0)
    prediction = model.predict(batch)

    # Decoding yields one string per batch item; only one item was submitted.
    return decode_batch_predictions(prediction)[0]
# Create Gradio interface
# ``gr.inputs`` is the deprecated legacy namespace (removed in Gradio 4), so
# the Audio component is taken from the top-level ``gr`` namespace instead.
# NOTE(review): Gradio 4+ renamed ``source="microphone"`` to
# ``sources=["microphone"]`` -- confirm against the pinned Gradio version.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Kobraspeech RNN ASR demo (cs)",
    description="Upload an audio file or record your voice to get the transcription.",
)

if __name__ == "__main__":
    iface.launch()