"""Gradio demo for the Kobraspeech RNN speech-recognition model."""

import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
from huggingface_hub import hf_hub_download

# Mel Spectrogram parameters
n_fft = 512  # FFT window length
hop_length = 160  # number of samples between successive frames
n_mels = 80  # Number of Mel bands
fmin = 0.0  # Minimum frequency
fmax = 8000.0  # Maximum frequency
sampling_rate = 16000


def extract_mel_spectrogram(audio: np.ndarray) -> np.ndarray:
    """Return the log-scaled (dB) Mel spectrogram of *audio*.

    The power spectrogram is computed with the module-level Mel parameters
    and converted to decibels relative to its own maximum (``ref=np.max``).
    No channel axis is appended; the array is returned as librosa produces it.

    Args:
        audio: Audio samples, expected to be sampled at ``sampling_rate``.

    Returns:
        The dB-scaled Mel spectrogram.
    """
    spectrogram = librosa.feature.melspectrogram(
        y=audio,
        sr=sampling_rate,
        hop_length=hop_length,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        power=2.0,
    )
    return librosa.power_to_db(spectrogram, ref=np.max)


# Download model from Hugging Face Hub
model_path = hf_hub_download(
    repo_id="kobrasoft/kobraspeech-rnn-cs",
    filename="kobraspeech.17-40.19.keras",
)
model = tf.keras.models.load_model(model_path)


def decode_batch_predictions(pred):
    """Decode a batch of CTC network outputs into text.

    Args:
        pred: Model output; axis 0 is treated as the batch and axis 1 as the
            sequence length passed to the CTC decoder.

    Returns:
        A list with one decoded string per batch element.
    """
    # Every sample in the batch is decoded over the full output length.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    # The decoder is reached through `tf.keras`, because a bare `keras`
    # name is never imported in this file.
    results = tf.keras.backend.ctc_decode(
        pred, input_length=input_len, greedy=True
    )[0][0]
    # NOTE(review): `label_to_string` is not defined in the visible source;
    # it must be provided elsewhere (label ids -> text) for this to run.
    return [label_to_string(result) for result in results]


def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* and return the text.

    Args:
        audio_path: Path to an audio file. A falsy value (``None`` or an
            empty string, e.g. when nothing was recorded) yields ``""``.

    Returns:
        The transcription of the first (and only) item in the batch.
    """
    if not audio_path:
        # Nothing was recorded/submitted; avoid crashing inside librosa.
        return ""

    # Load audio, resampled to the rate the features are computed at.
    audio, _ = librosa.load(audio_path, sr=sampling_rate)

    # Extract features and add the batch dimension the model expects.
    # NOTE(review): the spectrogram is passed in librosa's axis order;
    # confirm this matches the layout the model was trained on.
    features = np.expand_dims(extract_mel_spectrogram(audio), axis=0)

    prediction = model.predict(features)
    transcription = decode_batch_predictions(prediction)
    return transcription[0]


# Create Gradio interface
# NOTE(review): `gr.inputs.Audio(source=...)` is the legacy Gradio input
# namespace; verify it against the installed Gradio version.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Kobraspeech RNN ASR demo (cs)",
    description="Upload an audio file or record your voice to get the transcription.",
)

if __name__ == "__main__":
    iface.launch()