kozak-vaclav committed on
Commit 8e20a50
1 Parent(s): f9bfdc6

Update app.py

Files changed (1)
1. app.py +46 -14
app.py CHANGED
@@ -1,31 +1,63 @@
 import gradio as gr
 import tensorflow as tf
-from transformers import Wav2Vec2Processor, TFWav2Vec2Model
 import librosa
+import numpy as np
+from huggingface_hub import hf_hub_download

-# Load the model and processor
-processor = Wav2Vec2Processor.from_pretrained("openai/whisper-tiny")
-model = TFWav2Vec2Model.from_pretrained("kobrasoft/kobraspeech-rnn-cs")
+# Mel spectrogram parameters
+n_fft = 512        # FFT window length
+hop_length = 160   # number of samples between successive frames
+n_mels = 80        # number of Mel bands
+fmin = 0.0         # minimum frequency
+fmax = 8000.0      # maximum frequency
+sampling_rate = 16000

-def transcribe(audio):
+def extract_mel_spectrogram(audio) -> np.ndarray:
+    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sampling_rate, hop_length=hop_length,
+                                                 n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0)
+    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
+    # spectrogram = np.expand_dims(spectrogram, axis=-1)  # add a channel dimension for the model
+    return spectrogram
+
+# Download the model from the Hugging Face Hub
+model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="kobraspeech.17-40.19.keras")
+model = tf.keras.models.load_model(model_path)
+
+def decode_batch_predictions(pred):
+    input_len = np.ones(pred.shape[0]) * pred.shape[1]
+    # Use greedy search; for complex tasks, beam search can be used instead
+    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
+    # Iterate over the results and map the label ids back to text
+    output_text = []
+    for result in results:
+        result = label_to_string(result)
+        output_text.append(result)
+    return output_text
+
+def transcribe(audio_path):
     # Load audio
-    audio, rate = librosa.load(audio, sr=16000)
+    audio, _ = librosa.load(audio_path, sr=sampling_rate)
+
+    # Extract features
+    features = extract_mel_spectrogram(audio)
+
+    # The model expects a batch dimension
+    features = np.expand_dims(features, axis=0)
+
+    # Predict
+    prediction = model.predict(features)

-    # Process audio
-    inputs = processor(audio, sampling_rate=rate, return_tensors="tf", padding="longest")
-    logits = model(inputs.input_values).logits
+    # Decode the prediction into text
+    transcription = decode_batch_predictions(prediction)

-    # Decode the logits
-    predicted_ids = tf.argmax(logits, axis=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-    return transcription
+    return transcription[0]

 # Create Gradio interface
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.inputs.Audio(source="microphone", type="filepath"),
     outputs="text",
-    title="ASR Model Demo",
+    title="Kobraspeech RNN ASR demo (cs)",
     description="Upload an audio file or record your voice to get the transcription."
 )
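
One gap remains in the new version: decode_batch_predictions calls label_to_string, which is never defined in app.py, so the first transcription request would raise a NameError unless the helper exists elsewhere in the repo. Its job is to map each decoded sequence of CTC label ids back to characters. A minimal sketch follows; the vocab list is only an illustrative assumption, since the real label set is fixed by the training pipeline of kobraspeech-rnn-cs:

import numpy as np

# Hypothetical helper: this character vocabulary is an assumption for
# illustration, not the model's actual label set.
vocab = list("abcdefghijklmnopqrstuvwxyzáčďéěíňóřšťúůýž ")

def label_to_string(result) -> str:
    # tf.keras.backend.ctc_decode pads each decoded sequence with -1,
    # so drop the padding before mapping ids to characters.
    ids = result.numpy() if hasattr(result, "numpy") else np.asarray(result)
    return "".join(vocab[i] for i in ids if 0 <= i < len(vocab))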
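
Two smaller notes on the unchanged lines: gr.inputs.Audio(source="microphone", type="filepath") is the legacy Gradio 3.x input API, and the script needs a launch call to actually serve the interface. A sketch of the equivalent on Gradio 4.x (an assumption about the target Gradio version, not part of this commit):

# Gradio 4.x equivalent of the legacy gr.inputs.Audio component
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Kobraspeech RNN ASR demo (cs)",
    description="Upload an audio file or record your voice to get the transcription.",
)
iface.launch()  # serve the app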