ravi.naik committed on
Commit
aa2482a
1 Parent(s): 667ae00

Fixed audio sampling issues

Browse files
Files changed (3) hide show
  1. app.py +4 -2
  2. inference/main.py +9 -1
  3. requirements.txt +3 -1
app.py CHANGED
@@ -94,7 +94,9 @@ with gr.Blocks() as demo:
94
  with gr.Row():
95
  # Add audio
96
  audio_upload = gr.Audio(source="upload", type="filepath")
97
- audio_mic = gr.Audio(source="microphone", type="filepath")
 
 
98
 
99
  with gr.Column(scale=8):
100
  with gr.Box():
@@ -123,4 +125,4 @@ with gr.Blocks() as demo:
123
  outputs=[prompt, image, audio_upload, audio_mic, chatbot],
124
  )
125
 
126
- demo.launch()
 
94
  with gr.Row():
95
  # Add audio
96
  audio_upload = gr.Audio(source="upload", type="filepath")
97
+ audio_mic = gr.Audio(
98
+ source="microphone", type="filepath", format="mp3"
99
+ )
100
 
101
  with gr.Column(scale=8):
102
  with gr.Box():
 
125
  outputs=[prompt, image, audio_upload, audio_mic, chatbot],
126
  )
127
 
128
+ demo.launch(server_port=8881)
inference/main.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import torch
2
  from transformers import (
3
  AutoTokenizer,
@@ -47,8 +49,14 @@ class WhisperWithProjection:
47
  # self.audio_language_connector = AudioLanguageConnector(projection_dim)
48
 
49
  def __call__(self, audio):
 
 
 
 
 
 
50
  input_features = self.processor(
51
- audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
52
  ).input_features
53
  # generate token ids
54
  predicted_ids = self.model.generate(input_features.to(self.device))
 
1
+ import soundfile as sf
2
+ import librosa
3
  import torch
4
  from transformers import (
5
  AutoTokenizer,
 
49
  # self.audio_language_connector = AudioLanguageConnector(projection_dim)
50
 
51
  def __call__(self, audio):
52
+ array, sampling_rate = sf.read(audio)
53
+ resampled_array = librosa.resample(
54
+ array,
55
+ orig_sr=sampling_rate,
56
+ target_sr=16000,
57
+ )
58
  input_features = self.processor(
59
+ resampled_array, sampling_rate=16000, return_tensors="pt"
60
  ).input_features
61
  # generate token ids
62
  predicted_ids = self.model.generate(input_features.to(self.device))
requirements.txt CHANGED
@@ -16,4 +16,6 @@ transformers==4.36.2
16
  accelerate==0.21.0
17
  bitsandbytes==0.41.0
18
  scikit-learn==1.2.2
19
- sentencepiece==0.1.99
 
 
 
16
  accelerate==0.21.0
17
  bitsandbytes==0.41.0
18
  scikit-learn==1.2.2
19
+ sentencepiece==0.1.99
20
+ librosa
21
+ soundfile