Spaces:
Runtime error
Runtime error
fix: fix error when generating text
Browse files- app.py +1 -1
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/infer.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/infer.py +1 -1
- src/utils.py +6 -4
app.py
CHANGED
@@ -16,7 +16,7 @@ demo = gr.Interface(
|
|
16 |
choices=["indonesian","english"],
|
17 |
value="indonesian"),
|
18 |
gr.Audio(label="Speak", source="microphone", type="numpy"),
|
19 |
-
gr.Audio(label="Upload
|
20 |
],
|
21 |
outputs=[gr.TextArea(label="Output Text"),],
|
22 |
title="OpenAI Whisper Base",
|
|
|
16 |
choices=["indonesian","english"],
|
17 |
value="indonesian"),
|
18 |
gr.Audio(label="Speak", source="microphone", type="numpy"),
|
19 |
+
gr.Audio(label="Upload Audio", source="upload", type="numpy"),
|
20 |
],
|
21 |
outputs=[gr.TextArea(label="Output Text"),],
|
22 |
title="OpenAI Whisper Base",
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (147 Bytes). View file
|
|
src/__pycache__/infer.cpython-310.pyc
ADDED
Binary file (1.14 kB). View file
|
|
src/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (940 Bytes). View file
|
|
src/infer.py
CHANGED
@@ -3,7 +3,7 @@ from typing import *
|
|
3 |
from src import utils
|
4 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
5 |
|
6 |
-
model_name: str = f"openai/whisper-
|
7 |
processor: Any = WhisperProcessor.from_pretrained(model_name)
|
8 |
model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)
|
9 |
|
|
|
3 |
from src import utils
|
4 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
5 |
|
6 |
+
# Hugging Face Hub identifier of the Whisper checkpoint to load.
# Plain string literal: the original f-string had no placeholders (ruff F541).
model_name: str = "openai/whisper-small"
# Processor (feature extractor + tokenizer) and model are loaded once at
# import time and shared by all inference calls.
processor: Any = WhisperProcessor.from_pretrained(model_name)
model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)
|
9 |
|
src/utils.py
CHANGED
@@ -7,16 +7,18 @@ sample_rate: int = 16000
|
|
7 |
float_factor: float = 32678.0
|
8 |
|
9 |
def preprocess_audio(sampling_rate, waveform):
|
10 |
-
waveform = waveform / float_factor
|
11 |
|
12 |
-
if len(waveform) > 1:
|
13 |
waveform = librosa.to_mono(waveform.T)
|
14 |
|
15 |
if sampling_rate != sample_rate:
|
16 |
waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)
|
17 |
|
18 |
-
|
19 |
-
waveform =
|
|
|
|
|
20 |
return waveform
|
21 |
|
22 |
|
|
|
7 |
# Full-scale magnitude of a signed 16-bit PCM sample (2**15), used to
# normalize int16 audio into [-1.0, 1.0).
# NOTE: was 32678.0 — a digit-transposition typo of 32768; the wrong
# divisor left samples slightly over-scaled (|x| could exceed 1.0).
float_factor: float = 32768.0
|
8 |
|
9 |
def preprocess_audio(sampling_rate, waveform):
    """Normalize, downmix, resample, and truncate raw audio for Whisper.

    Args:
        sampling_rate: Sample rate in Hz of the incoming waveform.
        waveform: Raw PCM samples as a NumPy array, assumed int16-scaled
            (they are divided by ``float_factor``). May be multi-channel
            with shape (samples, channels) — TODO confirm channel layout
            against the Gradio caller.

    Returns:
        A ``torch.Tensor`` of mono float samples at ``sample_rate`` Hz,
        truncated to at most 30 seconds.
    """
    # Scale int16-range samples into [-1.0, 1.0).
    # (The bogus `waveform: float` annotations were removed: the value is
    # an ndarray, then a Tensor — never a float.)
    waveform = waveform / float_factor

    # Downmix multi-channel audio to mono; a 1-D array is already mono.
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # Resample only when the input rate differs from the model's rate.
    if sampling_rate != sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)

    # limit to 30 seconds (Whisper processes 30-second chunks)
    waveform = waveform[:sample_rate * 30]

    return torch.tensor(waveform)
|
23 |
|
24 |
|