feat: add infer, utils and change app.py for gradio
- app.py +22 -15
- assets/articles.md +1 -0
- assets/audio/readme.md +6 -0
- assets/descriptions.md +1 -0
- src/__init__.py +0 -0
- src/infer.py +28 -0
- src/utils.py +29 -0
- tests/__init__.py +0 -0
app.py
CHANGED
@@ -1,21 +1,28 @@
-from transformers import pipeline
+from src import infer, utils
 import gradio as gr
 
-p = pipeline("automatic-speech-recognition")
 
+audio_examples = [
+    [None, "assets/audio/male-indonesian.wav", None],
+    [None, "assets/audio/female-indonesian.wav", None],
+    [None, "assets/audio/male-english.wav", None],
+    [None, "assets/audio/female-english.wav", None],
+]
 
-gr.Interface(
-    fn=
+demo = gr.Interface(
+    fn=infer.predict,
     inputs=[
-        gr.
+        gr.Radio(label="Language",
+                 choices=["indonesian", "english"],
+                 value="indonesian"),
+        gr.Audio(label="Speak", source="microphone", type="numpy"),
+        gr.Audio(label="Upload audio", source="upload", type="numpy"),
     ],
-    outputs=[
+    outputs=[gr.TextArea(label="Output Text")],
+    title="OpenAI Whisper Base",
+    description=utils.parsing_text("assets/descriptions.md"),
+    article=utils.parsing_text("assets/articles.md"),
+    examples=audio_examples,
+)
+
+demo.launch()
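For context on the wiring above: with `type="numpy"`, each `gr.Audio` component hands `fn` a `(sampling_rate, int16 ndarray)` tuple, and each row of `audio_examples` is spread positionally across the three inputs, so `[None, path, None]` pre-fills only the microphone slot. A minimal sketch of the call Gradio effectively makes for the first example row (hypothetical, for illustration; assumes scipy is installed and the bundled wavs are 16-bit PCM):

import scipy.io.wavfile as wavfile

from src import infer

# The row [None, "assets/audio/male-indonesian.wav", None] maps
# positionally onto (language, mic_audio, audio). Gradio decodes the wav
# into a (sampling_rate, int16 array) tuple before calling fn; scipy's
# wav reader produces a value of the same shape.
sr, waveform = wavfile.read("assets/audio/male-indonesian.wav")
print(infer.predict("indonesian", mic_audio=(sr, waveform)))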
assets/articles.md
ADDED
@@ -0,0 +1 @@
+articles file
assets/audio/readme.md
ADDED
@@ -0,0 +1,6 @@
+---
+noteId: "b807e330385e11eeb3554fca5e7f7a03"
+tags: []
+
+---
+
assets/descriptions.md
ADDED
@@ -0,0 +1 @@
+description files
src/__init__.py
ADDED
File without changes
src/infer.py
ADDED
@@ -0,0 +1,28 @@
+from typing import Any
+
+from src import utils
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+model_name: str = "openai/whisper-base"
+processor: Any = WhisperProcessor.from_pretrained(model_name)
+model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+sample_rate: int = 16000
+float_factor: float = 32768.0  # int16 full scale (also defined in src/utils.py)
+
+
+def predict(language, mic_audio=None, audio=None):
+    # Prefer the microphone recording; fall back to the uploaded file.
+    if mic_audio is not None:
+        sampling_rate, waveform = mic_audio
+    elif audio is not None:
+        sampling_rate, waveform = audio
+    else:
+        return "(please provide audio)"
+
+    # Force the chosen language and the transcription task on the decoder.
+    forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
+
+    waveform = utils.preprocess_audio(sampling_rate, waveform)
+    inputs = processor(audio=waveform, sampling_rate=sample_rate, return_tensors="pt")
+    predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    return transcription[0]
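The language pinning above works through Whisper's decoder prompt: `get_decoder_prompt_ids` returns (decoder position, token id) pairs that `generate` forces at the start of decoding. A quick way to inspect it (a sketch; exact ids depend on the tokenizer):

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# Prints (position, token id) pairs that generate() will force, e.g. the
# <|id|> language token followed by the <|transcribe|> task token.
print(processor.get_decoder_prompt_ids(language="indonesian", task="transcribe"))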
src/utils.py
ADDED
@@ -0,0 +1,29 @@
+import librosa
+import torch
+from pathlib import Path
+
+sample_rate: int = 16000
+float_factor: float = 32768.0  # int16 full scale
+
+
+def preprocess_audio(sampling_rate, waveform):
+    # Gradio's type="numpy" delivers int16 PCM; scale it to [-1.0, 1.0].
+    waveform = waveform / float_factor
+
+    # Down-mix multi-channel (e.g. stereo) input to mono.
+    if waveform.ndim > 1:
+        waveform = librosa.to_mono(waveform.T)
+
+    # Whisper expects 16 kHz input.
+    if sampling_rate != sample_rate:
+        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)
+
+    # Whisper handles at most 30 seconds of audio per forward pass.
+    waveform = waveform[:sample_rate * 30]
+    waveform = torch.tensor(waveform)
+    return waveform
+
+
+def parsing_text(filepath: str):
+    path = Path(filepath)
+    if path.suffix.lower() not in ('.txt', '.md'):
+        raise ValueError("Invalid file type. Only '.txt' and '.md' files are supported.")
+
+    return path.read_text()
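A quick sanity check for `preprocess_audio` (a sketch, not part of the commit; assumes numpy, librosa, and torch are installed):

import numpy as np

from src import utils

# One second of stereo int16 silence at 44.1 kHz, shaped the way Gradio's
# type="numpy" delivers it: (samples, channels).
stereo = np.zeros((44100, 2), dtype=np.int16)
out = utils.preprocess_audio(44100, stereo)
print(out.shape)  # torch.Size([16000]): mono, resampled to 16 kHz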
tests/__init__.py
ADDED
File without changes