nunenuh committed
Commit 70c05b8
Parent: 8d023c9

feat: fix with original whisper

app.py CHANGED
@@ -9,19 +9,27 @@ audio_examples = [
     [None, "assets/audio/female-english.wav", None],
 ]
 
+TITLE = "OpenAI Whisper"
+DESCRIPTION = utils.parsing_text("assets/descriptions.md")
+ARTICLE = utils.parsing_text("assets/articles.md")
+
 demo = gr.Interface(
     fn=infer.predict,
     inputs=[
+        gr.Dropdown(
+            label="Model",
+            choices=["tiny","small","base","medium","large","large-v2"],
+            value="base"),
         gr.Radio(label="Language",
                  choices=["indonesian","english"],
                  value="indonesian"),
-        gr.Audio(label="Speak", source="microphone", type="numpy"),
-        gr.Audio(label="Upload Audio", source="upload", type="numpy"),
+        gr.Audio(label="Speak", source="microphone", type="filepath"),
+        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
     ],
     outputs=[gr.TextArea(label="Output Text"),],
-    title="OpenAI Whisper Base",
-    description=utils.parsing_text("assets/descriptions.md"),
-    article=utils.parsing_text("assets/articles.md"),
+    title=TITLE,
+    description=DESCRIPTION,
+    article=ARTICLE,
     # examples=audio_examples,
 )
 
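Note: with type="filepath" the two gr.Audio components now hand infer.predict a path on disk instead of a (sampling_rate, waveform) tuple, and the new gr.Dropdown value is passed first. A minimal sketch of calling the updated function directly, reusing the example file already listed in audio_examples (the call itself is illustrative, not part of this commit):

    from src import infer

    # Gradio passes the inputs positionally:
    # model name, language, microphone path, uploaded-file path
    text = infer.predict("base", "indonesian", None, "assets/audio/female-english.wav")
    print(text)
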
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+audio,state,output 0,state,flag,username,timestamp
+,,,,,,2023-08-11 19:42:07.779875

requirements.txt CHANGED
@@ -1,3 +1,8 @@
-torch
+git+https://github.com/huggingface/transformers
+git+https://github.com/openai/whisper.git
 transformers
-librosa
+ffmpeg-python==0.2.0
+gradio==3.38.0
+torchaudio
+altair
+json5
src/__pycache__/infer.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/infer.cpython-310.pyc and b/src/__pycache__/infer.cpython-310.pyc differ
 
src/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/utils.cpython-310.pyc and b/src/__pycache__/utils.cpython-310.pyc differ
 
src/infer.py CHANGED
@@ -2,27 +2,18 @@
 from typing import *
 from src import utils
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import whisper
 
-model_name: str = f"openai/whisper-medium"
-processor: Any = WhisperProcessor.from_pretrained(model_name)
-model: Any = WhisperForConditionalGeneration.from_pretrained(model_name)
-
-sample_rate: int = 16000
-float_factor: float = 32678.0
-
-
-def predict(language, mic_audio=None, audio=None):
+def predict(model_name, language, mic_audio=None, audio=None):
     if mic_audio is not None:
-        sampling_rate, waveform = mic_audio
+        voice = mic_audio
     elif audio is not None:
-        sampling_rate, waveform = audio
+        voice = audio
     else:
         return "(please provide audio)"
 
-    forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
+    voice = utils.preprocess_audio(voice)
 
-    waveform = utils.preprocess_audio(sampling_rate, waveform)
-    inputs = processor(audio=waveform, sampling_rate=sample_rate, return_tensors="pt")
-    predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
+    model = whisper.load_model(model_name)
+    result = model.transcribe(voice, language=language)
+    return result["text"]
 
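Note: whisper.load_model(model_name) now runs inside predict, so every request re-loads the selected checkpoint. A possible follow-up (not part of this commit, assuming the same openai-whisper API) is to cache loaded models and have predict call the cached helper instead:

    import functools
    import whisper

    @functools.lru_cache(maxsize=2)
    def get_model(model_name: str):
        # load each checkpoint once and reuse it across predictions
        return whisper.load_model(model_name)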
 
src/utils.py CHANGED
@@ -2,25 +2,18 @@
 import librosa
 import torch
 from pathlib import Path
+import whisper
 
 sample_rate: int = 16000
 float_factor: float = 32678.0
 
-def preprocess_audio(sampling_rate, waveform):
-    waveform: float = waveform / float_factor
-
-    if len(waveform.shape) > 1:
-        waveform = librosa.to_mono(waveform.T)
-
-    if sampling_rate != sample_rate:
-        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=sample_rate)
-
-    # limit to 30 seconds
-    waveform: float = waveform[:sample_rate * 30]
-
-    waveform: float = torch.tensor(waveform)
-    return waveform
 
+def preprocess_audio(filepath: str):
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(filepath)
+    audio = whisper.pad_or_trim(audio)
+
+    return audio
 
 def parsing_text(filepath: str):
     path = Path(filepath)
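
Note: assuming the openai-whisper package, whisper.load_audio decodes the file with ffmpeg into mono float32 samples at 16 kHz, and whisper.pad_or_trim pads or truncates that array to the 30-second window Whisper expects, which is what the rewritten helper returns. A small usage sketch (the file path is only illustrative):

    import whisper

    audio = whisper.load_audio("assets/audio/female-english.wav")  # 16 kHz mono float32
    audio = whisper.pad_or_trim(audio)                             # exactly 30 s of samples
    assert audio.shape[0] == 16000 * 30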