ahmedghani committed
Commit
39e4af1
1 Parent(s): c65d563

added clean whisper asr implementation

Files changed (2)
  1. app.py +28 -24
  2. requirements.txt +1 -5
app.py CHANGED
@@ -4,7 +4,8 @@ from scipy.io.wavfile import write
 import gradio as gr
 import os
 from transformers import AutoProcessor, pipeline
-from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
+# from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
+import whisper
 from glob import glob
 load_model()
 
@@ -12,27 +13,29 @@ BASE_PATH = os.path.dirname(os.path.abspath(__file__))
 os.makedirs('input', exist_ok=True)
 os.makedirs('separated', exist_ok=True)
 
-print("Loading ASR model...")
-processor = AutoProcessor.from_pretrained("openai/whisper-small")
-if not os.path.exists("whisper_checkpoint"):
-    model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", from_transformers=True)
-    speech_recognition_pipeline = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        feature_extractor=processor.feature_extractor,
-        tokenizer=processor.tokenizer,
-    )
-    os.makedirs('whisper_checkpoint', exist_ok=True)
-    model.save_pretrained("whisper_checkpoint")
-else:
-    model = ORTModelForSpeechSeq2Seq.from_pretrained("whisper_checkpoint", from_transformers=False)
-    speech_recognition_pipeline = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        feature_extractor=processor.feature_extractor,
-        tokenizer=processor.tokenizer,
-    )
-print("Whisper ASR model loaded.")
+# print("Loading ASR model...")
+# processor = AutoProcessor.from_pretrained("openai/whisper-small")
+# if not os.path.exists("whisper_checkpoint"):
+#     model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", from_transformers=True)
+#     speech_recognition_pipeline = pipeline(
+#         "automatic-speech-recognition",
+#         model=model,
+#         feature_extractor=processor.feature_extractor,
+#         tokenizer=processor.tokenizer,
+#     )
+#     os.makedirs('whisper_checkpoint', exist_ok=True)
+#     model.save_pretrained("whisper_checkpoint")
+# else:
+#     model = ORTModelForSpeechSeq2Seq.from_pretrained("whisper_checkpoint", from_transformers=False)
+#     speech_recognition_pipeline = pipeline(
+#         "automatic-speech-recognition",
+#         model=model,
+#         feature_extractor=processor.feature_extractor,
+#         tokenizer=processor.tokenizer,
+#     )
+# print("Whisper ASR model loaded.")
+
+model = whisper.load_model("base")
 
 def separator(audio, rec_audio, example):
     outputs= {}
@@ -51,8 +54,9 @@ def separator(audio, rec_audio, example):
     separated_files = [f for f in separated_files if "original.wav" not in f]
     outputs['transcripts'] = []
     for file in sorted(separated_files):
-        separated_audio = sio.wavfile.read(file)
-        outputs['transcripts'].append(speech_recognition_pipeline(separated_audio[1])['text'])
+        # separated_audio = sio.wavfile.read(file)
+        # outputs['transcripts'].append(speech_recognition_pipeline(separated_audio[1])['text'])
+        outputs['transcripts'].append(model.transcribe(file)["text"])
     return sorted(separated_files) + outputs['transcripts']
 
 def set_example_audio(example: list) -> dict:
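
For reference, the new transcription path can be exercised outside the Gradio app. Below is a minimal sketch, assuming the openai-whisper package (imported as whisper, with ffmpeg on PATH) and a separated/ directory of WAV stems as this Space produces; transcribe_separated is a hypothetical helper name, not part of app.py.

# Standalone sketch of the Whisper flow added above.
# Assumes the openai-whisper package and ffmpeg on PATH.
import os
from glob import glob

import whisper

# Load the "base" checkpoint once at startup, mirroring app.py.
model = whisper.load_model("base")

def transcribe_separated(separated_dir="separated"):
    # Hypothetical helper: transcribe every separated stem,
    # skipping the original mix, as separator() does in app.py.
    files = sorted(glob(os.path.join(separated_dir, "*.wav")))
    files = [f for f in files if "original.wav" not in f]
    # model.transcribe() accepts an audio file path and returns a
    # dict whose "text" key holds the full transcript.
    return [model.transcribe(f)["text"] for f in files]

if __name__ == "__main__":
    for text in transcribe_separated():
        print(text)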
requirements.txt CHANGED
@@ -6,13 +6,9 @@ pystoi==0.3.3
 librosa==0.7.1
 numba==0.48
 numpy
-flask
-flask-cors
-uvicorn[standard]
 asgiref
 gradio
-transformers==4.24.0
 torch
 torchvision
 torchaudio
-optimum[onnxruntime]==1.5.0
+openai-whisper
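
The one added dependency is OpenAI's Whisper, which is published on PyPI as openai-whisper (the bare name whisper on PyPI belongs to an unrelated project) yet is imported as whisper. A quick smoke test under that assumption:

# Smoke test that the dependency resolves to OpenAI Whisper
# (installed with: pip install openai-whisper).
import whisper

print(whisper.available_models())   # expect 'tiny', 'base', 'small', ...
model = whisper.load_model("base")  # downloads the checkpoint on first use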