Niwood committed
Commit 4fdf7c7 · 1 Parent(s): 34b6d05

Files changed (2):
  1. app.py +63 -4
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,8 +1,12 @@
 import gradio as gr
 import time
-import whisper
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoFeatureExtractor
 from transformers import pipeline
+import ffmpeg
+import numpy as np
+import torch
+import torch.nn.functional as F
+
 
 import os
 
@@ -25,13 +29,68 @@ pipe: pipeline = pipeline(
     feature_extractor=feature_extractor
 )
 
+SAMPLE_RATE = 16000
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+
+    sr: int
+        The sample rate to resample the audio if necessary
+
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+
+    return array
+
 
 def inference(audio):
 
     time.sleep(0.1)
     # load audio and pad/trim it to fit 30 seconds
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
+    audio = load_audio(audio)
+    audio = pad_or_trim(audio)
 
     # # make log-Mel spectrogram and move to the same device as the model
     # mel = whisper.log_mel_spectrogram(audio).to(base_model.device)
@@ -46,7 +105,7 @@ def inference(audio):
 
 
 gr.Interface(
-    title = 'Retrained whisper_sv_SE_small 😎',
+    title = 'Robins finetuned whisper_sv_SE_small 😎',
     fn=inference,
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath")
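
The two helpers added here appear to be local copies of whisper's load_audio and pad_or_trim utilities, which is what lets the commit drop the whisper import: decoding now goes through the ffmpeg CLI (via ffmpeg-python) and the waveform is padded or trimmed to 30 seconds with NumPy or torch. Below is a minimal standalone sketch of that decode-and-pad path, assuming the ffmpeg binary is on PATH and using a placeholder "sample.wav"; neither the file name nor this script is part of the commit.

# Standalone sketch of the decode-and-pad path added above; "sample.wav" is a placeholder
# and both the ffmpeg CLI and the ffmpeg-python package must be installed for it to run.
import ffmpeg
import numpy as np

SAMPLE_RATE = 16000
N_SAMPLES = 30 * SAMPLE_RATE  # 480000 samples = 30 seconds of audio

# Decode to 16-bit mono PCM at 16 kHz, as load_audio does.
out, _ = (
    ffmpeg.input("sample.wav", threads=0)
    .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
    .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
waveform = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0

# Equivalent of pad_or_trim for the common 1-D NumPy case: trim, then zero-pad to 30 s.
waveform = waveform[:N_SAMPLES]
waveform = np.pad(waveform, (0, N_SAMPLES - waveform.shape[0]))

print(waveform.dtype, waveform.shape)  # expected: float32 (480000,)
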
requirements.txt CHANGED
@@ -12,11 +12,13 @@ contourpy==1.0.6
 cryptography==38.0.4
 cycler==0.11.0
 fastapi==0.88.0
+ffmpeg-python==0.2.0
 ffmpy==0.3.0
 filelock==3.8.0
 fonttools==4.38.0
 frozenlist==1.3.3
 fsspec==2022.11.0
+future==0.18.2
 gradio==3.12.0
 h11==0.12.0
 httpcore==0.15.0
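
Both new pins support the audio path: ffmpeg-python is the wrapper used by load_audio, and future is presumably pulled in as its dependency. The wrapper only builds and runs ffmpeg command lines, so the ffmpeg binary itself still has to be installed separately; pip does not provide it. A small optional sanity check for the runtime environment, not part of the commit:

# Hedged environment check: verify the ffmpeg CLI that ffmpeg-python shells out to is available.
import shutil
import subprocess

if shutil.which("ffmpeg") is None:
    raise SystemExit("ffmpeg CLI not found on PATH; install it with the OS package manager "
                     "(on Hugging Face Spaces, e.g. via a packages.txt entry).")

# Print the first line of `ffmpeg -version` as a quick confirmation.
print(subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True).stdout.splitlines()[0])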