Niwood committed
Commit 4fdf7c7 · 1 Parent(s): 34b6d05

Files changed (2):
  1. app.py +63 -4
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,8 +1,12 @@
 import gradio as gr
 import time
-import whisper
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoFeatureExtractor
 from transformers import pipeline
+import ffmpeg
+import numpy as np
+import torch
+import torch.nn.functional as F
+
 
 import os
 
@@ -25,13 +29,68 @@ pipe: pipeline = pipeline(
     feature_extractor=feature_extractor
 )
 
+SAMPLE_RATE = 16000
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+
+    sr: int
+        The sample rate to resample the audio if necessary
+
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+
+    return array
+
 
 def inference(audio):
 
     time.sleep(0.1)
     # load audio and pad/trim it to fit 30 seconds
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
+    audio = load_audio(audio)
+    audio = pad_or_trim(audio)
 
     # # make log-Mel spectrogram and move to the same device as the model
     # mel = whisper.log_mel_spectrogram(audio).to(base_model.device)
@@ -46,7 +105,7 @@ def inference(audio):
 
 
 gr.Interface(
-    title = 'Retrained whisper_sv_SE_small 😎',
+    title = 'Robins finetuned whisper_sv_SE_small 😎',
     fn=inference,
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath")
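
The two helpers added here appear to be local copies of whisper's load_audio and pad_or_trim utilities, which is what lets the commit drop the whisper import: decoding now goes through the ffmpeg CLI (via ffmpeg-python) and the waveform is padded or trimmed to 30 seconds with NumPy or torch. Below is a minimal standalone sketch of that decode-and-pad path, assuming the ffmpeg binary is on PATH and using a placeholder "sample.wav"; neither the file name nor this script is part of the commit.

# Standalone sketch of the decode-and-pad path added above; "sample.wav" is a placeholder
# and both the ffmpeg CLI and the ffmpeg-python package must be installed for it to run.
import ffmpeg
import numpy as np

SAMPLE_RATE = 16000
N_SAMPLES = 30 * SAMPLE_RATE  # 480000 samples = 30 seconds of audio

# Decode to 16-bit mono PCM at 16 kHz, as load_audio does.
out, _ = (
    ffmpeg.input("sample.wav", threads=0)
    .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
    .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
waveform = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0

# Equivalent of pad_or_trim for the common 1-D NumPy case: trim, then zero-pad to 30 s.
waveform = waveform[:N_SAMPLES]
waveform = np.pad(waveform, (0, N_SAMPLES - waveform.shape[0]))

print(waveform.dtype, waveform.shape)  # expected: float32 (480000,)
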
requirements.txt CHANGED
@@ -12,11 +12,13 @@ contourpy==1.0.6
 cryptography==38.0.4
 cycler==0.11.0
 fastapi==0.88.0
+ffmpeg-python==0.2.0
 ffmpy==0.3.0
 filelock==3.8.0
 fonttools==4.38.0
 frozenlist==1.3.3
 fsspec==2022.11.0
+future==0.18.2
 gradio==3.12.0
 h11==0.12.0
 httpcore==0.15.0
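
Both new pins support the audio path: ffmpeg-python is the wrapper used by load_audio, and future is presumably pulled in as its dependency. The wrapper only builds and runs ffmpeg command lines, so the ffmpeg binary itself still has to be installed separately; pip does not provide it. A small optional sanity check for the runtime environment, not part of the commit:

# Hedged environment check: verify the ffmpeg CLI that ffmpeg-python shells out to is available.
import shutil
import subprocess

if shutil.which("ffmpeg") is None:
    raise SystemExit("ffmpeg CLI not found on PATH; install it with the OS package manager "
                     "(on Hugging Face Spaces, e.g. via a packages.txt entry).")

# Print the first line of `ffmpeg -version` as a quick confirmation.
print(subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True).stdout.splitlines()[0])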