baaastien commited on
Commit
bd81cd1
1 Parent(s): e9103a2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ import os
4
+ import random2
5
+ from spleeter.separator import Separator
6
+ from transformers import pipeline
7
+
8
# Module-level model setup (runs once at import / app start-up).

# Spleeter separator with the 2-stems model (vocals + accompaniment).
# The 16 kHz variant is used because the downstream Wav2Vec2 ASR model
# expects 16 kHz input audio.
separator = Separator('spleeter:2stems-16kHz')

# Speech-to-text pipeline using a Wav2Vec2 model fine-tuned for English:
# https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
pipe = pipeline("automatic-speech-recognition", "jonatasgrosman/wav2vec2-large-xlsr-53-english")
14
+
15
# Gradio handler: split an audio file into stems, transcribe the vocals,
# and return the stem file paths plus the transcript text.
def extract_stems(audio):
    """Separate *audio* into vocals/accompaniment and transcribe the vocals.

    Parameters
    ----------
    audio : str
        Filepath of the uploaded audio file (``gr.Audio(type="filepath")``).

    Returns
    -------
    tuple[str, str, str]
        Paths of the vocals stem and the accompaniment stem, and the
        English transcript of the vocals.
    """
    # Unique sub-folder per request so concurrent calls don't overwrite
    # each other's output files.
    foldername = str(random2.randrange(100000000))

    # Split the input. synchronous=True blocks until separation finishes,
    # so the stem files are guaranteed to exist before we read them below.
    separator.separate_to_file(
        audio,
        "output/",
        filename_format=foldername + "/{instrument}.wav",
        synchronous=True,
    )

    # Spleeter's 2-stems model writes exactly these two files.
    vocals = os.path.join("output", foldername, "vocals.wav")
    accompaniment = os.path.join("output", foldername, "accompaniment.wav")

    # Transcribe the vocals stem; chunk_length_s=10 enables chunked
    # inference so long recordings fit in memory.
    result = pipe(vocals, chunk_length_s=10, decoder=None)

    # The ASR pipeline returns a dict like {"text": "..."}. Return the
    # text itself so the Textbox shows the transcript, not the dict repr
    # (the original returned the raw dict).
    return vocals, accompaniment, result["text"]
32
+
33
# Build and launch the Gradio interface.
# Input: one audio file. Outputs: the two separated stems and a transcript.

title = "Demo: Deezer Spleeter + english Automatic Speech Recognition"
description = "This demo is a basic interface for <a href='https://research.deezer.com/projects/spleeter.html' target='_blank'>Deezer Spleeter</a>. It uses the Spleeter library for separate audio file in two stems : accompaniments and vocals. Once splitted, it performs ASR (Automatic Speech Recognition) based on a Wav2vec2 english model."

# sorted() makes the example ordering deterministic — os.listdir returns
# entries in arbitrary order.
examples = [["examples/" + mp3] for mp3 in sorted(os.listdir("examples/"))]

demo = gr.Interface(
    fn=extract_stems,
    inputs=gr.Audio(source="upload", type="filepath"),
    # NOTE: `source` is an input-only parameter, so it is omitted from the
    # output components (the original passed source="upload" to outputs,
    # where it has no effect).
    outputs=[
        gr.Audio(label="Vocals stem", type="filepath"),
        gr.Audio(label="Accompaniment stem", type="filepath"),
        gr.Textbox(label="Automatic Speech Recognition (English)"),
    ],
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
)

demo.launch()