# lyrics/app.py
import gradio as gr
import torch
import json
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# The audio-separation step below is disabled for deployment: the SepFormer
# model is too RAM-hungry for this Space.
# from speechbrain.inference.separation import SepformerSeparation as separator
# import torchaudio
#
# model = separator.from_hparams(source="speechbrain/sepformer-whamr16k", savedir='pretrained_models/sepformer-whamr16k')
#
# def separate_speech(path):
#     est_sources = model.separate_file(path=path)
#     output_path = "output.wav"
#     # Keep the first estimated source and write it out at 16 kHz.
#     torchaudio.save(output_path, est_sources[:, :, 0].detach().cpu(), 16000)
#     return output_path
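# If re-enabled on bigger hardware, the separator would slot in ahead of the
# transcriber, e.g. transcribe_speech(separate_speech(path)). This is a sketch
# only; it is not wired up here.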
# Use the GPU with float16 when available; otherwise fall back to CPU with float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# whisper-tiny is the smallest Whisper checkpoint, which keeps RAM use low.
model_id = "openai/whisper-tiny"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)
# Chunked ASR pipeline that returns segment-level timestamps.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
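# Note: chunk_length_s=15 transcribes long audio in overlapping 15-second
# windows; raising chunk_length_s or batch_size can speed things up on a GPU
# at the cost of memory.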
def transcribe_speech(filepath):
    # Transcribe the uploaded audio and return the timestamped segments as JSON.
    result = pipe(filepath)["chunks"]
    for item in result:
        # Timestamps come back as (start, end) tuples; cast them to lists.
        item["timestamp"] = list(item["timestamp"])
    return json.dumps(result)
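# Shape of the returned JSON (text and times are illustrative):
# [{"timestamp": [0.0, 15.0], "text": " first line of lyrics"}, ...]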
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs="text",
)
with demo:
    gr.TabbedInterface(
        [file_transcribe],
        ["Song Lyrics"],
    )

demo.launch(debug=True)