import gradio as gr
import torch
import json
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# The speech-separation preprocessing step below was omitted for deployment reasons
# (the SepFormer model is too RAM-hungry for the target hardware):
# from speechbrain.inference.separation import SepformerSeparation as separator
# import torchaudio
#
# model = separator.from_hparams(
#     source="speechbrain/sepformer-whamr16k",
#     savedir="pretrained_models/sepformer-whamr16k",
# )
#
# def separate_speech(path):
#     est_sources = model.separate_file(path=path)
#     output_path = "output.wav"
#     torchaudio.save(output_path, est_sources[:, :, 0].detach().cpu(), 16000)
#     return output_path

# Run on GPU with half precision when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-tiny"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline with chunked long-form decoding and segment-level timestamps.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe_speech(filepath):
    """Transcribe an audio file and return the timestamped segments as a JSON string."""
    result = pipe(filepath)["chunks"]
    # Convert the (start, end) timestamp tuples to plain lists before serializing.
    for item in result:
        item["timestamp"] = list(item["timestamp"])
    return json.dumps(result)


demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs="text",
)

with demo:
    gr.TabbedInterface(
        [file_transcribe],
        ["Song Lyrics"],
    )

demo.launch(debug=True)
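
# For reference, transcribe_speech returns a JSON string of Whisper's timestamped
# chunks. The segment below is illustrative only; actual text and timestamps depend
# on the uploaded audio:
#
#   [
#       {"timestamp": [0.0, 5.2], "text": " first line of the lyrics"},
#       {"timestamp": [5.2, 9.8], "text": " second line of the lyrics"}
#   ]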