import gradio as gr
import soundfile as sf
import numpy as np
import torch, torchaudio
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset, Audio
import matplotlib.pyplot as plt

MODEL_NAME = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Icelandic wav2vec2 CTC model and its processor once at startup
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# Earlier "hello world" Interface experiments, kept for reference
#def greet(name):
#    return "Hello " + name + "!!"
#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
#iface.launch()

#api = gr.Interface.load("models/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h")
#iface.launch()

# Samrómur test data, not loaded for now (see the sketch at the bottom of this file)
#ds = load_dataset("language-and-voice-lab/samromur_asr", split='train', streaming=True)
#ds = load_dataset("language-and-voice-lab/samromur_asr", split='test')
#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

def show_ex(exnum):
    # Placeholder: would return ds['audio_id'][exnum] once the dataset is loaded
    #return ds['audio_id'][exnum]
    return exnum

def recc(a_f):
    """Transcribe the audio file at path a_f with the Icelandic wav2vec2 model."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    # Mix stereo down to mono
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    # Resample to the 16 kHz rate the model was trained on
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)

    with torch.inference_mode():
        input_values = processor(
            wav, sampling_rate=16000, return_tensors="pt"
        ).input_values.to(device)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
    # Greedy CTC decoding to text
    return processor.batch_decode(pred_ids)[0]

bl = gr.Blocks()
with bl:
    text_input = gr.Textbox()
    text_output = gr.Textbox()
    audio_file = gr.Audio(type="filepath")
    text_button = gr.Button("Run")
    # Transcribe the uploaded/recorded audio when the button is clicked
    text_button.click(recc, inputs=audio_file, outputs=text_output)
    #text_button.click(show_ex, inputs=text_input, outputs=text_output)

bl.launch()

#https://mercury-docs.readthedocs.io/en/latest/deploy/hugging-face-spaces/
#https://huggingface.co/spaces/pplonski/deploy-mercury
#https://discuss.huggingface.co/t/deploy-interactive-jupyter-notebook-on-spaces-with-mercury/17000
#https://huggingface.co/docs/transformers/notebooks
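
# A minimal sketch (not wired into the Gradio UI) of how the commented-out
# Samrómur loading above could feed recc(): take one test example, write it to
# a temporary wav file, and transcribe it. It assumes the dataset exposes an
# "audio" column castable to 16 kHz, as in the commented lines above.
#
#import tempfile
#ds = load_dataset("language-and-voice-lab/samromur_asr", split="test")
#ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
#ex = ds[0]["audio"]
#with tempfile.NamedTemporaryFile(suffix=".wav") as f:
#    # Write the decoded example to disk so recc() can read it back with soundfile
#    sf.write(f.name, ex["array"], ex["sampling_rate"])
#    print(recc(f.name))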