"""Gradio demo that classifies the content domain of an audio file.

The app loads a fine-tuned distil-wav2vec2 audio-classification pipeline and
exposes it through a ``gr.Interface`` that reports the predicted labels and
the time the prediction took.
"""

from timeit import default_timer as timer

import gradio as gr
import torch
from transformers import pipeline

username = "fmagot01"  # Complete your username
model_id = f"{username}/distil-wav2vec2-finetuned-giga-speech"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline("audio-classification", model=model_id, device=device)

# Maximum length passed to the feature extractor when truncating the input.
# NOTE(review): presumably 30 seconds of audio at a 16 kHz sampling rate --
# confirm against the model's feature extractor configuration.
MAX_INPUT_LENGTH = 16_000 * 30


def predict_trunc(filepath):
    """Classify an audio file, truncating the input before the forward pass.

    The pipeline stages are invoked one by one (instead of calling ``pipe``
    directly) so that the extracted features can be truncated to
    ``MAX_INPUT_LENGTH`` between preprocessing and inference.

    Args:
        filepath: Path of the audio file to classify.

    Returns:
        Whatever ``pipe.postprocess`` produces; ``classify_audio`` consumes it
        as an iterable of dicts with ``"label"`` and ``"score"`` keys.
    """
    preprocessed = pipe.preprocess(filepath)
    truncated = pipe.feature_extractor.pad(
        preprocessed,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    )
    model_outputs = pipe.forward(truncated)
    return pipe.postprocess(model_outputs)


def classify_audio(filepath):
    """Predict the labels of an audio file and time the prediction.

    Converts the pipeline output from a list of records such as
    ``[{'score': 0.83, 'label': 'Gaming'}, {'score': 0.12, 'label': 'Audiobook'}]``
    into the mapping ``{"Gaming": 0.83, "Audiobook": 0.12}``.

    Args:
        filepath: Path of the audio file to classify, as handed over by the
            ``gr.Audio(type="filepath")`` input component.

    Returns:
        A ``(scores, pred_time)`` tuple: ``scores`` maps each label to its
        score and ``pred_time`` is the prediction duration in seconds,
        rounded to 5 decimal places.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if filepath is None:
        # Fail with a readable message instead of an opaque pipeline error
        # when the form is submitted without any audio.
        raise gr.Error("Please provide an audio file to classify.")

    start_time = timer()
    preds = predict_trunc(filepath)
    # Stop the clock right after inference so that only the prediction itself
    # is measured, not the reshaping of its result.
    pred_time = round(timer() - start_time, 5)

    outputs = {p["label"]: p["score"] for p in preds}
    return outputs, pred_time


title = "Classifier of Audio Files"
description = (
    " This demo shows the application of the "
    "[distil-wav2vec2](https://huggingface.co/OthmaneJ/distil-wav2vec2) "
    "model fine tuned to the "
    "[gigaspeech](https://huggingface.co/datasets/speechcolab/gigaspeech) "
    "dataset. It will classify the audio provided to the domain of the "
    "content in it. "
)

# Each example is a one-element list: one value per input component.
example_files = ["audiobook.mp3"]
filenames = [[f"./{name}"] for name in example_files]

demo = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        # Top-level ``gr.Label`` replaces the legacy ``gr.outputs.Label``
        # and matches the top-level ``gr.Audio`` / ``gr.Number`` used here.
        gr.Label(label="Predictions"),
        gr.Number(label="Prediction time (s)"),
    ],
    title=title,
    description=description,
    examples=filenames,
)

demo.launch()