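# Gradio demo: tag an uploaded audio clip with an EfficientAT MobileNetV3
# model pretrained on AudioSet and return the top predicted labels.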
import gradio as gr
import torch
import numpy as np
import librosa
from efficientat.models.MobileNetV3 import get_model as get_mobilenet
from efficientat.models.preprocess import AugmentMelSTFT
from efficientat.helpers.utils import NAME_TO_WIDTH, labels
from torch import autocast
from contextlib import nullcontext
MODEL_NAME = "mn40_as"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained MobileNetV3 audio-tagging model and put it in eval mode.
model = get_mobilenet(width_mult=NAME_TO_WIDTH(MODEL_NAME), pretrained_name=MODEL_NAME)
model.to(device)
model.eval()
def audio_tag(
    audio_path,
    sample_rate=32000,
    window_size=800,
    hop_size=320,
    n_mels=128,
    cuda=True,
):
    waveform, _ = librosa.load(audio_path, sr=sample_rate, mono=True)

    # Build the mel-spectrogram frontend used at training time.
    mel = AugmentMelSTFT(n_mels=n_mels, sr=sample_rate, win_length=window_size, hopsize=hop_size)
    mel.to(device)
    mel.eval()

    waveform = torch.from_numpy(waveform[None, :]).to(device)

    # The models are trained in half precision (torch.float16); run on CUDA with
    # torch.float16 to get the best performance. Running on CPU with
    # torch.float32 gives similar results, while torch.bfloat16 is worse.
    with torch.no_grad(), (autocast(device_type=device.type) if cuda and torch.cuda.is_available() else nullcontext()):
        spec = mel(waveform)
        preds, _ = model(spec.unsqueeze(0))
    preds = torch.sigmoid(preds.float()).squeeze().cpu().numpy()

    sorted_indexes = np.argsort(preds)[::-1]

    # Collect the top-10 tags with their probabilities.
    output = {}
    for k in range(10):
        output[labels[sorted_indexes[k]]] = preds[sorted_indexes[k]]
    return "\n".join(f"{label}: {prob:.3f}" for label, prob in output.items())
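# Example (the file name is a hypothetical placeholder):
#   audio_tag("dog_bark.wav") -> newline-separated top tags with probabilities.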
def formatted_message(audio_length, audio_class, userText):
    prefix = '''You are going to act as a magical tool that allows for humans to communicate with non-human entities like
rocks, crackling fire, trees, animals, and the wind. In order to do this, we're going to provide you a data string which
represents the audio input, the source of the audio, and the human's text input for the conversation.
The goal is for you to embody the source of the audio, and use the length and variance in the signal data to produce
plausible responses to the human's input. Remember to embody the source data. When we start the conversation,
you should generate a "personality profile" for the source and utilize that personality profile in your responses.
Let's begin:'''
    suffix = f'''Source: {audio_class}
Length of Audio in Seconds: {audio_length}
Human Input: {userText}
{audio_class} Response:'''
    template = prefix + suffix
    response = call_api(template)
    return response
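# A minimal sketch (not wired into the UI) of how formatted_message could be
# called; the file name and the "Dog" class are hypothetical examples, and the
# clip length is derived from the loaded waveform:
#
#   waveform, sr = librosa.load("dog_bark.wav", sr=None, mono=True)
#   prompt = formatted_message(len(waveform) / sr, "Dog", "Hello there!")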
def call_api(message):
    # Stubbed out for now: the commented block below shows the intended request.
    """
    response = requests.get(f'{api}?q={message}')
    if response.status_code == 200:
        return str(response.text).split('\n', 2)[2]
    else:
        return "Sorry, I'm quite busy right now, but please try again later :)"
    """
    return message
demo = gr.Interface(
    fn=audio_tag,
    inputs=gr.Audio(source="upload", type="filepath", label="Your audio"),
    outputs=gr.Textbox(),
)
demo.launch(debug=True)