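# NOTE: the lines below appear to be web file-viewer residue captured along
# with the source; they are kept as comments so the module parses as Python.
# 3loi's picture
# Update app.py
# 8dda170 verified
# raw history blame
# No virus
# 1.16 kB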
from transformers import pipeline
from transformers import AutoModelForAudioClassification
import gradio as gr
import librosa
import torch
import numpy as np
_MODEL_ID = "3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes"
_MODEL_CACHE = {}


def _load_model():
    """Return the emotion-recognition model, loading it only on first use."""
    if _MODEL_ID not in _MODEL_CACHE:
        # NOTE(security): trust_remote_code=True executes Python code shipped
        # in the model repository; keep the model id pinned to a trusted repo.
        _MODEL_CACHE[_MODEL_ID] = AutoModelForAudioClassification.from_pretrained(
            _MODEL_ID, trust_remote_code=True
        )
    return _MODEL_CACHE[_MODEL_ID]


def classify_audio(audio_file):
    """Run the speech-emotion model on one audio clip and return its output.

    Args:
        audio_file: ``(sample_rate, samples)`` pair as unpacked below, where
            ``samples`` is a NumPy array. Multi-channel input is assumed to
            be laid out as ``(samples, channels)`` -- TODO confirm against
            the Gradio version in use.

    Returns:
        ``str()`` of the model's prediction array.

    Raises:
        gr.Error: If no audio was supplied or the clip contains no samples.
    """
    if audio_file is None:
        raise gr.Error("Please upload or record an audio clip first.")

    model = _load_model()
    sr, raw_wav = audio_file
    print(sr, raw_wav.dtype, raw_wav.shape)

    y = np.asarray(raw_wav).astype(np.float32)
    if y.ndim > 1:
        # Mix down to mono so the waveform is 1-D and matches the mask shape.
        y = y.mean(axis=1)
    if y.size == 0:
        raise gr.Error("The audio clip is empty.")

    # The input arrives at its native rate; resample to the rate the model
    # declares (falling back to 16 kHz, the rate the earlier librosa-based
    # loading code used).
    target_sr = getattr(model.config, "sampling_rate", 16000)
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # Peak-normalise; skip for pure silence to avoid dividing by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Normalisation statistics are taken from the model configuration
    # (previously referenced as undefined names, which raised NameError).
    mean = model.config.mean
    std = model.config.std
    norm_wav = (y - mean) / (std + 0.000001)

    mask = torch.ones(1, len(norm_wav))
    wavs = torch.tensor(norm_wav).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():  # inference only; no autograd graph needed
        pred = model(wavs, mask).detach().numpy()
    print(str(pred))
    return str(pred)
def main():
    """Build the Gradio interface around ``classify_audio`` and launch it."""
    iface = gr.Interface(
        fn=classify_audio,
        inputs=gr.Audio(sources=["upload", "microphone"], label="Audio file"),
        outputs=gr.Text(),
        title="Speech Emotion Recognition App",
        # Single literal instead of a backslash continuation inside the
        # string, which dropped the space before "button".
        description="Upload an audio file and hit the 'Submit' button",
    )
    iface.launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()