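"""Gradio demo: speech emotion recognition for Russian using the
Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition checkpoint.

Upload an audio clip; the app resamples it to 16 kHz, runs the classifier,
and returns per-emotion confidence scores as JSON.
"""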
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor
def resample(speech_array, sampling_rate):
    # Convert Gradio's int16 numpy waveform to a float tensor, mix stereo
    # down to mono, and resample to the model's expected rate (SR = 16 kHz).
    speech = torch.from_numpy(speech_array.astype(np.float32))
    if speech.ndim > 1:
        speech = speech.mean(dim=1)
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=SR)
    return resampler(speech).squeeze().numpy().astype("double")
def predict(speech_array, sampling_rate):
    # Extract features, run the classifier, and return one {label: score}
    # dict per emotion class, with scores formatted as percentages.
    speech = resample(speech_array, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model.to(device)(**inputs).logits
    scores = F.softmax(logits, dim=1)[0].cpu().numpy()
    return [{config.id2label[i]: f"{score * 100:.1f}%"} for i, score in enumerate(scores)]
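# The returned JSON is a list of single-entry dicts, one per label. Label
# names come from config.id2label; the values below are illustrative only:
#   [{"anger": "0.7%"}, {"neutral": "3.1%"}, {"happiness": "92.1%"}, ...]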
TRUST = True  # the checkpoint ships a custom model class, so remote code must be trusted
SR = 16000  # sampling rate the model expects

MODEL_ID = "Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition"
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def transcribe(audio):
    # Gradio's numpy audio component yields a (sample_rate, waveform) tuple.
    # Return scores for all labels rather than only the first list entry.
    sr, waveform = audio
    return predict(waveform, sr)
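# Quick check outside the UI (a sketch; "sample.wav" is a placeholder path):
#   waveform, sr = torchaudio.load("sample.wav")
#   print(predict(waveform.squeeze().numpy(), sr))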
def get_asr_interface():
    # gr.inputs.Audio(source=...) was removed in Gradio 4; the current API
    # takes the component directly, with a list of allowed input sources.
    return gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(sources=["upload"], type="numpy"),
        outputs="json",
    )
interfaces = [get_asr_interface()]
names = ["Russian Emotion Recognition"]

# enable_queue was removed from launch() in current Gradio releases.
gr.TabbedInterface(interfaces, names).launch(server_name="0.0.0.0")