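"""Gradio demo for Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition.

Accepts an uploaded speech clip and returns the model's emotion-class
probabilities as JSON.
"""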
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor


def resample(speech_array, sampling_rate):
    # Gradio delivers audio as an int16 numpy array, possibly stereo:
    # downmix to mono, convert to a float tensor, then resample to 16 kHz.
    if speech_array.ndim > 1:
        speech_array = speech_array.mean(axis=1)
    speech_tensor = torch.from_numpy(speech_array.astype(np.float32))
    resampler = torchaudio.transforms.Resample(sampling_rate, SR)
    return resampler(speech_tensor).squeeze().numpy()


def predict(speech_array, sampling_rate):
    speech = resample(speech_array, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Return the full probability distribution over emotion labels.
    scores = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return {config.id2label[i]: f"{score * 100:.1f}%" for i, score in enumerate(scores)}
 

TRUST = True
SR = 16000
MODEL_ID = "Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition"

config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Move the model to the GPU (if available) once at startup rather than on
# every request, and switch to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def transcribe(audio):
    # Despite the name, this classifies emotion rather than transcribing.
    # Gradio's numpy audio input arrives as a (sample_rate, data) tuple.
    sr, data = audio
    return predict(data, sr)


def get_asr_interface():
    # Named "asr" for historical reasons; the interface performs
    # emotion recognition on the uploaded audio.
    return gr.Interface(
        fn=transcribe,
        inputs=[
            gr.inputs.Audio(source="upload", type="numpy")
        ],
        outputs=[
            "json"
        ])

interfaces = [
    get_asr_interface()
]

names = [
    "Russian Emotion Recognition"
]

gr.TabbedInterface(interfaces, names).launch(server_name="0.0.0.0", enable_queue=False)
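
# Note: server_name="0.0.0.0" binds all interfaces so the app is reachable
# from outside the container (e.g. on Hugging Face Spaces); enable_queue=False
# answers requests synchronously instead of through Gradio's queue.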