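"""Gradio demo for Russian speech emotion recognition.

A small app wrapping the Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition
model: audio recorded from the microphone is resampled to 16 kHz, run through
the wav2vec2 classifier, and the per-emotion probabilities are shown as labels.
"""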
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor

TRUST = True  # allow the model repo's custom code to run (trust_remote_code)
SR = 16000    # sampling rate the wav2vec2 model expects


def resample(speech_array, sampling_rate):
    """Resample a mono waveform from ``sampling_rate`` to the model rate ``SR``."""
    speech = torch.from_numpy(speech_array)
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=SR)
    return resampler(speech).squeeze().numpy()


def predict(speech_array, sampling_rate):
    """Classify one waveform and return a label -> probability mapping."""
    speech = resample(speech_array, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return {config.id2label[i]: round(float(score), 3) for i, score in enumerate(scores)}
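# A rough sketch of offline use, assuming a local file "sample.wav" (hypothetical):
#
#     waveform, sr = torchaudio.load("sample.wav")        # (channels, time)
#     scores = predict(waveform.mean(dim=0).numpy(), sr)  # mix down to mono
#     print(max(scores, key=scores.get))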


MODEL_ID = "Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition"

config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the model once at startup instead of on every request
model.eval()
print(f"Using device: {device}")


def recognize(audio):
    """Gradio callback: ``audio`` is a ``(sample_rate, numpy array)`` tuple."""
    sr, audio_array = audio
    if audio_array.ndim > 1:  # stereo recording: mix down to mono
        audio_array = audio_array.mean(axis=1)
    # Gradio delivers 16-bit PCM; scale into the [-1, 1] float range.
    audio_array = audio_array.astype(np.float32) / 32768.0
    return predict(audio_array, sr)


def test_some(audio):
    """Debug helper (not wired into the UI): echoes the raw audio back."""
    sr, audio_array = audio
    audio_array = audio_array.astype(np.float32)
    return (sr, audio_array)
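# NOTE: ``source="microphone"`` below is the Gradio 3.x argument; Gradio 4
# renamed it to ``sources=["microphone"]``.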


interface = gr.Interface(
    fn=recognize,
    inputs=[
        gr.Audio(source="microphone", label="Say something...")
    ],
    outputs=[
        gr.Label(num_top_classes=7)  # one slot per emotion class in config.id2label
    ],
    live=False,
)

gr.TabbedInterface([interface], ["Russian Emotion Recognition"]).launch(debug=True)
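# Run directly (e.g. ``python app.py``), this serves the UI on Gradio's default
# http://127.0.0.1:7860; ``debug=True`` keeps the process attached and prints
# errors to the console.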