"""Gradio demo: record microphone audio, run Paraformer ASR (with FSMN VAD and
punctuation restoration) and SpeechBrain wav2vec2 emotion recognition (IEMOCAP)."""

import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Keep local requests off any configured proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
# SpeechBrain wav2vec2 emotion classifier fine-tuned on IEMOCAP, loaded from a local copy.
classifier = foreign_class(
    source="pretrained_models\\local-speechbrain\\emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models\\local-speechbrain\\emotion-recognition-wav2vec2-IEMOCAP",
)

# Offline Paraformer ASR, FSMN voice activity detection, and punctuation restoration.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()


def classify_continuous(audio):
    """Transcribe one microphone recording and classify its emotion.

    `audio` is the (sample_rate, numpy samples) tuple produced by gr.Audio.
    """
    if audio is None:
        # Nothing was recorded.
        return "none", "none", "none"

    sample_rate, signal = audio  # the speech input
    signal = signal.astype(np.float32)
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak  # normalize to [-1, 1]

    # Round-trip through a WAV file and resample to the 16 kHz both models expect.
    sf.write("a.wav", signal, sample_rate)
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
        signal
    )
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)

    # ASR: detect voiced segments, transcribe each one, then restore punctuation.
    speech, sample_rate = AudioReader.read_wav_file("out.wav")
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        # Segment bounds are in milliseconds; at 16 kHz there are 16 samples per ms.
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],
            hot_words="任意热词 空格分开",  # placeholder: "any hot words, space-separated"
        )
        text_results += punc.punctuate(_result)[0]

    # Emotion recognition on the resampled waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)
    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]
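

# A minimal offline sketch (not part of the original script) for exercising the same
# pipeline without the Gradio UI. The file path "test.wav" and the helper name
# `classify_file` are illustrative assumptions, not from the source.
def classify_file(path="test.wav"):
    data, sr = sf.read(path, dtype="float32")  # decode WAV to float32 samples
    return classify_continuous((sr, data))  # same (sample_rate, samples) tuple gr.Audio provides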


# Three outputs: the ASR transcript, the emotion probability vector, and the predicted label.
demo = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="Speech recognition result"),
        gr.Text(label="Audio emotion recognition 1"),
        gr.Text(label="Audio emotion recognition 2"),
    ],
)

demo.launch()