import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Bypass any system proxy for local addresses so the Gradio server stays reachable.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
# SpeechBrain wav2vec2 emotion-recognition model (IEMOCAP), loaded from a local copy.
classifier = foreign_class(
    source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
)

# Paraformer offline ASR, FSMN voice-activity detection, and punctuation restoration.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()
def classify_continuous(audio):
    """Run VAD + ASR + punctuation and emotion classification on microphone audio."""
    if audio is None:
        return "none", "none", "none"

    # gr.Audio delivers (sample_rate, numpy array); normalize the waveform to [-1, 1].
    sample_rate, signal = audio  # this is the speech input
    signal = signal.astype(np.float32)
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak

    # Round-trip through a wav file, then resample to the 16 kHz both models expect.
    sf.write("a.wav", signal, sample_rate)
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
        signal
    )
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)
    speech, sample_rate = AudioReader.read_wav_file("out.wav")

    # Speech recognition: split into voiced segments, transcribe each, add punctuation.
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        # Segment boundaries are in milliseconds; at 16 kHz that is 16 samples per ms.
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],
            hot_words="任意热词 空格分开",  # placeholder hot words ("any hot words, space-separated")
        )
        text_results += punc.punctuate(_result)[0]

    # Emotion recognition on the resampled waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)

    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]
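

# Minimal local-testing sketch (an assumption, not part of the original Space): feed a
# wav file from disk through the same (sample_rate, int16 ndarray) tuple that gr.Audio
# produces. "test.wav" is a hypothetical path.
def classify_wav_file(path="test.wav"):
    data, sr = sf.read(path, dtype="int16")
    return classify_continuous((sr, data))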
# Gradio UI: microphone input, three text outputs (transcript, emotion probabilities, emotion label).
demo = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="语音识别结果"),  # ASR result
        gr.Text(label="音频情感识别1"),  # emotion recognition: class probabilities
        gr.Text(label="音频情感识别2"),  # emotion recognition: predicted label
    ],
)

demo.launch()