import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Keep requests to local services from being routed through an HTTP proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
# SpeechBrain wav2vec2 emotion classifier fine-tuned on IEMOCAP, loaded from a
# local copy via its custom interface.
classifier = foreign_class(
    source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
)

# Paraformer offline ASR, FSMN voice-activity detection, and punctuation restoration.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()
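
# Quick standalone sanity check for the emotion model (sketch: assumes the bundled
# custom_interface.py matches the upstream SpeechBrain model card, which exposes
# classify_file, and that a local recording "sample.wav" exists):
#     out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
#     print(text_lab)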
def classify_continuous(audio):
    """Run speech recognition and speech-emotion recognition on one recorded clip."""
    # Gradio delivers microphone input as a (sample_rate, numpy_signal) tuple.
    if audio is None:
        return "none", "none", "none"
    sample_rate, signal = audio

    # Normalize to [-1, 1] and write a temporary wav file for torchaudio.
    signal = signal.astype(np.float32)
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak
    sf.write("a.wav", signal, sample_rate)

    # Resample to 16 kHz, which both the ASR and the emotion model expect.
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(signal)
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)

    # Speech recognition: the VAD splits the clip into voiced segments, Paraformer
    # transcribes each segment, and the punctuator restores punctuation.
    speech, sample_rate = AudioReader.read_wav_file("out.wav")
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        # Segment boundaries are in milliseconds; *16 converts them to sample
        # offsets at 16 kHz.
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],
            hot_words="任意热词 空格分开",  # "any hot words, separated by spaces"
        )
        text_results += punc.punctuate(_result)[0]

    # Emotion recognition on the resampled waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)
    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]
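
# Invoking the pipeline directly, without the UI (sketch: assumes a local mono
# recording exists; "example.wav" is a hypothetical file name):
#     data, sr = sf.read("example.wav", dtype="float32")
#     transcript, probs, emotion = classify_continuous((sr, data))
#     print(transcript, emotion)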
# Gradio UI: record from the microphone; show the transcript, the emotion
# probability vector, and the predicted emotion label.
demo = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="Speech recognition result"),
        gr.Text(label="Audio emotion recognition (probabilities)"),
        gr.Text(label="Audio emotion recognition (label)"),
    ],
)
demo.launch()
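
# Optional launch settings (sketch; standard Gradio parameters): bind to all
# interfaces and create a temporary public share link instead of the default call above.
#     demo.launch(server_name="0.0.0.0", share=True)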