import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Keep requests to the local Gradio server off any configured proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"

# Speech emotion recognition model (wav2vec2 fine-tuned on IEMOCAP, via SpeechBrain).
classifier = foreign_class(
    source="pretrained_models\\speechbrain\\emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models\\speechbrain\\emotion-recognition-wav2vec2-IEMOCAP",
)

# Paraformer ASR model, FSMN voice-activity detection, and punctuation restoration.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()


def classify_continuous(audio):
    """Transcribe a microphone recording and classify its emotion."""
    if audio is None:
        # Nothing was recorded; return placeholder outputs.
        return "none", "none", "none"

    print(type(audio))
    print(audio)

    # Gradio delivers the recording as a (sample_rate, numpy array) tuple.
    sample_rate, signal = audio
    signal = signal.astype(np.float32)
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak  # normalize to [-1, 1]; skip for all-silence input

    # Round-trip through a file so torchaudio can resample to the 16 kHz
    # expected by both the ASR and emotion models.
    sf.write("a.wav", signal, sample_rate)
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
        signal
    )
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)

    audio_path = "out.wav"
    speech, sample_rate = AudioReader.read_wav_file(audio_path)

    # Split the recording into speech segments, transcribe each, and add punctuation.
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        # VAD segment boundaries are in milliseconds; 16 samples per ms at 16 kHz.
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],
            hot_words="任意热词 空格分开",  # placeholder: "any hot words, space-separated"
        )
        text_results += punc.punctuate(_result)[0]

    # Emotion classification on the resampled waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)

    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]


demo = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="语音识别结果"),  # ASR transcript
        gr.Text(label="音频情感识别1"),  # emotion class probabilities
        gr.Text(label="音频情感识别2"),  # predicted emotion label
    ],
)

demo.launch()
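
# --- Optional offline check (a minimal sketch, not part of the original demo) ---
# classify_continuous expects the same (sample_rate, numpy array) tuple that
# gr.Audio passes to its callback, so it can be exercised without the UI.
# The file name "example.wav" is an assumption for illustration only; run the
# snippet in place of demo.launch() for a quick smoke test:
#
#     import soundfile as sf
#     data, sr = sf.read("example.wav", dtype="int16")
#     text, probs, label = classify_continuous((sr, data))
#     print(text, probs, label)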