Spaces:

KingNish
/

OpenGPT-4o

Running on Zero

App Files Files Community

OpenGPT-4o / voice_chat.py

KingNish

Update voice_chat.py

41b91df verified 5 months ago

raw

history blame

2.89 kB

	import gradio as gr
	import edge_tts
	import asyncio
	import tempfile
	import numpy as np
	import soxr
	from pydub import AudioSegment
	import torch
	import sentencepiece as spm
	import onnxruntime as ort
	from huggingface_hub import hf_hub_download, InferenceClient

	# Speech Recognition Model Configuration
	# `model_name` is the Hugging Face Hub repo id that every hf_hub_download()
	# call below fetches from.
	model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
	# Target rate (Hz) that resample() converts incoming audio to.
	# NOTE(review): presumably the rate the recognizer expects — confirm
	# against the model card.
	sample_rate = 16000

	# Download preprocessor, encoder and tokenizer
	# All three artifacts come from the repo's "onnx" subfolder and are loaded
	# once, as module-level statements:
	#   preprocessor - TorchScript module loaded with torch.jit.load
	#   encoder      - ONNX Runtime inference session for "model.onnx"
	#   tokenizer    - SentencePiece processor built from "tokenizer.spm"
	preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
	encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
	tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))

	# Mixtral (Mistral AI) Model Configuration
	# `client1` is the Hugging Face InferenceClient used by model() to generate
	# replies with the Mixtral-8x7B-Instruct checkpoint.
	client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
	# Prompt prefix that model() prepends to the transcribed user text. It is
	# sent to the LLM verbatim, so any edit here changes runtime behavior.
	system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"

	def resample(audio_fp32, sr):
	    """Resample audio from ``sr`` Hz to the module-level ``sample_rate``.

	    Args:
	        audio_fp32: Audio samples to resample (passed straight to soxr).
	        sr: Sampling rate of ``audio_fp32`` in Hz.

	    Returns:
	        Whatever ``soxr.resample`` returns for the converted signal.
	    """
	    target_rate = sample_rate
	    resampled = soxr.resample(audio_fp32, sr, target_rate)
	    return resampled

	def to_float32(audio_buffer):
	    """Convert a PCM sample buffer to float32.

	    Integer buffers are divided by the largest positive value of their
	    dtype (e.g. 32767 for int16), so full-scale positive samples map to
	    1.0. Buffers that are already floating point are only cast to
	    float32; previously they raised ``ValueError`` because ``np.iinfo``
	    rejects float dtypes.

	    Args:
	        audio_buffer: NumPy array of audio samples (integer or floating).

	    Returns:
	        A float32 NumPy array with the same shape as ``audio_buffer``.
	    """
	    if np.issubdtype(audio_buffer.dtype, np.floating):
	        # Already floating point: no integer full-scale to normalize by.
	        return audio_buffer.astype(np.float32, copy=False)

	    full_scale = np.iinfo(audio_buffer.dtype).max
	    return np.divide(audio_buffer, full_scale, dtype=np.float32)

	def _ctc_greedy_ids(frame_ids, blank_id):
	    """Collapse a per-frame CTC label path: merge repeats, then drop blanks."""
	    token_ids = []
	    previous = blank_id
	    for current in frame_ids:
	        # A label is emitted only when it starts a new run; a genuinely
	        # doubled token is always separated by a blank frame in CTC.
	        if current != blank_id and current != previous:
	            token_ids.append(current)
	        previous = current
	    return token_ids

	def transcribe(audio_path):
	    """Transcribe an audio file to text with the module-level STT model.

	    The file is decoded with pydub, downmixed to mono, converted to
	    float32, resampled via ``resample()``, run through ``preprocessor``
	    and ``encoder``, and finally greedy-CTC decoded with ``tokenizer``.

	    Args:
	        audio_path: Path to an audio file readable by
	            ``AudioSegment.from_file``.

	    Returns:
	        The decoded transcription string.
	    """
	    # get_array_of_samples() interleaves channels (L, R, L, R, ...), so a
	    # stereo file must be downmixed before it is treated as one signal.
	    audio_file = AudioSegment.from_file(audio_path).set_channels(1)
	    sr = audio_file.frame_rate
	    audio_buffer = np.array(audio_file.get_array_of_samples())

	    audio_fp32 = to_float32(audio_buffer)
	    audio_16k = resample(audio_fp32, sr)

	    # Add a leading batch dimension of size 1 to both signal and length.
	    input_signal = torch.tensor(audio_16k).unsqueeze(0)
	    length = torch.tensor(len(audio_16k)).unsqueeze(0)
	    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)

	    # NOTE(review): the raw sample count is passed as 'length' while the
	    # length returned by the preprocessor is discarded — confirm this is
	    # what the exported encoder expects.
	    encoder_inputs = {'audio_signal': processed_signal.numpy(), 'length': length.numpy()}
	    logits = encoder.run(None, encoder_inputs)[0][0]

	    # The index equal to the tokenizer's vocabulary size is used as the
	    # CTC blank label.
	    blank_id = tokenizer.vocab_size()
	    frame_ids = logits.argmax(axis=1).tolist()
	    decoded_prediction = _ctc_greedy_ids(frame_ids, blank_id)
	    text = tokenizer.decode_ids(decoded_prediction)

	    return text

	def model(text):
	    """Generate the assistant's reply to ``text`` with ``client1``.

	    The prompt is the system instructions, then the user text, then the
	    "[OpenGPT 4o]" marker. The reply is streamed token by token and the
	    pieces are concatenated, skipping the "</s>" end-of-sequence token.

	    Args:
	        text: The user's message (here, the transcribed speech).

	    Returns:
	        The full generated reply as a single string.
	    """
	    prompt = "".join((system_instructions1, text, "[OpenGPT 4o]"))
	    token_stream = client1.text_generation(
	        prompt,
	        max_new_tokens=512,
	        stream=True,
	        details=True,
	        return_full_text=False,
	    )

	    pieces = []
	    for chunk in token_stream:
	        piece = chunk.token.text
	        if piece == "</s>":
	            continue
	        pieces.append(piece)
	    return "".join(pieces)

	async def respond(audio):
	    """Answer a spoken request with synthesized speech.

	    Transcribes ``audio``, asks the language model for a reply, and
	    renders that reply to an audio file with edge-tts.

	    Args:
	        audio: Path to the recorded user audio, as accepted by
	            ``transcribe()``.

	    Returns:
	        Path of a temporary audio file containing the spoken reply. The
	        file is created with ``delete=False``, so it is not removed
	        automatically.
	    """
	    user = transcribe(audio)
	    reply = model(user)
	    communicate = edge_tts.Communicate(reply)

	    # Only reserve a unique path here and close our handle before edge-tts
	    # opens the file itself; re-opening a still-open NamedTemporaryFile is
	    # not portable. edge-tts emits MP3 data by default, so the suffix is
	    # ".mp3" rather than ".wav" to match the actual content.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	        tmp_path = tmp_file.name

	    await communicate.save(tmp_path)
	    return tmp_path