File size: 11,644 Bytes
c50ad78 81e33eb 01023e5 c50ad78 066339d c50ad78 4a201a6 c50ad78 3133efe 7747dd1 93ee49a 7747dd1 c50ad78 7747dd1 c50ad78 a8f539b c50ad78 a0024f7 2a37747 c50ad78 a0024f7 da1e58a a0024f7 c50ad78 3133efe c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 01023e5 c50ad78 01023e5 c50ad78 81e33eb 728cf94 81e33eb 781ee39 81e33eb 728cf94 81e33eb 4a201a6 81e33eb 728cf94 781ee39 81e33eb 728cf94 81e33eb da1e58a 728cf94 81e33eb 781ee39 81e33eb 4a201a6 81e33eb 4a201a6 781ee39 d4318d7 804dbeb d4318d7 804dbeb c50ad78 d4318d7 781ee39 d4318d7 804dbeb 8da7d41 3133efe 8da7d41 3133efe 8da7d41 01023e5 a0024f7 01023e5 781ee39 804dbeb 7cc8180 8da7d41 81e33eb ba190f1 8da7d41 81e33eb 781ee39 81e33eb 781ee39 c50ad78 804dbeb d4318d7 c50ad78 066339d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import re
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
import threading
import queue
import sounddevice as sd
import numpy as np
import wave
import sys
# Language code of the speech-to-text model that is created at import time.
default_lang = "en"
# Speech-to-text engines keyed by language code; only the default language
# has an entry, and transcribe() looks its model up here.
engines = { default_lang: Model(default_lang) }
def transcribe(audio):
    """Transcribe a recording with the English speech-to-text engine.

    Args:
        audio: Recording handed to the engine's ``stt_file`` (the caller in
            this file passes a WAV file path), or ``None``.

    Returns:
        The first item of the ``stt_file`` result, or ``""`` when ``audio``
        is ``None``.
    """
    if audio is not None:
        recognizer = engines["en"]
        results = recognizer.stt_file(audio)
        return results[0]
    return ""
# Hugging Face access token taken from the environment (None when unset).
# NOTE(review): no other visible line reads this value — confirm whether the
# InferenceClient instances are meant to receive it.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
def client_fn(model):
    """Build the API client that serves the model named in ``model``.

    Args:
        model: Display name of the model (substring matching is used).

    Returns:
        An ``OpenAI`` client for the "Llama 3 8B Service" entry, otherwise an
        ``InferenceClient`` for the first matching hosted model. Unknown
        names fall back to Phi-3 mini.
    """
    # The self-hosted service must be tested first: its name also contains
    # "Llama", which would otherwise match the hosted Llama entry below.
    if "Llama 3 8B Service" in model:
        return OpenAI(
            base_url="http://52.76.81.56:60002/v1",
            api_key="token-abc123"
        )

    # Ordered (keyword, repository) pairs; the first keyword found wins.
    hosted_models = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
    )
    for keyword, repo_id in hosted_models:
        if keyword in model:
            return InferenceClient(repo_id)

    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
def randomize_seed_fn(seed: int) -> int:
    """Return a fresh random seed between 0 and 999999 inclusive.

    The incoming ``seed`` is accepted for interface compatibility but its
    value is not used.
    """
    return random.randint(0, 999999)
# System prompt sent with every request: as the "system" message for the
# chat-completions service, and as the leading text of the prompt built for
# the text-generation models.
system_instructions1 = """
[SYSTEM] You are OPTIMUS Prime a personal AI voice assistant, Created by Jaward.
Keep conversation friendly, short, clear, and concise.
Avoid unnecessary introductions and answer the user's questions directly.
Respond in a normal, conversational manner while being friendly and helpful.
Remember previous parts of the conversation and use that context in your responses.
Your creator Jaward is an AI Research Engineer at Linksoul AI. He is currently specializing in Artificial Intelligence (AI) research more specifically training and optimizing advance AI systems. He aspires to build not just human-like intelligence but AI Systems that augment human intelligence. He has contributed greatly to the opensource community with first-principles code implementations of AI/ML research papers. He did his first internship at Beijing Academy of Artificial Intelligence as an AI Researher where he contributed in cutting-edge AI research leading to him contributing to an insightful paper (AUTOAGENTS - A FRAMEWORK FOR AUTOMATIC AGENT GENERATION). The paper got accepted this year at IJCAI(International Joint Conference On AI). He is currently doing internship at LinkSoul AI - a small opensource AI Research startup in Beijing.
[USER]
"""
# Module-wide chat history: a list of {"role", "content"} dicts that models()
# appends to and clear_history() resets. It is shared by every caller of
# this module.
conversation_history = []
def _remember_exchange(user_text, reply):
    """Append one user/assistant turn to the shared history and trim it."""
    global conversation_history
    conversation_history.append({"role": "user", "content": user_text})
    conversation_history.append({"role": "assistant", "content": reply})
    # Every exchange adds two messages, so capping the list at 20 messages
    # keeps the last 10 exchanges and bounds the prompt size.
    if len(conversation_history) > 20:
        conversation_history = conversation_history[-20:]


def models(text, model="Llama 3 8B Service", seed=42):
    """Ask the selected model for a reply to ``text`` and record the exchange.

    Args:
        text: The user's message.
        model: Display name of the model; passed to ``client_fn`` to pick the
            client. Names containing "Llama 3 8B Service" use the
            chat-completions API, all others use streamed text generation.
        seed: Replaced by a value from ``randomize_seed_fn`` before use, so
            the argument's own value has no effect. The resulting seed is
            only sent on the text-generation path.

    Returns:
        The assistant's reply as a string.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)

    if "Llama 3 8B Service" in model:
        messages = [
            {"role": "system", "content": system_instructions1},
            *conversation_history,
            {"role": "user", "content": text},
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages
        )
        # Store a string even if the service returns no content, so later
        # prompts built from the history stay well-formed.
        reply = completion.choices[0].message.content or ""
    else:
        # These models take a single prompt string, so the conversation
        # history is flattened into "User:/Assistant:" lines.
        history_text = "\n".join(
            f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
            for msg in conversation_history
        )
        formatted_prompt = f"{system_instructions1}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"
        stream = client.text_generation(
            formatted_prompt,
            max_new_tokens=300,
            seed=seed,
            stream=True,
            details=True,
            return_full_text=False,
        )
        # Concatenate streamed tokens, dropping the end-of-sequence marker.
        reply = "".join(
            response.token.text
            for response in stream
            if response.token.text != "</s>"
        )

    _remember_exchange(text, reply)
    return reply
# New global variables for audio processing
# Sample rate in Hz; used for the input stream and for the WAV files written
# before transcription.
RATE = 16000
CHUNK = int(RATE / 10) # 100ms
# Audio blocks produced by audio_callback() and consumed by
# process_audio_stream().
audio_queue = queue.Queue()
# Shared on/off flag set by start_listening() / stop_listening() and polled
# by the capture loop and the worker thread.
is_listening = False
def audio_callback(indata, frames, time, status):
    """Input-stream callback: queue a copy of each captured audio block.

    Args:
        indata: The captured block; a copy of it is queued.
        frames: Unused.
        time: Unused.
        status: Stream status; printed to stderr when truthy.
    """
    if status:
        print(status, file=sys.stderr)
    captured = indata.copy()
    audio_queue.put(captured)
def _drain_audio_queue():
    """Discard every audio block currently waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _save_utterance(samples):
    """Write float samples to a temporary 16-bit mono WAV file; return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    # The placeholder handle is closed before reopening the path for writing.
    with wave.open(tmp_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(RATE)
        # Clip first so out-of-range samples saturate instead of wrapping
        # around when converted to int16.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        wf.writeframes(pcm.tobytes())
    return tmp_path


def process_audio_stream(model, seed):
    """Worker loop: turn queued microphone audio into spoken assistant replies.

    Audio blocks are buffered until more than ``max_silence`` seconds of
    consecutive quiet blocks have been seen; the buffer is then transcribed
    and, if any text was recognised, sent to ``models`` and the reply is
    spoken via ``respond_and_play``.

    The loop runs while the global ``is_listening`` flag is true and returns
    once it is cleared, so each ``start_listening`` call gets its own
    short-lived worker instead of accumulating idle, spinning threads.

    Args:
        model: Model display name forwarded to ``models``.
        seed: Seed forwarded to ``models``.
    """
    audio_buffer = []
    silence_threshold = 0.01  # mean absolute amplitude below this is "quiet"
    silence_duration = 0
    max_silence = 2  # seconds

    while is_listening:
        try:
            # The timeout lets the loop re-check is_listening regularly.
            chunk = audio_queue.get(timeout=1)
        except queue.Empty:
            continue

        audio_buffer.append(chunk)

        # Check for silence
        if np.abs(chunk).mean() < silence_threshold:
            silence_duration += len(chunk) / RATE
        else:
            silence_duration = 0

        if silence_duration <= max_silence:
            continue

        # Process the buffered audio
        tmp_path = _save_utterance(np.concatenate(audio_buffer))
        audio_buffer.clear()
        silence_duration = 0
        try:
            user_input = transcribe(tmp_path)
        finally:
            # The WAV file is only needed for transcription.
            os.remove(tmp_path)

        if user_input:
            reply = models(user_input, model, seed)
            asyncio.run(respond_and_play(reply))
            # Drop whatever was captured while the reply was being generated
            # and played, so the assistant does not transcribe itself. The
            # is_listening flag is left alone: clearing it here would end the
            # capture loop in start_listening(), and re-setting it afterwards
            # could override a stop request made in the meantime.
            _drain_audio_queue()

    # Listening was switched off: leave no stale audio for the next session.
    _drain_audio_queue()
async def respond_and_play(text):
    """Synthesise ``text`` with edge-tts and play it on the default output.

    The audio is written to a temporary file, read back with ``wave`` and
    played with sounddevice; the call blocks until playback has finished.
    The temporary file is removed afterwards.

    Args:
        text: The sentence(s) to speak.

    Raises:
        wave.Error: If the synthesised file is not a readable WAV file.
    """
    communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name

    try:
        # NOTE(review): edge-tts appears to emit MP3 by default regardless of
        # the file suffix, which wave.open() cannot parse — confirm the
        # output format and add a decode step if needed.
        await communicate.save(tmp_path)

        with wave.open(tmp_path, 'rb') as wf:
            # Read the stream parameters while the file is still open.
            frame_rate = wf.getframerate()
            channel_count = wf.getnchannels()
            data = wf.readframes(wf.getnframes())
    finally:
        os.remove(tmp_path)

    # Assumes 16-bit samples, matching the int16 dtype used below.
    samples = np.frombuffer(data, dtype=np.int16)
    if channel_count > 1:
        # Frames are interleaved; give sounddevice one column per channel.
        samples = samples.reshape(-1, channel_count)

    # Play the audio
    sd.play(samples, frame_rate)
    sd.wait()
def start_listening(model, seed):
    """Capture microphone audio until listening is switched off.

    Opens the input stream, starts a daemon worker running
    ``process_audio_stream`` and then blocks, polling the global
    ``is_listening`` flag every 100 ms.

    Args:
        model: Model display name forwarded to the worker.
        seed: Seed forwarded to the worker.

    Returns:
        The status text to display once listening has ended.
    """
    global is_listening
    is_listening = True
    try:
        with sd.InputStream(callback=audio_callback, channels=1, samplerate=RATE, blocksize=CHUNK):
            # Start the worker only after the stream opened successfully, so
            # a missing or busy input device does not leave an orphan thread.
            threading.Thread(target=process_audio_stream, args=(model, seed), daemon=True).start()
            while is_listening:
                sd.sleep(100)
    finally:
        # Also reached when the stream fails to open or raises mid-capture;
        # the flag must not stay set with no stream behind it.
        is_listening = False
    return "Status: Not listening"
def stop_listening():
    """Switch listening off and return the status text to display.

    Clearing the global ``is_listening`` flag makes the capture loop in
    ``start_listening`` finish. A string is returned (rather than ``None``)
    because the UI wires this function's result into the status component.
    """
    global is_listening
    is_listening = False
    return "Status: Not listening"
# Supported languages for seamless-expressive
# Maps the names offered in the "Target Language" dropdown to the codes
# passed to the translation command as --tgt_lang.
LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "French": "fra",
    "German": "deu",
    "Italian": "ita",
    "Chinese": "cmn"
}
def translate_speech(audio_file, target_language):
    """
    Translate input speech (audio file) to the specified target language.

    Runs the external ``expressivity_predict`` command and returns the path
    of the audio file it produced.

    Args:
        audio_file: Path of the recording to translate, or ``None``.
        target_language: A key of ``LANGUAGE_CODES`` (e.g. ``"German"``).

    Returns:
        Path of the translated WAV file, or ``None`` when no input was given
        or the command produced no output file.

    Raises:
        KeyError: If ``target_language`` is not in ``LANGUAGE_CODES``.
        subprocess.CalledProcessError: If the command exits with an error.
    """
    if audio_file is None:
        return None

    language_code = LANGUAGE_CODES[target_language]

    # Each call writes into its own directory: a single fixed path in the
    # working directory would be overwritten by concurrent requests, and a
    # file left over from an earlier run could be mistaken for fresh output.
    out_dir = tempfile.mkdtemp(prefix="translated_audio_")
    output_file = os.path.join(out_dir, "translated_audio.wav")

    command = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", language_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "models",
        "--output_path", output_file
    ]

    produced = False
    try:
        subprocess.run(command, check=True)
        produced = os.path.exists(output_file)
    finally:
        if not produced:
            try:
                os.rmdir(out_dir)
            except OSError:
                # Best-effort cleanup only; a non-empty or already removed
                # directory must not mask the original outcome.
                pass

    if produced:
        print(f"File created successfully: {output_file}")
        return output_file

    print(f"File not found: {output_file}")
    return None
def clear_history():
    """Reset the shared conversation history to an empty list.

    Returns:
        A tuple of four ``None`` values (presumably to blank four UI
        components — no caller is visible in this file; verify).
    """
    global conversation_history
    conversation_history = []
    return (None,) * 4
def voice_assistant_tab():
    """Return the Markdown heading shown when the Voice Assistant tab is selected."""
    heading = "# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"
    return heading
def speech_translation_tab():
    """Return the Markdown heading shown when the Speech Translation tab is selected."""
    heading = "# <center><b>Hear how you sound in another language</b></center>"
    return heading
def _on_start_listening(model, seed):
    """Run the blocking capture loop, then report that listening has ended."""
    start_listening(model, seed)
    return "Status: Not listening"


def _on_stop_listening():
    """Switch listening off and report the new status."""
    stop_listening()
    return "Status: Not listening"


# Two-tab UI: a hands-free voice assistant and a speech-to-speech translator.
with gr.Blocks(css="style.css") as demo:
    # Page heading; replaced when the user switches tabs (see the .select
    # handlers at the bottom).
    description = gr.Markdown("# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>")

    with gr.Tabs() as tabs:
        with gr.TabItem("Voice Assistant") as voice_assistant:
            select = gr.Dropdown([
                'Llama 3 8B Service',
                'Mixtral 8x7B',
                'Llama 3 8B',
                'Mistral 7B v0.3',
                'Phi 3 mini',
            ],
                value="Llama 3 8B Service",
                label="Model"
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=999999,
                step=1,
                value=0,
                visible=False
            )

            start_button = gr.Button("Start Listening")
            stop_button = gr.Button("Stop Listening")
            status = gr.Markdown("Status: Not listening")

            # The status text is driven from Python rather than from a
            # JavaScript snippet: the earlier snippet looked up a DOM id
            # ("status") that no component declares, and the handlers'
            # None results blanked the Markdown. The first step shows
            # "Listening" immediately; the chained step blocks for the
            # whole session and reports "Not listening" when it ends.
            start_button.click(
                fn=lambda: "Status: Listening",
                inputs=None,
                outputs=[status],
            ).then(
                fn=_on_start_listening,
                inputs=[select, seed],
                outputs=[status],
            )

            stop_button.click(
                fn=_on_stop_listening,
                inputs=None,
                outputs=[status],
            )

        with gr.TabItem("Speech Translation") as speech_translation:
            input_audio = gr.Audio(label="User", sources=["microphone"], type="filepath")
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="German",
                label="Target Language"
            )
            output_audio = gr.Audio(label="Translated Audio",
                                    interactive=False,
                                    autoplay=True,
                                    elem_classes="audio")

            # live=True: translate_speech is wired to the two inputs without
            # a separate submit step being defined here.
            gr.Interface(
                fn=translate_speech,
                inputs=[input_audio, target_lang],
                outputs=[output_audio],
                live=True
            )

    # Swap the page heading to match the selected tab.
    voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
    speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)
if __name__ == "__main__":
    # Enable request queuing (max_size=200) and start the app when the file
    # is run as a script.
    demo.queue(max_size=200).launch()