File size: 11,644 Bytes
c50ad78
 
 
 
 
 
 
 
 
 
 
81e33eb
01023e5
 
 
 
 
 
c50ad78
 
 
066339d
c50ad78
 
4a201a6
 
c50ad78
 
 
 
 
 
 
 
3133efe
7747dd1
93ee49a
7747dd1
 
c50ad78
 
 
 
 
 
7747dd1
 
c50ad78
 
 
 
 
 
 
 
a8f539b
c50ad78
 
 
a0024f7
2a37747
c50ad78
 
 
a0024f7
 
da1e58a
a0024f7
c50ad78
 
 
 
 
3133efe
c50ad78
 
a0024f7
c50ad78
 
 
 
 
 
a0024f7
 
 
 
 
 
 
 
 
 
 
c50ad78
a0024f7
 
 
 
c50ad78
 
 
 
 
 
 
 
 
 
a0024f7
 
 
 
 
 
 
 
 
c50ad78
 
01023e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c50ad78
 
 
01023e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c50ad78
81e33eb
 
 
 
 
 
728cf94
 
81e33eb
781ee39
81e33eb
 
728cf94
81e33eb
4a201a6
 
 
81e33eb
728cf94
781ee39
81e33eb
728cf94
 
81e33eb
 
 
da1e58a
728cf94
81e33eb
781ee39
81e33eb
 
 
 
4a201a6
81e33eb
 
4a201a6
781ee39
d4318d7
 
 
804dbeb
d4318d7
804dbeb
 
 
 
 
c50ad78
d4318d7
 
781ee39
d4318d7
804dbeb
8da7d41
3133efe
8da7d41
 
 
 
 
3133efe
8da7d41
 
 
 
 
 
 
 
 
 
01023e5
 
 
a0024f7
01023e5
 
 
 
 
 
 
 
 
 
 
781ee39
 
804dbeb
7cc8180
8da7d41
81e33eb
ba190f1
8da7d41
 
81e33eb
 
 
 
781ee39
 
 
 
81e33eb
781ee39
 
c50ad78
804dbeb
 
d4318d7
c50ad78
066339d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import re
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
import threading
import queue
import sounddevice as sd
import numpy as np
import wave
import sys

# Language code used to build the speech-to-text engine table below.
default_lang = "en"

# Speech-to-text engines keyed by language code; only the default language
# is instantiated here (at import time).
engines = { default_lang: Model(default_lang) }

def transcribe(audio):
    """Transcribe an audio file with the English speech-to-text engine.

    Args:
        audio: Path of the audio file to transcribe, or ``None``.

    Returns:
        The first element of the engine's ``stt_file`` result, or an empty
        string when ``audio`` is ``None``.
    """
    if audio is not None:
        stt_engine = engines["en"]
        results = stt_engine.stt_file(audio)
        return results[0]
    return ""

# Hugging Face access token taken from the environment (None when unset).
# NOTE(review): nothing in the visible part of this file reads HF_TOKEN --
# confirm whether the InferenceClient instances were meant to receive it.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

def client_fn(model):
    """Build the API client that matches a model label from the UI.

    Args:
        model: Display name of the model; matching is by substring.

    Returns:
        An ``OpenAI`` client for the "Llama 3 8B Service" label, otherwise an
        ``InferenceClient`` for the first keyword found in ``model``. Labels
        matching no keyword fall back to the Phi-3 mini model.
    """
    # NOTE(review): the endpoint address and API key are hard-coded here.
    if "Llama 3 8B Service" in model:
        return OpenAI(
            base_url="http://52.76.81.56:60002/v1",
            api_key="token-abc123"
        )

    # Checked in order, so the sequence of keywords is significant.
    hub_models = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
    )
    for keyword, repo_id in hub_models:
        if keyword in model:
            return InferenceClient(repo_id)
    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")

def randomize_seed_fn(seed: int) -> int:
    """Return a fresh random seed in ``[0, 999999]``.

    The ``seed`` argument is accepted but not used: the result is always
    newly drawn.
    """
    return random.randint(0, 999999)

# System prompt sent with every request: as the "system" message for the
# chat-completions path and as the prefix of the text prompt for the other
# models (see models()).
system_instructions1 = """
[SYSTEM] You are OPTIMUS Prime a personal AI voice assistant, Created by Jaward.
Keep conversation friendly, short, clear, and concise. 
Avoid unnecessary introductions and answer the user's questions directly. 
Respond in a normal, conversational manner while being friendly and helpful.
Remember previous parts of the conversation and use that context in your responses.
Your creator Jaward is an AI Research Engineer at Linksoul AI. He is currently specializing in Artificial Intelligence (AI) research more specifically training and optimizing advance AI systems. He aspires to build not just human-like intelligence but AI Systems that augment human intelligence. He has contributed greatly to the opensource community with first-principles code implementations of AI/ML research papers. He did his first internship at Beijing Academy of Artificial Intelligence as an AI Researher where he contributed in cutting-edge AI research leading to him contributing to an insightful paper (AUTOAGENTS - A FRAMEWORK FOR AUTOMATIC AGENT GENERATION). The paper got accepted this year at IJCAI(International Joint Conference On AI). He is currently doing internship at LinkSoul AI - a small opensource AI Research startup in Beijing.
[USER]
"""

# Module-level chat log of {"role": ..., "content": ...} dicts, appended to by
# models() and reset by clear_history(). Being a module global, it is a single
# list for the whole process.
conversation_history = []

def _record_exchange(user_text, assistant_text, max_messages=20):
    """Append one user/assistant exchange to the shared history and trim it.

    Only the most recent ``max_messages`` entries (20 by default, i.e. ten
    exchanges) are kept so the prompt does not grow without bound.
    """
    conversation_history.append({"role": "user", "content": user_text})
    conversation_history.append({"role": "assistant", "content": assistant_text})
    if len(conversation_history) > max_messages:
        # Trim in place so every holder of the list sees the same contents.
        del conversation_history[:-max_messages]


def models(text, model="Llama 3 8B Service", seed=42):
    """Generate the assistant's reply to ``text`` and record the exchange.

    Args:
        text: The user's message.
        model: UI label of the model to query. "Llama 3 8B Service" goes
            through the chat-completions client; every other label goes
            through streamed text generation.
        seed: Accepted for interface compatibility; it is replaced by a
            freshly randomized seed before use.

    Returns:
        The assistant's reply as a string.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)

    if "Llama 3 8B Service" in model:
        messages = [
            {"role": "system", "content": system_instructions1},
            *conversation_history,
            {"role": "user", "content": text},
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages
        )
        reply = completion.choices[0].message.content
    else:
        # These models take a single text prompt, so the history is flattened
        # into "User:/Assistant:" lines.
        history_text = "\n".join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in conversation_history])
        formatted_prompt = f"{system_instructions1}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"

        stream = client.text_generation(
            formatted_prompt,
            max_new_tokens=300,
            seed=seed,
            stream=True,
            details=True,
            return_full_text=False,
        )
        # Drop the end-of-sequence marker; keep every other streamed token.
        reply = "".join(
            chunk.token.text for chunk in stream if chunk.token.text != "</s>"
        )

    _record_exchange(text, reply)
    return reply

# Global state shared by the audio capture callback and the processing thread.
RATE = 16000  # sample rate in Hz used for capture and for the WAV files
CHUNK = int(RATE / 10)  # 100ms
# Chunks delivered by audio_callback(), consumed by process_audio_stream().
audio_queue = queue.Queue()
# Flag toggled by start_listening()/stop_listening() and polled by the loops.
is_listening = False

def audio_callback(indata, frames, time, status):
    """Stream callback: queue a copy of each captured block.

    Any ``status`` reported with the block is printed to stderr. ``frames``
    and ``time`` are part of the callback signature but are not used.
    """
    if status:
        print(status, file=sys.stderr)
    # Copy before queueing so the queued chunk is independent of ``indata``.
    snapshot = indata.copy()
    audio_queue.put(snapshot)

def _drain_audio_queue():
    """Discard every chunk currently waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _write_utterance_wav(audio_buffer):
    """Write the buffered chunks to a temporary mono 16-bit WAV; return its path."""
    audio_data = np.concatenate(audio_buffer)
    # Reserve a path and close the handle first so ``wave`` can reopen it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    with wave.open(tmp_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(RATE)
        wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
    return tmp_path


def process_audio_stream(model, seed):
    """Turn queued microphone chunks into spoken assistant replies, forever.

    Chunks are buffered until more than ``max_silence`` seconds of consecutive
    quiet chunks have been seen. The buffer is then written to a WAV file and
    transcribed, and a non-empty transcript is answered via ``models`` and
    spoken with ``respond_and_play``.

    Args:
        model: Model label forwarded to ``models``.
        seed: Seed forwarded to ``models``.

    This function never returns; it is meant to run in a daemon thread.
    """
    audio_buffer = []
    silence_threshold = 0.01
    silence_duration = 0
    max_silence = 2  # seconds

    while True:
        if not is_listening:
            audio_buffer.clear()
            silence_duration = 0
            _drain_audio_queue()
            # Sleep instead of spinning at full CPU while idle.
            sd.sleep(100)
            continue

        try:
            chunk = audio_queue.get(timeout=1)
        except queue.Empty:
            continue

        audio_buffer.append(chunk)

        # Track how long the input has been quiet.
        if np.abs(chunk).mean() < silence_threshold:
            silence_duration += CHUNK / RATE
        else:
            silence_duration = 0

        if silence_duration <= max_silence:
            continue

        # End of utterance: transcribe what was buffered, then start afresh.
        tmp_path = _write_utterance_wav(audio_buffer)
        try:
            user_input = transcribe(tmp_path)
        finally:
            os.remove(tmp_path)
        audio_buffer.clear()
        silence_duration = 0

        if user_input:
            reply = models(user_input, model, seed)
            asyncio.run(respond_and_play(reply))
            # ``is_listening`` is deliberately left untouched while replying:
            # clearing it would end the capture loop in start_listening() and
            # close the microphone stream. Audio captured during the reply is
            # thrown away here instead.
            _drain_audio_queue()

async def respond_and_play(text):
    """Synthesize ``text`` with edge-tts and play it on the default output.

    The synthesized audio is saved to a temporary file, transcoded to mono
    16-bit PCM WAV with ``ffmpeg``, then read with ``wave`` and played.
    Playback blocks until finished. Both temporary files are removed
    afterwards.

    Args:
        text: The sentence(s) to speak.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
        subprocess.CalledProcessError: If the transcoding step fails.
    """
    communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")

    # Reserve both paths and close the handles right away so the writers below
    # can open the files themselves.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as encoded_file:
        encoded_path = encoded_file.name
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as pcm_file:
        pcm_path = pcm_file.name

    try:
        await communicate.save(encoded_path)

        # The saved audio is compressed rather than RIFF/WAV, which the
        # ``wave`` module rejects, so it is converted first. ffmpeg detects
        # the input format from the file contents.
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-i", encoded_path,
                "-ac", "1",
                "-acodec", "pcm_s16le",
                pcm_path,
            ],
            check=True,
        )

        with wave.open(pcm_path, 'rb') as wf:
            frame_rate = wf.getframerate()
            data = wf.readframes(wf.getnframes())

        # Play the audio
        sd.play(np.frombuffer(data, dtype=np.int16), frame_rate)
        sd.wait()
    finally:
        for path in (encoded_path, pcm_path):
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup; a missing file is not an error here.
                pass

def start_listening(model, seed):
    """Start capturing microphone audio and block until listening stops.

    Sets the global ``is_listening`` flag, starts a daemon thread running
    ``process_audio_stream(model, seed)``, then keeps an input stream open,
    polling the flag every 100 ms, for as long as the flag stays set.

    Args:
        model: Model label handed to the processing thread.
        seed: Seed handed to the processing thread.
    """
    global is_listening
    is_listening = True

    worker = threading.Thread(
        target=process_audio_stream,
        args=(model, seed),
        daemon=True,
    )
    worker.start()

    capture = sd.InputStream(
        callback=audio_callback,
        channels=1,
        samplerate=RATE,
        blocksize=CHUNK,
    )
    with capture:
        while is_listening:
            sd.sleep(100)

def stop_listening():
    """Clear the global ``is_listening`` flag polled by the audio loops."""
    global is_listening
    is_listening = False

# Supported languages for seamless-expressive
# Maps the display name shown in the target-language dropdown to the code
# passed as --tgt_lang by translate_speech().
LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "French": "fra",
    "German": "deu",
    "Italian": "ita",
    "Chinese": "cmn"
}

def translate_speech(audio_file, target_language):
    """
    Translate input speech (audio file) to the specified target language.

    Runs the ``expressivity_predict`` command-line tool and returns the path
    of the audio file it produced.

    Args:
        audio_file: Path of the recorded input audio, or ``None``.
        target_language: A key of ``LANGUAGE_CODES`` (e.g. "German").

    Returns:
        Path to the translated WAV file, or ``None`` when there is no input,
        the language is not supported, or no output file was produced.

    Raises:
        subprocess.CalledProcessError: If the translation command fails.
    """
    if audio_file is None:
        return None

    language_code = LANGUAGE_CODES.get(target_language)
    if language_code is None:
        print(f"Unsupported target language: {target_language}")
        return None

    # Each call gets its own output directory so concurrent requests cannot
    # overwrite one another's result.
    output_file = os.path.join(
        tempfile.mkdtemp(prefix="translation_"), "translated_audio.wav"
    )

    command = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", language_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "models",
        "--output_path", output_file
    ]

    subprocess.run(command, check=True)

    if os.path.exists(output_file):
        print(f"File created successfully: {output_file}")
        return output_file

    print(f"File not found: {output_file}")
    return None

def clear_history():
    """Reset the shared conversation history to an empty list.

    Returns:
        A tuple of four ``None`` values.
    """
    global conversation_history
    conversation_history = []
    return (None,) * 4

def voice_assistant_tab():
    """Return the Markdown heading shown for the Voice Assistant tab."""
    heading = "# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"
    return heading

def speech_translation_tab():
    """Return the Markdown heading shown for the Speech Translation tab."""
    heading = "# <center><b>Hear how you sound in another language</b></center>"
    return heading

def _start_and_report(model, seed):
    """Run a listening session, yielding the status text before and after it."""
    yield "Status: Listening"
    # Blocks until listening stops.
    start_listening(model, seed)
    yield "Status: Not listening"


def _stop_and_report():
    """Stop listening and return the status text to display."""
    stop_listening()
    return "Status: Not listening"


# UI layout: one tab for the voice assistant, one for speech translation. The
# heading at the top is swapped when a tab is selected.
with gr.Blocks(css="style.css") as demo:
    description = gr.Markdown("# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>")
    
    with gr.Tabs() as tabs:
        with gr.TabItem("Voice Assistant") as voice_assistant:
            select = gr.Dropdown([
                'Llama 3 8B Service',
                'Mixtral 8x7B',
                'Llama 3 8B',
                'Mistral 7B v0.3',
                'Phi 3 mini',
            ],
            value="Llama 3 8B Service",
            label="Model"
            )
            seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=999999,
            step=1,
            value=0,
            visible=False
            )
            start_button = gr.Button("Start Listening")
            stop_button = gr.Button("Stop Listening")
            status = gr.Markdown("Status: Not listening")

            # The status line is driven by the handlers' return values. The
            # previous JavaScript looked up an element id ("status") that no
            # component declares, and the handlers returned None into
            # ``status``.
            start_button.click(
                fn=_start_and_report,
                inputs=[select, seed],
                outputs=[status],
            )
            stop_button.click(
                fn=_stop_and_report,
                inputs=[],
                outputs=[status],
            )
        
        with gr.TabItem("Speech Translation") as speech_translation:
            input_audio = gr.Audio(label="User", sources=["microphone"], type="filepath")
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="German",
                label="Target Language"
            )
            output_audio = gr.Audio(label="Translated Audio",
                                    interactive=False,
                                    autoplay=True,
                                    elem_classes="audio")
            
            gr.Interface(
                fn=translate_speech,
                inputs=[input_audio, target_lang],
                outputs=[output_audio],
                live=True
            )

    voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
    speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)

# Script entry point: enable request queuing (max_size=200) and launch the app.
if __name__ == "__main__":
    demo.queue(max_size=200).launch()