File size: 15,865 Bytes
06ecff4
 
 
 
 
 
 
 
 
 
83498a5
67b7ca7
 
 
 
06ecff4
 
 
 
 
 
 
 
 
83498a5
67b7ca7
 
 
 
 
 
 
06ecff4
 
 
 
 
 
 
67b7ca7
06ecff4
 
 
 
67b7ca7
 
06ecff4
 
 
 
 
 
83498a5
 
06ecff4
 
 
 
 
 
 
 
67b7ca7
06ecff4
adf8038
67b7ca7
47eda48
280d8c3
 
 
 
 
702b99d
67b7ca7
 
 
 
 
 
e8c9642
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06ecff4
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
77def59
67b7ca7
06ecff4
67b7ca7
 
 
06ecff4
 
 
67b7ca7
06ecff4
67b7ca7
 
 
 
 
 
06ecff4
67b7ca7
 
 
 
 
 
09e512e
adf8038
 
09e512e
adf8038
 
09e512e
67b7ca7
 
ff4edf1
67b7ca7
 
 
 
83498a5
67b7ca7
 
77def59
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
09e512e
adf8038
 
09e512e
adf8038
 
09e512e
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff4edf1
a580d61
67b7ca7
 
 
 
ff4edf1
67b7ca7
 
 
 
 
 
 
 
 
06ecff4
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06ecff4
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8c9642
67b7ca7
 
b2dc487
67b7ca7
 
06ecff4
 
9f4b706
67b7ca7
 
 
 
 
 
9f4b706
67b7ca7
 
06ecff4
67b7ca7
 
 
e6925fa
67b7ca7
 
9f4b706
 
67b7ca7
 
 
 
 
06ecff4
 
e8c9642
67b7ca7
 
06ecff4
 
67b7ca7
06ecff4
e8c9642
 
 
06ecff4
 
67b7ca7
 
06ecff4
67b7ca7
3ceac67
67b7ca7
 
 
 
 
 
 
 
 
 
 
 
 
e8c9642
67b7ca7
 
 
e8c9642
 
 
06ecff4
67b7ca7
 
 
 
e8c9642
 
 
67b7ca7
 
 
 
e8c9642
 
67b7ca7
 
 
 
 
 
 
e8c9642
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
import speech_recognition as sr
from gtts import gTTS
import gradio as gr
from io import BytesIO
import numpy as np
from dataclasses import dataclass, field
import time
from pydub import AudioSegment
import librosa
from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
from PIL import Image
from ClassPrompt import PromptClass
import render

# Shared PromptClass instance; used below for chat replies, prompt expansion
# and image captioning.
creator_prompt = PromptClass()
# Shared speech recognizer used by transcribe_audio().
r = sr.Recognizer()

@dataclass
class AppState:
    """Per-session state passed between the Gradio event handlers."""

    # Audio samples accumulated from the microphone stream; None when empty.
    stream: np.ndarray | None = None
    # Sampling rate taken from the first streamed chunk.
    sampling_rate: int = 0
    # Result of the most recent determine_pause() call.
    pause_detected: bool = False
    # Set once VAD has found more than 0.5 s of speech in the current stream.
    started_talking: bool =  False
    # True when listening must not be restarted automatically.
    stopped: bool = False
    # Role/content messages sent to the chat model.
    history: list = field(default_factory=list)
    # True while the user is in keyboard (typing) mode.
    typing: bool = False
    # True while the assistant is in drawing mode.
    painting: bool = False
    # Last image produced by render.generate_images().
    image_out: Image.Image | None = None
    # Image supplied by the user through the UI.
    image_in: Image.Image | None = None
    # Messages shown in the chatbot widget (prefixed with "Bạn: " / "Bot: ").
    conversation: list = field(default_factory=list)
    # True while streamed microphone chunks should be accumulated.
    recording: bool = False
    # Amount of non-speech (seconds) in the stream that counts as a pause.
    pause_threshold: float = 1

def run_vad(ori_audio, sr):
    """Strip non-speech from an audio buffer using the project VAD helpers.

    Args:
        ori_audio: Audio samples as a NumPy array. Values are scaled by
            1/32768 before processing, so presumably int16 PCM.
        sr: Sampling rate of ``ori_audio`` in Hz. The name shadows the
            module-level ``speech_recognition`` alias inside this function;
            it is kept so existing callers keep working.

    Returns:
        ``(duration_after_vad, vad_audio_bytes, elapsed)`` where
        ``duration_after_vad`` is the length in seconds of the detected
        speech at 16 kHz, ``vad_audio_bytes`` is that speech as int16 bytes,
        and ``elapsed`` is the processing time in seconds (4 decimals).
        If anything fails, ``(-1, ori_audio, elapsed)`` is returned instead.
    """
    _st = time.time()
    try:
        target_rate = 16000
        samples = ori_audio.astype(np.float32) / 32768.0
        samples = librosa.resample(samples, orig_sr=sr, target_sr=target_rate)
        speech_chunks = get_speech_timestamps(samples, VadOptions())
        speech = collect_chunks(samples, speech_chunks)
        duration_after_vad = speech.shape[0] / target_rate
        # Clip before the cast: a resampled value at or above full scale would
        # otherwise wrap around when converted to int16.
        pcm = np.clip(np.round(speech * 32768.0), -32768, 32767).astype(np.int16)
        vad_audio_bytes = pcm.tobytes()
        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
    except Exception as e:
        # Best effort: report the failure instead of hiding it, then fall back
        # to the sentinel result callers already handle.
        print(f"VAD error: {e}")
        return -1, ori_audio, round(time.time() - _st, 4)

def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
    """Decide whether the speaker has paused.

    Runs VAD over the whole buffered stream. The first time more than 0.5 s
    of speech is found, ``state.started_talking`` is set and False is
    returned. Otherwise the non-speech time (total duration minus speech
    duration) is compared against ``state.pause_threshold``.
    """
    speech_seconds, _, vad_elapsed = run_vad(audio, sampling_rate)
    total_seconds = len(audio) / sampling_rate
    first_speech = speech_seconds > 0.5 and not state.started_talking
    if not first_speech:
        print(f"duration_after_vad: {speech_seconds:.3f} s, time_vad: {vad_elapsed:.3f} s")
        silence_seconds = total_seconds - speech_seconds
        return silence_seconds > state.pause_threshold
    print("started talking")
    state.started_talking = True
    return False

def process_audio(audio:tuple,state:AppState,image:Image.Image):
    """Accumulate one streamed microphone chunk and stop recording on a pause.

    Args:
        audio: ``(sampling_rate, samples)`` chunk from the audio component,
            or None when no chunk is available.
        state: Session state; its stream, image and pause flags are updated.
        image: Image currently present in the UI; stored on the state.

    Returns:
        ``(state, audio_update)``. ``audio_update`` is
        ``gr.Audio(recording=False)`` when a pause after speech was detected,
        otherwise None.
    """
    # Nothing to do while not recording, or when no chunk was delivered
    # (indexing a None chunk would raise in the first-chunk path below).
    if not state.recording or audio is None:
        return state, None
    if state.stream is None:
        # First chunk of this recording: it also fixes the sampling rate.
        state.stream = audio[1]
        state.sampling_rate = audio[0]
    else:
        try:
            state.stream = np.concatenate((state.stream, audio[1]))
        except Exception as e:
            print(f"Lỗi ghép luồng âm thanh: {e}")
            return state, None
    state.image_in = image
    state.pause_detected = determine_pause(state.stream, state.sampling_rate, state)
    if state.pause_detected and state.started_talking:
        # The user spoke and then paused: end this recording turn.
        state.started_talking = False
        state.recording = False
        return state, gr.Audio(recording=False)
    return state, None
  
def transcribe_audio(audio_segment):
    """Transcribe a pydub ``AudioSegment`` to Vietnamese text.

    The segment is exported to an in-memory WAV file and sent to the Google
    Web Speech API through the shared recognizer ``r``.

    Returns:
        The recognized text, or ``""`` when the audio could not be understood
        or the service request failed.
    """
    audio_buffer = BytesIO()
    audio_segment.export(audio_buffer, format="wav")
    audio_buffer.seek(0)
    try:
        with sr.AudioFile(audio_buffer) as source:
            # Deliberately no adjust_for_ambient_noise(): it reads (and thereby
            # discards) up to the first second of the file, and the energy
            # threshold it tunes is only used by listen(), not by record().
            recorded = r.record(source)
        return r.recognize_google(recorded, language='vi')
    except sr.UnknownValueError:
        print("Could not understand audio.")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
    return ""

def chat_with_onlinemodel(user_input, state:AppState):
    """Send the user's message to the online chat model and record the reply.

    The message is appended to ``state.history``, the full history is sent to
    the model, and the reply (with ``*`` characters removed) is stored in both
    the history and the displayed conversation.

    Returns:
        ``(reply_text, state)``.
    """
    state.history.append({"role": "user", "content": user_input})
    reply = creator_prompt.chat(
        provider="SambaNova",
        model="Meta-Llama-3.1-405B-Instruct",
        input_text=state.history,
    ).replace("*", "")
    state.history.append({"role": "assistant", "content": reply})
    state.conversation.extend([
        {"role": "user", "content": "Bạn: " + user_input},
        {"role": "assistant", "content": "Bot: " + reply},
    ])
    return reply, state

def synthesize_speech(text):
    """Convert text to Vietnamese speech with gTTS.

    Returns:
        The MP3 data as bytes, or None if synthesis failed (the error is
        printed).
    """
    try:
        with BytesIO() as buffer:
            gTTS(text, tld='com.vn', lang='vi', slow=False).write_to_fp(buffer)
            audio_bytes = buffer.getvalue()
    except Exception as e:
        print(f"Lỗi tổng hợp giọng nói: {e}")
        return None
    return audio_bytes

def response_audio(state:AppState, progress=gr.Progress(track_tqdm=True)):
    """Handle a finished voice recording and produce the spoken reply.

    The buffered stream is transcribed, then one of the following happens:
    a switch to keyboard mode, a switch between drawing and chat mode, an
    image generation (drawing mode), or a chat reply.

    Args:
        state: Session state holding the buffered audio and mode flags.
        progress: Gradio progress tracker forwarded to image generation.

    Returns:
        ``(state, audio)`` where ``audio`` is the synthesized speech, or None
        when there is nothing to respond to.
    """
    if not state.pause_detected and not state.started_talking:
        return state, None
    if state.stream is None:
        # No audio has been buffered (e.g. recording stopped before any chunk
        # arrived), so there is nothing to transcribe.
        return state, None
    audio_segment = AudioSegment(
        state.stream.tobytes(),
        frame_rate=state.sampling_rate,
        sample_width=state.stream.dtype.itemsize,
        channels=1 if state.stream.ndim == 1 else state.stream.shape[1]
    )
    textin = transcribe_audio(audio_segment)
    # The buffer has been consumed; the next recording starts from scratch.
    state.stream = None
    if state.typing is False:
        switched_to_keyboard, state = chuyen_trangthai(textin, state)
        if switched_to_keyboard:
            return state, synthesize_speech("chuyển sang trạng thái dùng bàn phím")
    if textin == "":
        # Transcription produced nothing usable.
        return state, synthesize_speech("Tôi nghe không rõ")
    was_painting = state.painting
    state.painting = text_check(textin, state.painting)
    if was_painting != state.painting:
        return state, synthesize_speech("Đã chuyển sang chế độ " + ("vẽ" if state.painting else "nói chuyện"))
    if state.painting is True:
        # Drawing mode: expand the spoken request into an image prompt and,
        # when an input image exists, prepend its caption.
        promptx = prompt_hugingface(textin,"Hugging Face","Qwen/Qwen2.5-72B-Instruct","Medium")
        imgtxt = ""
        if state.image_in:
            img = resize(state.image_in)
            imgtxt = creator_prompt.img2text(img)
        else:
            img = None
        state.image_out = render.generate_images(imgtxt+promptx,img,progress)
        audio_bytes = synthesize_speech("Bạn thấy tôi vẽ "+textin+" có đẹp không")
        return state, audio_bytes
    print("Đang nghĩ...")
    text_out, state = chat_with_onlinemodel(textin,state)
    audio_bytes = synthesize_speech(text_out)
    return state, audio_bytes

def response_text(state:AppState,textin,image:Image, prompt,progress=gr.Progress(track_tqdm=True)):
    """Handle a typed request and produce the spoken reply.

    Depending on the text and the current mode this switches back to voice
    mode, toggles between drawing and chat mode, generates an image, or asks
    the chat model for a reply.

    Args:
        state: Session state with the mode flags and conversation.
        textin: Text typed by the user; used as the image prompt when drawing.
        image: Optional input image from the UI.
        prompt: Short description spoken back after an image is generated.
        progress: Gradio progress tracker forwarded to image generation.

    Returns:
        ``(state, audio)`` where ``audio`` is the synthesized speech.
    """
    if state.typing is True:
        still_typing, state = chuyen_trangthai(textin, state)
        if not still_typing:
            return state, synthesize_speech("chuyển sang trạng thái nói")
    if textin == "":
        # Nothing was typed: ask the user to enter some text.
        return state, synthesize_speech("Hãy gõ nội dung")
    was_painting = state.painting
    state.painting = text_check(textin, was_painting)
    if state.painting != was_painting:
        mode_name = "vẽ" if state.painting else "nói chuyện"
        return state, synthesize_speech("Đã chuyển sang chế độ " + mode_name)
    if state.painting is not True:
        print("Đang nghĩ...")
        text_out, state = chat_with_onlinemodel(textin, state=state)
        return state, synthesize_speech(text_out)
    # Drawing mode: log the request, caption the input image if there is one,
    # then generate the picture.
    state.conversation.append({"role": "user", "content":"Bạn: " + textin})
    img, imgtxt = None, ""
    if image:
        img = resize(image)
        imgtxt = creator_prompt.img2text(img)
    state.image_out = render.generate_images(imgtxt + textin, img, progress)
    return state, synthesize_speech("Bạn thấy tôi vẽ "+prompt+" có đẹp không")

def text_check(textin, painting):
    """Return the drawing-mode flag implied by ``textin``.

    While drawing, the mode stays on unless the text asks to switch to chat
    mode; while chatting, it turns on only when the text asks to switch to
    drawing mode.
    """
    if painting:
        return "sang chế độ nói" not in textin
    return "sang chế độ vẽ" in textin

def chuyen_trangthai(textin, state:AppState):
    """Switch between voice and keyboard mode when the text asks for it.

    Returns:
        ``(typing, state)`` where ``typing`` is True for keyboard mode and
        False for voice mode. When the text contains neither trigger phrase
        the state is left untouched and its current ``typing`` flag is
        returned.
    """
    if "muốn nói chuyện" in textin:
        use_keyboard = False
    elif "dùng bàn phím" in textin:
        use_keyboard = True
    else:
        return state.typing, state
    # Keyboard mode stops listening; voice mode re-enables it.
    state.started_talking = False
    state.recording = not use_keyboard
    state.stopped = use_keyboard
    state.typing = use_keyboard
    return use_keyboard, state
    
def start_recording_user(state:AppState):
    """Mark the session as recording when the user starts the microphone.

    Clears the stopped flag so listening can restart automatically later.

    Returns:
        ``(gr.Audio(recording=True), state)``.
    """
    state.recording = True
    state.started_talking = False
    state.stopped = False
    return gr.Audio(recording=True), state
    
def restart_recording(state:AppState):
    """Resume listening after the reply has played, unless stopped.

    Returns:
        ``(audio_update, state)`` where the audio component is set to record
        again only when ``state.stopped`` is False.
    """
    keep_listening = not state.stopped
    state.started_talking = False
    state.recording = keep_listening
    return gr.Audio(recording=keep_listening), state

def prompt_hugingface(prompt,llm_provider,model,type):
    """Expand a short description into an image prompt via ``creator_prompt``.

    Args:
        prompt: Text to expand.
        llm_provider: Name of the LLM provider to use.
        model: Model identifier for that provider.
        type: Prompt style passed through as ``prompt_type``.

    Returns:
        Whatever ``creator_prompt.generate`` returns.
    """
    generation_args = {
        "input_text": prompt,
        "long_talk": True,
        "compress": True,
        "compression_level": "hard",
        "poster": False,
        "prompt_type": type,
        "custom_base_prompt": "",
        "provider": llm_provider,
        "model": model,
    }
    return creator_prompt.generate(**generation_args)

def resize(img:Image.Image):
    """Return ``img`` resized so both sides are rounded down to a multiple of 8."""
    width, height = (side - side % 8 for side in (img.width, img.height))
    return img.resize((width, height))
    
# NOTE(review): neither `loaded` nor `steps` is referenced anywhere else in the
# visible part of this file — confirm they are still needed.
loaded = ""
steps = 50

def update_model_choices(provider):
    """Build the model dropdown for the selected LLM provider.

    Returns:
        A ``gr.Dropdown`` listing the provider's models with the first one
        selected; for an unknown provider the list is empty and the value
        is ``""``.
    """
    provider_models = {
        "Hugging Face": [
            "Qwen/Qwen2.5-72B-Instruct",
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "mistralai/Mistral-7B-Instruct-v0.3"
        ],
        "SambaNova": [
            "Meta-Llama-3.1-70B-Instruct",
            "Meta-Llama-3.1-405B-Instruct",
            "Meta-Llama-3.1-8B-Instruct"
        ],
    }
    try:
        models = provider_models[provider]
    except KeyError:
        models = []
    default_model = models[0] if models else ""
    return gr.Dropdown(choices=models, value=default_model)
# Prompt styles offered in the "Phong cách" dropdown.
prompt_types = ["Long", "Short", "Medium", "OnlyObjects", "NoFigure", "Landscape", "Fantasy"]
# Page title and usage instructions shown at the top of the app.
title = "Chat tiếng việt by tuphamkts"
description = "Muốn vẽ nói: Chuyển sang chế độ vẽ. Muốn chat nói: Chuyển sang chế độ nói. Chế độ gõ: Tôi muốn dùng bàn phím, chế độ nói: Tôi muốn nói chuyện. Ghi chú: Chỉ dừng chương trình khi tôi đang nói (lịch sử chat sẽ bị xóa khi dừng chương trình)."
# Example commands. Each must contain the exact trigger phrase matched by
# text_check() / chuyen_trangthai(), otherwise selecting it has no effect.
examples = ["Chuyển sang chế độ vẽ","Chuyển sang chế độ nói","Tôi muốn nói chuyện","Tôi muốn dùng bàn phím"]
# Gradio UI: layout first, then the event wiring between the components and
# the handler functions defined above.
with gr.Blocks(title=title) as demo:
    gr.HTML(f"<div style='text-align: center;'><h1>{title}</h1><p>{description}</p></div>")
    with gr.Row():
        with gr.Column():
            # Prompt-builder panel; shown only in drawing + keyboard mode.
            with gr.Column(visible=False) as prompt_visible:
                with gr.Row():
                    llm_provider = gr.Dropdown(choices=["Hugging Face", "SambaNova"], label="Nguồn model", value="Hugging Face")
                    model = gr.Dropdown(label="Chọn Model", choices=["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3.1-70B-Instruct","mistralai/Mixtral-8x7B-Instruct-v0.1","mistralai/Mistral-7B-Instruct-v0.3"], value="Qwen/Qwen2.5-72B-Instruct")
                    prompt_type = gr.Dropdown(choices=prompt_types, label="Phong cách", value="Medium", interactive=True)
                # No trailing comma here: it would turn the component into a
                # 1-tuple instead of a Textbox.
                input_prompt = gr.Textbox(label="Nhập nội dung muốn vẽ",value="Một cô gái", type="text")
                generate_prompt = gr.Button("Tạo Prompt", variant="stop")
            # Text-entry panel; shown only in keyboard mode.
            with gr.Column(visible=False) as typing_visible:
                input_text = gr.Textbox(label="Nhập nội dung trao đổi", type="text")
                submit = gr.Button("Áp dụng", variant="stop")
            input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
            output_audio = gr.Audio(label="Trợ lý", autoplay=True, sources=None,type="numpy")
            input_image = gr.Image(label="Hình ảnh của bạn", sources=["upload","clipboard","webcam"], type="pil",visible=True)
        with gr.Column(visible=False) as image_visible:
            output_image = gr.Image(label="Hình ảnh sau xử lý", sources=None, type="pil",visible=True)
        with gr.Column(visible=True) as chatbot_visible:
            chatbot = gr.Chatbot(label="Nội dung trò chuyện",type="messages")
    state = gr.State(value=AppState())

    # Microphone started by the user: mark the session as recording.
    startrecord = input_audio.start_recording(
        start_recording_user,
        [state],
        [input_audio, state],
    )
    # Each streamed chunk is accumulated and checked for a pause.
    stream = input_audio.stream(
        process_audio,
        [input_audio,state,input_image],
        [state,input_audio],
        stream_every=1,
        time_limit=30,
    )

    # Recording finished: transcribe and answer, then refresh chat and image.
    respond = input_audio.stop_recording(
        response_audio,
        [state],
        [state, output_audio],
    )
    respond.then(lambda s: s.conversation, [state], [chatbot])
    respond.then(lambda s: s.image_out, [state], [output_image])

    # Reply playback finished: resume listening and show the panels that
    # match the current mode.
    restart = output_audio.stop(
        restart_recording,
        [state],
        [input_audio, state],
    )
    restart.then(lambda s: gr.update(visible= not s.typing, recording = not s.typing), [state], [input_audio])
    restart.then(lambda s: gr.update(visible=s.typing), [state], [typing_visible])
    restart.then(lambda s: gr.update(visible=s.painting), [state], [image_visible])
    restart.then(lambda s: gr.update(visible=s.painting and s.typing), [state], [prompt_visible])
    restart.then(lambda s: gr.update(visible= not s.painting), [state], [chatbot_visible])

    cancel = gr.Button("Dừng chương trình", variant="stop", interactive=False)
    stream.then(lambda s: gr.update(interactive= not s.stopped), [state], [cancel])
    # Stop button: reset the session state and cancel the running events.
    cancel.click(
        lambda: (AppState(stopped=True, recording=False, started_talking = False), gr.Audio(recording=False), gr.update(interactive=False)),
        None,[state, input_audio, cancel],
        cancels=[respond, stream, startrecord, restart]
    )

    # Keyboard mode: submit the typed text, then refresh chat and image.
    sub = submit.click(
        response_text,
        [state, input_text, input_image, input_prompt],
        [state, output_audio],
    )
    sub.then(lambda s: s.conversation, [state], [chatbot])
    sub.then(lambda s: s.image_out, [state], [output_image])

    # Expand the short description into a full prompt in the text box.
    generator = generate_prompt.click(
        prompt_hugingface,
        [input_prompt,llm_provider,model,prompt_type],
        [input_text]
    )

    # Keep the model list in sync with the selected provider.
    llm_provider.change(
        update_model_choices,
        [llm_provider],
        [model]
    )
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )
    
# Launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()