Spaces:

phamngoctukts
/

assistant

Sleeping

App Files Files Community

phamngoctukts committed on Nov 24

Commit

67b7ca7

•

1 Parent(s): 4b2dda3

Upload 3 files

Browse files

Files changed (3) hide show

ClassPrompt.py +305 -0
app.py +287 -144
render.py +94 -0

ClassPrompt.py ADDED Viewed

	@@ -0,0 +1,305 @@

+import random  # Import the random module
+from groq import Groq
+from openai import OpenAI
+import os
+class PromptClass:
+    def __init__(self):
+        self.huggingface_token = os.environ.get("HF_TOKEN")
+        self.groq_api_key = os.environ.get("GROQ_TOKEN")
+        self.sambanova_api_key = os.environ.get("SAMBANOVA_TOKEN")
+        self.huggingface_client = OpenAI(
+            base_url="https://api-inference.huggingface.co/v1/",
+            api_key=self.huggingface_token,
+        )
+        self.groq_client = Groq(api_key=self.groq_api_key)
+        self.sambanova_client = OpenAI(
+            api_key=self.sambanova_api_key,
+            base_url="https://api.sambanova.ai/v1",
+        )
+        self.download_models()
+    def download_models(self):
+        from huggingface_hub import hf_hub_download
+        hf_hub_download(
+            repo_id="stabilityai/stable-diffusion-3.5-large",
+            filename="mmdit.png",
+            local_dir = "./models",
+            token = self.huggingface_token
+        )
+        hf_hub_download(
+            repo_id="stabilityai/stable-diffusion-3.5-large-turbo",
+            filename="LICENSE.md",
+            local_dir = "./models",
+            token = self.huggingface_token
+        )
+    def generate_prompt(self, dynamic_seed, prompt_type, custom_input):
+        """
+        Generates a prompt based on the provided seed, prompt type, and custom input.
+        """
+        random.seed(dynamic_seed)
+        if custom_input and custom_input.strip():
+            prompt = custom_input
+        else:
+            prompt = f"Create a random prompt based on the '{prompt_type}' type."
+        # Additional logic can be added here if needed
+        print(f"Generated prompt: {prompt}")  # Debug statement
+        return prompt
+    def generate(
+        self,
+        input_text,
+        long_talk,
+        compress,
+        compression_level,
+        poster,
+        prompt_type,
+        custom_base_prompt="",
+        provider="Hugging Face",
+        model=None,
+    ):
+        try:
+            # Define prompts
+            default_long_prompt = """Create a detailed visually descriptive caption of this description,
+which will be used as a prompt for a text to image AI system (caption only, no instructions like "create an image").
+Remove any mention of digital artwork or artwork style. Give detailed visual descriptions of the character(s), including ethnicity, skin tone, expression etc.
+Imagine using keywords for a still for someone who has aphantasia. Describe the image style, e.g., any photographic or art styles/techniques utilized.
+Make sure to fully describe all aspects of the cinematography, with abundant technical details and visual descriptions.
+If there is more than one image, combine the elements and characters from all of the images creatively into a single
+cohesive composition with a single background, inventing an interaction between the characters.
+Be creative in combining the characters into a single cohesive scene.
+Focus on two primary characters (or one) and describe an interesting interaction between them, such as a hug, a kiss, a fight, giving an object,
+an emotional reaction/interaction. If there is more than one background in the images, pick the most appropriate one.
+Your output is only the caption itself, no comments or extra formatting.
+The caption is in a single long paragraph.
+If you feel the images are inappropriate, invent a new scene/characters inspired by these.
+Additionally, incorporate a specific movie director's visual style and describe the lighting setup in detail,
+including the type, color, and placement of light sources to create the desired mood and atmosphere.
+Always frame the scene, including details about the film grain, color grading, and any artifacts or characteristics specific."""
+            default_simple_prompt = """Create a brief, straightforward caption for this description, suitable for a text-to-image AI system.
+Focus on the main elements, key characters, and overall scene without elaborate details.
+Provide a clear and concise description in one or two sentences. Your output is only the caption itself, no comments or extra formatting.
+The caption is in a single long paragraph."""
+            poster_prompt = """Analyze the provided description and extract key information to create a movie poster style description. Format the output as follows:
+Title: A catchy, intriguing title that captures the essence of the scene, place the title in "".
+Main character: Give a description of the main character.
+Background: Describe the background in detail.
+Supporting characters: Describe the supporting characters.
+Branding type: Describe the branding type.
+Tagline: Include a tagline that captures the essence of the movie.
+Visual style: Ensure that the visual style fits the branding type and tagline.
+You are allowed to make up film and branding names, and do them like 80's, 90's or modern movie posters.
+Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
+            only_objects_prompt = """Create a highly detailed and visually rich description focusing solely on inanimate objects,
+without including any human or animal figures. Describe the objects' shapes, sizes, colors, textures, and materials in great detail.
+Pay attention to their arrangement, positioning, and how they interact with light and shadow. Include information about the setting
+or environment these objects are in, such as indoor/outdoor, time of day, weather conditions, and any atmospheric effects.
+Mention any unique features, patterns, or imperfections on the objects. Describe the overall composition, perspective, and
+any artistic techniques that might be employed to render these objects (e.g., photorealism, impressionistic style, etc.).
+Your description should paint a vivid picture that allows someone to imagine the scene without seeing it, focusing on the beauty,
+complexity, or significance of everyday objects. Your output is only the caption itself, no comments or extra formatting.
+The caption is in a single long paragraph."""
+            no_figure_prompt = """Generate a comprehensive and visually evocative description of a scene
+or landscape without including any human or animal figures. Focus on the environment, natural elements, and man-made structures if present.
+Describe the topography, vegetation, weather conditions, and time of day in great detail.
+Pay attention to colors, textures, and how light interacts with different elements of the scene.
+If there are buildings or other structures, describe their architecture, condition, and how they fit into the landscape.
+Include sensory details beyond just visual elements - mention sounds, smells, and the overall atmosphere or mood of the scene.
+Describe any notable features like bodies of water, geological formations, or sky phenomena.
+Consider the perspective from which the scene is viewed and how this affects the composition.
+Your description should transport the reader to this location, allowing them to vividly imagine the scene without any living subjects present.
+ Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
+            landscape_prompt = """Create an immersive and detailed description of a landscape,
+focusing on its natural beauty and geographical features.
+Begin with the overall topography - is it mountainous, coastal, forested, desert, or a combination?
+Describe the horizon and how land meets sky. Detail the vegetation, noting types of trees, flowers, or grass,
+and how they're distributed across the landscape. Include information about any water features -
+rivers, lakes, oceans - and how they interact with the land. Describe the sky, including cloud formations,
+color gradients, and any celestial bodies visible.
+Pay attention to the quality of light, time of day, and season, explaining how these factors affect the colors and shadows in the scene.
+Include details about weather conditions and how they impact the landscape.
+Mention any geological features like rock formations, cliffs, or unique land patterns.
+If there are any distant man-made elements, describe how they integrate with the natural setting.
+Your description should capture the grandeur and mood of the landscape,
+allowing the reader to feel as if they're standing within this awe-inspiring natural scene.
+Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
+            fantasy_prompt = """Craft an extraordinarily detailed and imaginative description of a fantasy scene,
+blending elements of magic, otherworldly creatures, and fantastical environments. Begin by setting the overall tone -
+is this a dark and foreboding realm, a whimsical fairytale setting, or an epic high-fantasy world?
+Describe the landscape, including any impossible or magical geographical features like floating islands,
+crystal forests, or rivers of starlight. Detail the flora and fauna,
+focusing on fantastical plants and creatures that don't exist in our world.
+Include descriptions of any structures or ruins, emphasizing their otherworldly architecture and magical properties.
+Describe the sky and any celestial bodies, considering how they might differ from our reality.
+Include details about the presence of magic - how it manifests visually,
+its effects on the environment, and any magical phenomena occurring in the scene.
+If there are characters present, describe their appearance, focusing on non-human features, magical auras, or
+fantastical clothing and accessories. Pay attention to colors, textures, and light sources,
+especially those that couldn't exist in the real world. Your description should transport the
+reader to a realm of pure imagination, where the laws of physics and nature as we know them don't apply.
+Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
+            prompt_types = {
+                "Long": default_long_prompt,
+                "Short": default_simple_prompt,
+                "Medium": poster_prompt,
+                "OnlyObjects": only_objects_prompt,
+                "NoFigure": no_figure_prompt,
+                "Landscape": landscape_prompt,
+                "Fantasy": fantasy_prompt,
+            }
+            # Determine the base prompt
+            print(f"Received prompt_type: '{prompt_type}'")  # Debug print
+            if prompt_type == "Random":
+                prompt_type = random.choice(list(prompt_types.keys()))
+                print(f"Randomly selected prompt type: {prompt_type}")
+            if prompt_type and prompt_type.strip() and prompt_type in prompt_types:
+                base_prompt = prompt_types[prompt_type]
+                print(f"Using {prompt_type} prompt")
+            elif custom_base_prompt.strip():
+                base_prompt = custom_base_prompt
+                print("Using custom base prompt")
+            else:
+                base_prompt = default_long_prompt
+                print(f"Warning: Unknown or empty prompt type '{prompt_type}'. Using default long prompt.")
+            # Handle compression if applicable
+            if compress and not poster:
+                compression_chars = {
+                    "soft": 600 if long_talk else 300,
+                    "medium": 400 if long_talk else 200,
+                    "hard": 200 if long_talk else 100,
+                }
+                char_limit = compression_chars.get(compression_level, 200)
+                base_prompt += f" Compress the output to be concise while retaining key visual details. MAX OUTPUT SIZE no more than {char_limit} characters."
+            # Construct messages for the LLM
+            system_message = "You are a helpful assistant. Try your best to give the best response possible to the user."
+            if input_text.startswith("Create a random prompt based on"):
+                user_message = f"Create a random description based on this\nInstructions: {base_prompt}"
+            else:
+                user_message = f"{base_prompt}\nDescription: {input_text}"
+            # Generate a random seed
+            seed = random.randint(0, 10000)
+            print(f"Generated seed: {seed}")  # Debug print
+            # Select the appropriate provider
+            if provider == "Hugging Face":
+                response = self.huggingface_client.chat.completions.create(
+                    model=model or "meta-llama/Meta-Llama-3.1-70B-Instruct",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    top_p=0.95,
+                    messages=[
+                        {"role": "system", "content": system_message},
+                        {"role": "user", "content": user_message},
+                    ],
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            elif provider == "Groq":
+                response = self.groq_client.chat.completions.create(
+                    model=model or "llama-3.1-70b-versatile",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    messages=[
+                        {"role": "system", "content": system_message},
+                        {"role": "user", "content": user_message},
+                    ],
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            elif provider == "SambaNova":
+                response = self.sambanova_client.chat.completions.create(
+                    model=model or "Meta-Llama-3.1-70B-Instruct",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    messages=[
+                        {"role": "system", "content": system_message},
+                        {"role": "user", "content": user_message},
+                    ],
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            else:
+                raise ValueError(f"Unsupported provider: {provider}")
+            # Clean up the output if necessary
+            if ": " in output:
+                output = output.split(": ", 1)[1].strip()
+            elif output.lower().startswith("here"):
+                sentences = output.split(". ")
+                if len(sentences) > 1:
+                    output = ". ".join(sentences[1:]).strip()
+            return output
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            return f"Error occurred while processing the request: {str(e)}"
+    def chat(self,provider="Hugging Face",model=None,input_text=None):
+        seed = random.randint(0, 10000)
+        if input_text != "":
+            # Select the appropriate provider
+            if provider == "Hugging Face":
+                response = self.huggingface_client.chat.completions.create(
+                    model=model or "meta-llama/Meta-Llama-3.1-70B-Instruct",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    top_p=0.95,
+                    messages=input_text,
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            elif provider == "Groq":
+                response = self.groq_client.chat.completions.create(
+                    model=model or "llama-3.1-70b-versatile",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    messages=[
+                        {"role": "system", "content": "You are a helpful assistant"},
+                        {"role": "user", "content": input_text},
+                    ],
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            elif provider == "SambaNova":
+                response = self.sambanova_client.chat.completions.create(
+                    model=model or "Meta-Llama-3.1-70B-Instruct",
+                    max_tokens=1024,
+                    temperature=1.0,
+                    messages=[
+                        {"role": "system", "content": "You are a helpful assistant"},
+                        {"role": "user", "content": input_text},
+                    ],
+                    seed=seed  # Pass the seed parameter
+                )
+                output = response.choices[0].message.content.strip()
+            else:
+                raise ValueError(f"Unsupported provider: {provider}")
+            # Clean up the output if necessary
+            if ": " in output:
+                output = output.split(": ", 1)[1].strip()
+            elif output.lower().startswith("here"):
+                sentences = output.split(". ")
+                if len(sentences) > 1:
+                    output = ". ".join(sentences[1:]).strip()
+            return output

app.py CHANGED Viewed

@@ -5,29 +5,36 @@ from io import BytesIO
 import numpy as np
 from dataclasses import dataclass, field
 import time
-import traceback
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
-from huggingface_hub import InferenceClient
 import os
 from PIL import Image
-from threading import Thread
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 r = sr.Recognizer()
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
-    image: dict = field(default_factory=dict)
     sampling_rate: int = 0
     pause_detected: bool = False
     started_talking: bool =  False
     stopped: bool = False
-    message: dict = field(default_factory=dict)
     history: list = field(default_factory=list)
-    conversation: list = field(default_factory=list)
-    textout: str = ""
 def run_vad(ori_audio, sr):
     _st = time.time()
@@ -35,24 +42,17 @@ def run_vad(ori_audio, sr):
         audio = ori_audio
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
         vad_parameters = {}
         vad_parameters = VadOptions(**vad_parameters)
         speech_chunks = get_speech_timestamps(audio, vad_parameters)
         audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-        if sr != sampling_rate:
-            # resample to original sampling rate
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
         vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
         vad_audio_bytes = vad_audio.tobytes()
         return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
     except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
-        print(msg)
         return -1, ori_audio, round(time.time() - _st, 4)
 def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
@@ -65,157 +65,300 @@ def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
         state.started_talking = True
         return False
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-    return (duration - dur_vad) > 1
-def process_audio(audio:tuple, image: Image, state:AppState):
-    if audio is None:
-        print("Lỗi: audio là None. Kiểm tra nguồn âm thanh.")
-        # Xử lý lỗi, ví dụ: thoát chương trình hoặc sử dụng giá trị mặc định cho audio
-    else:
-        try:
-            if state.stream is None:
-                state.stream = audio[1]
-                state.sampling_rate = audio[0]
-            else:
-                state.stream =  np.concatenate((state.stream, audio[1]))
-        except IndexError:
-            print("Lỗi: Chỉ mục vượt quá giới hạn của audio. Kiểm tra kích thước của audio.")
-    if image is None:
-        state.image = {"file":""}
-    else:
-        state.image = {"file":str(image)}
-    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-    if state.pause_detected and state.started_talking:
-        return gr.Audio(recording=False), state
-    return None, state
-def response(state:AppState = AppState()):
-    max_new_tokens = 1024
     if not state.pause_detected and not state.started_talking:
-        return None, AppState()
-    audio_buffer = BytesIO()
-    segment = AudioSegment(
         state.stream.tobytes(),
         frame_rate=state.sampling_rate,
         sample_width=state.stream.dtype.itemsize,
-        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
     )
-    segment.export(audio_buffer, format="wav")
-    textin = ""
-    with sr.AudioFile(audio_buffer) as source:
-        audio_data=r.record(source)
-        try:
-            textin=r.recognize_google(audio_data,language='vi')
-        except:
-            textin = ""
-        #state.conversation.append({"role": "user", "content": "Bạn: " + textin})
-    textout = ""
     if textin != "":
-        print("Đang nghĩ...")
-        state.message = {}
-        state.message={"text": textin,"files": state.image["file"]}
-        # phần phiên dịch
-        txt = state.message["text"]
-        messages= []
-        images = []
-        for i, msg in enumerate(state.history):
-            if isinstance(msg[0], tuple):
-                messages.append({"role": "user", "content": [{"type": "text", "text": state.history[i][0]}, {"type": "image"}]})
-                messages.append({"role": "assistant", "content": [{"type": "text", "text": state.history[i][1]}]})
-                images.append(Image.open(msg[0][0]).convert("RGB"))
-            elif isinstance(state.history[i], tuple) and isinstance(msg[0], str):
-                # messages are already handled
-                pass
-            elif isinstance(state.history[i][0], str) and isinstance(msg[0], str): # text only turn
-                messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
-                messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-        # add current message
-        if state.message["files"] != "": # examples
-            image = Image.open(state.message["files"]).convert("RGB")
-            images.append(image)
-            messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
-        else: # regular input
-            messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-        token = ""
-        try:
-            for message in client.chat_completion(
-            messages,
-            max_tokens=max_new_tokens,
-            stream=True,
-            temperature=1.0,
-            top_p=0.9,
-            ):
-                token += message.choices[0].delta.content
-            textout=token
-        except:
-            print("Chưa lấy được thông tin dịch")
-        if state.message["files"] != "":
-            state.history.append([(txt,state.image["file"]),textout])
-            state.conversation.append({"role":"user","content":"Bạn: " + str(txt) + str(state.image["file"])})
-            state.conversation.append({"role":"assistant", "content": "Bot: " + str(textout)})
         else:
-            state.history.append([txt,textout])
-            state.conversation.append({"role": "user", "content":"Bạn: " + str(txt)})
-            state.conversation.append({"role": "assistant", "content":"Bot: " + str(textout)})
     else:
-        textout = "Tôi không nghe rõ"
-    #phần đọc chữ đã dịch
-    ssr = state.stream.tobytes()
-    print("Đang đọc...")
-    try:
-        mp3 = gTTS(textout,tld='com.vn',lang='vi',slow=False)
-        mp3_fp = BytesIO()
-        mp3.write_to_fp(mp3_fp)
-        srr=mp3_fp.getvalue()
-    except:
-        print("Lỗi không đọc được")
-    finally:
-        mp3_fp.close()
-    yield srr, AppState(conversation=state.conversation, history=state.history)
-def start_recording_user(state:AppState):  # Sửa lỗi tại đây
-    if not state.stopped:
-        return gr.Audio(recording=True)
-title = "vietnamese by tuphamkts"
-description = "A vietnamese text-to-speech demo."
-with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
-            input_image = gr.Image(label="Hình ảnh của bạn", sources="upload", type="filepath")
-        with gr.Column():
-            chatbot = gr.Chatbot(label="Nội dung trò chuyện", type="messages")
-            output_audio = gr.Audio(label="Trợ lý", autoplay=True)
-    with gr.Row():
-        output_image = gr.Image(label="Hình ảnh sau xử lý", sources="clipboard", type="filepath",visible=False)
-    state = gr.State(value=AppState())
     stream = input_audio.stream(
         process_audio,
-        [input_audio, input_image, state],
-        [input_audio, state],
-        stream_every=0.50,
         time_limit=30,
     )
     respond = input_audio.stop_recording(
-        response,
-        [state],
-        [output_audio, state],
     )
     respond.then(lambda s: s.conversation, [state], [chatbot])
-    #respond.then(lambda s: s.image, [state], [output_image])
     restart = output_audio.stop(
-        start_recording_user,
         [state],
-        [input_audio],
     )
-    cancel = gr.Button("Stop Conversation", variant="stop")
-    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
-                [state, input_audio], cancels=[respond, restart])
-demo.launch()

 import numpy as np
 from dataclasses import dataclass, field
 import time
 from pydub import AudioSegment
 import librosa
 from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+from huggingface_hub import login, hf_hub_download
 import os
 from PIL import Image
+from ClassPrompt import PromptClass
+import render
+creator_prompt = PromptClass()
 r = sr.Recognizer()
 @dataclass
 class AppState:
+    # Per-session state shared by the audio/text handlers in this file.
+    # Audio samples accumulated from the microphone stream (None until the first chunk).
     stream: np.ndarray | None = None
+    # Sampling rate taken from the first audio chunk.
     sampling_rate: int = 0
+    # Result of the latest determine_pause() call.
     pause_detected: bool = False
     started_talking: bool =  False
     stopped: bool = False
+    # Chat history as {"role", "content"} dicts sent to the language model.
     history: list = field(default_factory=list)
+    # Mode flag checked by the handlers (keyboard vs. voice input).
+    typing: bool = False
+    # True while requests are routed to image generation instead of chat.
+    painting:bool = False
+    # Most recently generated image.
+    image_out:Image.Image = None
+    # Input image captured from the UI. NOTE(review): `Image` here is the PIL
+    # module, not the Image.Image class - confirm the intended annotation.
+    image_in:Image = None
+    # Conversation entries stored with "Bạn: " / "Bot: " prefixes.
+    conversation:list = field(default_factory=list)
+    recording: bool = False  # Added attribute: audio chunks are only processed while True
+    pause_threshold: float = 1  # Added attribute: threshold compared against (duration - speech duration)
+    # Values copied from the UI inputs on each processed audio chunk.
+    strength: float = 1.0
+    ckpt:list = field(default_factory=list)
+    guidance: float = 8
 def run_vad(ori_audio, sr):
     _st = time.time()
         audio = ori_audio
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
         vad_parameters = {}
         vad_parameters = VadOptions(**vad_parameters)
         speech_chunks = get_speech_timestamps(audio, vad_parameters)
         audio = collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate # Khai báo và tính toán duration_after_vad
+        vad_audio = audio
         vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
         vad_audio_bytes = vad_audio.tobytes()
         return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
     except Exception as e:
         return -1, ori_audio, round(time.time() - _st, 4)
 def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool:
         state.started_talking = True
         return False
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > state.pause_threshold # Sử dụng state.pause_threshold
+def process_audio(audio:tuple,state:AppState,image:Image, streng:float,ckpt,guidance):
+    if state.recording:  # Kiểm tra state.stream:
+        time.sleep(0.1)
+        if state.stream is None:
+            state.stream = audio[1]
+            state.sampling_rate = audio[0]
+        else:
+            state.stream = np.concatenate((state.stream, audio[1]))
+        state.image_in=image
+        state.strength=streng
+        state.ckpt=ckpt
+        state.guidance=guidance
+        pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+        state.pause_detected = pause_detected
+        if state.pause_detected and state.started_talking:
+            state.recording = False
+            return state, gr.Audio(recording=False)
+    return state, None
+def transcribe_audio(audio_segment):
+    audio_buffer = BytesIO()
+    audio_segment.export(audio_buffer, format="wav")
+    audio_buffer.seek(0)
+    try:
+        with sr.AudioFile(audio_buffer) as source:
+            r.adjust_for_ambient_noise(source)
+            text = r.recognize_google(r.record(source), language='vi')
+            return text
+    except sr.UnknownValueError:
+        print("Could not understand audio.")
+    except sr.RequestError as e:
+        print(f"Could not request results from Google Speech Recognition service; {e}")
+    return ""
+def chat_with_onlinemodel(user_input, state:AppState):
+    state.history.append({"role": "user", "content": user_input})
+    response = creator_prompt.chat(provider="SambaNova", model="Meta-Llama-3.1-405B-Instruct", input_text=state.history)
+    bot_response = response
+    characters = bot_response.replace("*","")
+    state.history.append({"role": "assistant", "content": characters})
+    state.conversation.append({"role": "user", "content":"Bạn: " + user_input})
+    state.conversation.append({"role": "assistant", "content":"Bot: " + characters})
+    return characters, state
+def synthesize_speech(text):
+    """Chuyển đổi text sang giọng nói bằng gTTS."""
+    try:
+        mp3 = gTTS(text, tld='com.vn', lang='vi', slow=False)
+        mp3_fp = BytesIO()
+        mp3.write_to_fp(mp3_fp)
+        audio_bytes = mp3_fp.getvalue()
+        mp3_fp.close()
+        return audio_bytes # Chỉ trả về audio_bytes
+    except Exception as e:
+        print(f"Lỗi tổng hợp giọng nói: {e}")
+        return None
+def response_audio(state:AppState):
+    """Handle a finished voice request and build the spoken reply.
+
+    Transcribes the buffered audio, then either switches mode, generates an
+    image (painting mode) or chats with the online model.
+
+    Returns:
+        ``(state, audio_bytes)``; audio_bytes is None when there is nothing to
+        process or when speech synthesis failed.
+    """
+    # Nothing was said yet: no reply.
     if not state.pause_detected and not state.started_talking:
+        return state, None
+    textin=""
+    # Wrap the raw samples so they can be exported as WAV for recognition.
+    audio_segment = AudioSegment(
         state.stream.tobytes(),
         frame_rate=state.sampling_rate,
         sample_width=state.stream.dtype.itemsize,
+        channels=1 if state.stream.ndim == 1 else state.stream.shape[1]
     )
+    textin = transcribe_audio(audio_segment)
+    # Clear the buffer so the next recording starts fresh.
+    state.stream = None
+    if state.typing is False:
+        # chuyen_trangthai is defined outside this view; presumably it detects
+        # a spoken request to switch input mode - TODO confirm.
+        txt,state = chuyen_trangthai(textin, state)
+        if txt == True:
+            return state, synthesize_speech("chuyển sang trạng thái dùng bàn phím")
     if textin != "":
+        paint=state.painting
+        # text_check is defined outside this view; its result replaces the painting flag.
+        state.painting = text_check(textin, state.painting)
+        # The flag changed: announce the new mode instead of answering.
+        if paint != state.painting:
+            return state, synthesize_speech("Đã chuyển sang chế độ " + ("vẽ" if state.painting else "nói chuyện"))
+        if state.painting is True:
+            # NOTE(review): promptx is never used below; the image is generated from textin.
+            promptx = prompt_hugingface(textin,"Hugging Face","Qwen/Qwen2.5-72B-Instruct","Medium")
+            img=resize(state.image_in)
+            state.image_out = render.generate_images(textin, img)
+            audio_bytes = synthesize_speech("Bạn thấy tôi vẽ "+textin+" có đẹp không")
+            return state, audio_bytes
         else:
+            print("Đang nghĩ...")
+            text_out, state = chat_with_onlinemodel(textin,state)
+            audio_bytes = synthesize_speech(text_out)
+            return state, audio_bytes
     else:
+        return state, synthesize_speech("Tôi nghe không rõ") # Nothing was transcribed: reply with a spoken "could not hear clearly" message
def response_text(state:AppState,textin,image:Image,streng:float,ckpt, prompt,guidance,progress=gr.Progress(track_tqdm=True)):
    """Handle a typed request and build the assistant's spoken reply.

    The text is handled in this order: a request to switch back to voice
    input, a request to toggle between drawing and chat mode, and finally
    either an image render (drawing mode) or a chat completion (chat mode).

    Args:
        state: Session state, mutated in place.
        textin: The text submitted by the user; also sent to the renderer.
        image: The input image handed to the renderer in drawing mode.
        streng: Not used by this function.
        ckpt: Not used by this function.
        prompt: Text quoted back to the user in the drawing confirmation.
        guidance: Not used by this function.
        progress: Gradio progress tracker; not used directly here.

    Returns:
        ``(state, audio)`` where ``audio`` is the result of
        ``synthesize_speech``.
    """
    if state.typing is True:
        still_typing, state = chuyen_trangthai(textin, state)
        if not still_typing:
            return state, synthesize_speech("chuyển sang trạng thái nói")
    if textin == "":
        # Nothing was typed.
        return state, synthesize_speech("Hãy gõ nội dung")
    was_painting = state.painting
    state.painting = text_check(textin, state.painting)
    if was_painting != state.painting:
        mode_name = "vẽ" if state.painting else "nói chuyện"
        return state, synthesize_speech("Đã chuyển sang chế độ " + mode_name)
    if state.painting is not True:
        print("Đang nghĩ...")
        reply, state = chat_with_onlinemodel(textin, state=state)
        return state, synthesize_speech(reply)
    state.conversation.append({"role": "user", "content":"Bạn: " + textin})
    state.image_out = render.generate_images(textin, resize(image))
    return state, synthesize_speech("Bạn thấy tôi vẽ "+prompt+" có đẹp không")
def text_check(textin, painting):
    """Return the drawing-mode flag that should apply after hearing *textin*.

    Args:
        textin: The user's text.
        painting: Whether drawing mode is currently active.

    Returns:
        ``True`` when drawing mode should be active afterwards. Drawing mode
        is left only on the "switch to talk mode" phrase and entered only on
        the "switch to draw mode" phrase.
    """
    if painting:
        return "sang chế độ nói" not in textin
    return "sang chế độ vẽ" in textin
def chuyen_trangthai(textin, state:AppState):
    """Switch between voice input and keyboard input when asked to.

    Args:
        textin: The user's text, searched for the two switching phrases.
        state: Session state; its input-mode flags are updated in place when
            a phrase matches.

    Returns:
        ``(typing, state)`` where ``typing`` is ``True`` for keyboard input
        and ``False`` for voice input. Without a matching phrase the current
        ``state.typing`` is returned and nothing is changed.
    """
    if "muốn nói chuyện" in textin:
        use_keyboard = False
    elif "dùng bàn phím" in textin:
        use_keyboard = True
    else:
        return state.typing, state
    state.started_talking = False
    # Recording runs only in voice mode; keyboard mode marks it as stopped.
    state.recording = not use_keyboard
    state.stopped = use_keyboard
    state.typing = use_keyboard
    return use_keyboard, state
def start_recording_user(state:AppState,progress=gr.Progress(track_tqdm=True)):
    """Mark the session as recording when the user starts the microphone.

    Args:
        state: Session state; its recording flags are reset in place.
        progress: Gradio progress tracker; not used directly here.

    Returns:
        ``(audio_component, state)`` with the audio component set to record.
    """
    microphone = gr.Audio(recording=True)
    state.recording = True
    state.started_talking = False
    # Clearing "stopped" lets recording restart after an earlier stop.
    state.stopped = False
    return microphone, state
def restart_recording(state:AppState):
    """Resume listening after the assistant's reply, unless stopped.

    Args:
        state: Session state; ``started_talking`` and ``recording`` are
            updated in place.

    Returns:
        ``(audio_component, state)`` where the audio component records only
        when ``state.stopped`` is false.
    """
    keep_listening = not state.stopped
    state.started_talking = False
    state.recording = keep_listening
    return gr.Audio(recording=keep_listening), state
def prompt_hugingface(prompt,llm_provider,model,type):
    """Ask the prompt generator to expand *prompt* into an image prompt.

    Args:
        prompt: The user's short description.
        llm_provider: Provider name forwarded to the generator.
        model: Model name forwarded to the generator.
        type: Prompt style, forwarded as ``prompt_type``.

    Returns:
        Whatever ``creator_prompt.generate`` returns.
    """
    generation_options = {
        "input_text": prompt,
        "long_talk": True,
        "compress": True,
        "compression_level": "hard",
        "poster": False,
        "prompt_type": type,
        "custom_base_prompt": "",
        "provider": llm_provider,
        "model": model,
    }
    return creator_prompt.generate(**generation_options)
def resize(img:Image.Image):
    """Return *img* resized so that both sides are multiples of 8.

    Each side is rounded down to the nearest multiple of 8, but never below
    8. Without the lower bound a side shorter than 8 px would round to 0,
    which ``Image.resize`` rejects.

    Args:
        img: The image to resize.

    Returns:
        The resized image returned by ``img.resize``.
    """
    # NOTE(review): presumably the renderer needs sides divisible by 8 - confirm.
    width = max(8, (img.width // 8) * 8)
    height = max(8, (img.height // 8) * 8)
    return img.resize((width, height))
# NOTE(review): neither value is read in the visible part of this file;
# presumably they belong to a local image pipeline - confirm before removing.
loaded: str = ""
steps: int = 50
def update_model_choices(provider):
    """Build the model dropdown that matches the selected provider.

    Args:
        provider: Provider name chosen in the UI.

    Returns:
        A ``gr.Dropdown`` listing that provider's models with the first one
        selected. An unknown provider gives an empty dropdown.
    """
    models_by_provider = {
        "Hugging Face": [
            "Qwen/Qwen2.5-72B-Instruct",
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "mistralai/Mistral-7B-Instruct-v0.3"
        ],
        "SambaNova": [
            "Meta-Llama-3.1-70B-Instruct",
            "Meta-Llama-3.1-405B-Instruct",
            "Meta-Llama-3.1-8B-Instruct"
        ],
    }
    try:
        models = models_by_provider[provider]
    except KeyError:
        models = []
    default_model = models[0] if models else ""
    return gr.Dropdown(choices=models, value=default_model)
# Page title, the usage hint shown under it, and the example phrases.
title = "Chat tiếng việt by tuphamkts"
description = "Muốn vẽ nói: Chuyển sang chế độ vẽ. Muốn chat nói: Chuyển sang chế độ nói. Chế độ gõ: Tôi muốn dùng bàn phím, chế độ nói: Tôi muốn nói chuyện. Ghi chú: Chỉ dừng chương trình khi tôi đang nói (lịch sử chat sẽ bị xóa khi dừng chương trình)."
examples = ["Chuyển sang chế độ vẽ","Chuyển sang chế độ nói"]
with gr.Blocks(title=title) as demo:
    gr.HTML(f"<div style='text-align: center;'><h1>{title}</h1><p>{description}</p></div>")
    with gr.Row():
        with gr.Column():
            # Prompt-generation controls.
            with gr.Column(visible=True) as prompt_visible:
                with gr.Row():
                    llm_provider = gr.Dropdown(choices=["Hugging Face", "SambaNova"], label="Nguồn model", value="Hugging Face")
                    model = gr.Dropdown(label="Chọn Model", choices=["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3.1-70B-Instruct","mistralai/Mixtral-8x7B-Instruct-v0.1","mistralai/Mistral-7B-Instruct-v0.3"], value="Qwen/Qwen2.5-72B-Instruct")
                    prompt_types = ["Long", "Short", "Medium", "OnlyObjects", "NoFigure", "Landscape", "Fantasy"]
                    prompt_type = gr.Dropdown(choices=prompt_types, label="Phong cách", value="Medium", interactive=True)
                # No trailing comma: the name must be bound to the Textbox
                # itself, not to a 1-tuple wrapping it.
                input_prompt = gr.Textbox(label="Nhập nội dung muốn vẽ",value="Một cô gái", type="text")
                generate_prompt = gr.Button("Tạo Prompt", variant="stop")
            # Keyboard input controls.
            with gr.Column(visible=True) as typing_visible:
                input_text = gr.Textbox(label="Nhập nội dung trao đổi", type="text")
                submit = gr.Button("Áp dụng", variant="stop")
            input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
            output_audio = gr.Audio(label="Trợ lý", autoplay=True, sources=None,type="numpy")
            input_image = gr.Image(label="Hình ảnh của bạn", sources=["upload","clipboard","webcam"], type="pil",visible=True)
        # Drawing-mode controls and the rendered image.
        with gr.Column(visible=False) as image_visible:
            ckpt = gr.Dropdown(label='Chọn mô hình',choices=['Chất lượng cao -> Tốc độ chậm', 'Chất lượng vừa -> Tốc độ vừa', 'Chất lượng kém -> Tốc độ nhanh'], value='Chất lượng kém -> Tốc độ nhanh', interactive=True, visible=True)
            output_image = gr.Image(label="Hình ảnh sau xử lý", sources=None, type="pil",visible=True)
            streng = gr.Slider(minimum=0.1, maximum=1, value=.8, step=0.05, label='Strength Lora')
            guidance = gr.Slider(minimum=0.1, maximum=12, value=4, step=0.1, label='Sáng tạo')
        # Chat-mode transcript.
        with gr.Column(visible=True) as chatbot_visible:
            chatbot = gr.Chatbot(label="Nội dung trò chuyện",type="messages")
    state = gr.State(value=AppState(typing=True, painting=True))

    # Voice pipeline: start recording -> stream chunks -> reply on stop.
    startrecord = input_audio.start_recording(
        start_recording_user,
        [state],
        [input_audio, state],
    )
    stream = input_audio.stream(
        process_audio,
        [input_audio, state,input_image,streng,ckpt,guidance],
        [state,input_audio],
        stream_every=1,
        time_limit=30,
    )
    respond = input_audio.stop_recording(
        fn=response_audio,
        inputs=[state],
        outputs=[state, output_audio],
    )
    respond.then(lambda s: s.conversation, [state], [chatbot])
    respond.then(lambda s: s.image_out, [state], [output_image])

    # When the spoken reply finishes, resume listening and show only the
    # panels that match the current input mode and drawing/chat mode.
    restart = output_audio.stop(
        restart_recording,
        [state],
        [input_audio, state],
    )
    restart.then(lambda s: gr.update(visible= not s.typing, recording = not s.typing), [state], [input_audio])
    restart.then(lambda s: gr.update(visible=s.typing), [state], [typing_visible])
    restart.then(lambda s: gr.update(visible=s.painting), [state], [image_visible])
    restart.then(lambda s: gr.update(visible=s.painting and s.typing), [state], [prompt_visible])
    restart.then(lambda s: gr.update(visible= not s.painting), [state], [chatbot_visible])

    # The stop button is enabled only while the session is not stopped.
    cancel = gr.Button("Dừng chương trình", variant="stop", interactive=False)
    stream.then(lambda s: gr.update(interactive= not s.stopped), [state], [cancel])
    cancel.click(
        lambda: (AppState(stopped=True, recording=False, started_talking = False), gr.Audio(recording=False), gr.update(interactive=False)),
        None,[state, input_audio, cancel],
        cancels=[respond, stream, startrecord]  # also cancel the recording and streaming events
    )

    # Keyboard pipeline.
    sub = submit.click(
        fn=response_text,
        inputs=[state, input_text, input_image, streng, ckpt, input_prompt,guidance],
        outputs=[state, output_audio]
    )
    sub.then(lambda s: s.conversation, [state], [chatbot])
    sub.then(lambda s: s.image_out, [state], [output_image])

    # The generated prompt is written into the chat textbox for submission.
    generator = generate_prompt.click(
        fn=prompt_hugingface,
        inputs=[input_prompt,llm_provider,model,prompt_type],
        outputs=[input_text]
    )
    llm_provider.change(
        update_model_choices,
        inputs=[llm_provider],
        outputs=[model]
    )
    gr.Examples(
        examples=examples,
        inputs=input_text,
    )
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

render.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import websocket  # websocket-client
+import uuid
+import json
+import urllib.request
+import urllib.parse
+import random
+from PIL import Image
+import io
+from termcolor import colored
+import base64
+import io
+import os
# Host (and port) of the render server, read from the environment.
server_address = os.getenv("URL_API")
# Workflow template as a JSON string, read from the environment.
json_data = os.getenv("JSON_API")
# Random identifier sent with every request and on the websocket URL.
client_id = str(uuid.uuid4())
def queue_prompt(prompt):
    """Submit a workflow to the server's ``/prompt`` endpoint.

    Args:
        prompt: The workflow to queue; it must be JSON-serialisable.

    Returns:
        The server's JSON reply, decoded.
    """
    payload = {"prompt": prompt, "client_id": client_id}
    body = json.dumps(payload, indent=4).encode('utf-8')
    req = urllib.request.Request(f"http://{server_address}/prompt", data=body)
    # Close the HTTP response deterministically instead of leaving it to
    # garbage collection.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())
def get_image(filename, subfolder, folder_type):
    """Download one file from the server's ``/view`` endpoint.

    Args:
        filename: Sent as the ``filename`` query parameter.
        subfolder: Sent as the ``subfolder`` query parameter.
        folder_type: Sent as the ``type`` query parameter.

    Returns:
        The raw response body as ``bytes``.
    """
    query = urllib.parse.urlencode(
        {"filename": filename, "subfolder": subfolder, "type": folder_type}
    )
    url = f"http://{server_address}/view?{query}"
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return payload
def get_history(prompt_id):
    """Fetch the history entry for a prompt from ``/history/<prompt_id>``.

    Args:
        prompt_id: Identifier of the queued prompt.

    Returns:
        The server's JSON reply, decoded.
    """
    print(colored(f"Fetching history for prompt ID: {prompt_id}.", "cyan"))
    url = f"http://{server_address}/history/{prompt_id}"
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return json.loads(raw)
def get_images(ws, prompt):
    """Queue a workflow, wait for it to finish, and download its images.

    Progress is printed in steps of at least 10% per node while websocket
    messages are read. Once the server reports completion, the prompt's
    history is fetched and every image it lists is downloaded.

    Args:
        ws: A connected websocket on which the server sends status messages.
        prompt: The workflow to queue.

    Returns:
        A dict mapping each output node id to a list of image ``bytes``.
    """
    prompt_id = queue_prompt(prompt)['prompt_id']
    last_reported_percentage = 0
    progress_node = None
    while True:
        out = ws.recv()
        if not isinstance(out, str):
            continue  # Previews are binary data
        message = json.loads(out)
        if message['type'] == 'progress':
            data = message['data']
            node = data.get('node')
            if node != progress_node:
                # Restart the threshold for each node; otherwise nothing is
                # reported once the first node has reached 100%.
                progress_node = node
                last_reported_percentage = 0
            max_progress = data['max']
            # Guard the division so a zero maximum cannot abort the render.
            percentage = int((data['value'] / max_progress) * 100) if max_progress else 0
            if percentage >= last_reported_percentage + 10:
                print(colored(f"Progress: {percentage}% in node {node}", "yellow"))
                last_reported_percentage = percentage
        elif message['type'] == 'executing':
            data = message['data']
            if data['node'] is None and data['prompt_id'] == prompt_id:
                print(colored("Execution complete.", "green"))
                break  # Execution is done
    outputs = get_history(prompt_id)[prompt_id]['outputs']
    output_images = {}
    # One pass over the output nodes: each image is downloaded exactly once.
    for node_id, node_output in outputs.items():
        if 'images' not in node_output:
            continue
        images_output = []
        for image in node_output['images']:
            print(colored(f"Downloading image: {image['filename']} from the server.", "yellow"))
            images_output.append(get_image(image['filename'], image['subfolder'], image['type']))
        output_images[node_id] = images_output
    return output_images
def pil_to_base64(image):
    """Encode an image as a base64 PNG data URI.

    Args:
        image: An object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return "data:image/png;base64," + encoded
def generate_images(positive_prompt, image):
    """Render an image on the remote server from a prompt and an input image.

    The workflow template in ``json_data`` is filled in with the prompt
    (node ``"49"``), the input image as a base64 data URI (node ``"90"``)
    and a random noise seed (node ``"47"``), then queued over a websocket
    session.

    Args:
        positive_prompt: Text written into the workflow's prompt node.
        image: The input PIL image.

    Returns:
        The last image the server produced, opened as a PIL image. When the
        server produced none, the input ``image`` is returned unchanged.
    """
    # Prepare the workflow first so a bad template fails before a socket
    # is opened.
    data = json.loads(json_data)
    data["49"]["inputs"]["text"] = positive_prompt
    data["90"]["inputs"]["images"]["base64"] = [pil_to_base64(image)]
    data["47"]["inputs"]["noise_seed"] = random.randint(1, 1000000000)
    ws = websocket.WebSocket()
    ws.connect(f"ws://{server_address}/ws?clientId={client_id}")
    try:
        images = get_images(ws, data)
    finally:
        # Always release the connection, even when the render fails.
        ws.close()
    rendered = [blob for node_images in images.values() for blob in node_images]
    if not rendered:
        # Explicit fallback, matching the earlier behaviour.
        return image
    return Image.open(io.BytesIO(rendered[-1]))