Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -5,6 +5,7 @@ import numpy as np | |
| 5 | 
             
            from groq import Groq
         | 
| 6 | 
             
            import spaces
         | 
| 7 | 
             
            from transformers import AutoModel, AutoTokenizer
         | 
|  | |
| 8 | 
             
            from parler_tts import ParlerTTSForConditionalGeneration
         | 
| 9 | 
             
            import soundfile as sf
         | 
| 10 | 
             
            from llama_index.core.agent import ReActAgent
         | 
| @@ -15,20 +16,19 @@ from tavily import TavilyClient | |
| 15 | 
             
            import requests
         | 
| 16 | 
             
            from huggingface_hub import hf_hub_download
         | 
| 17 | 
             
            from safetensors.torch import load_file
         | 
| 18 | 
            -
            from diffusers import StableDiffusion3Pipeline
         | 
| 19 |  | 
| 20 | 
             
            # Initialize models and clients
         | 
| 21 | 
             
            MODEL = 'llama3-groq-70b-8192-tool-use-preview'
         | 
| 22 | 
             
            client = Groq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
         | 
| 23 |  | 
| 24 | 
             
            vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
         | 
| 25 | 
            -
             | 
| 26 | 
             
            tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
         | 
| 27 |  | 
| 28 | 
             
            tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
         | 
| 29 | 
             
            tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
         | 
| 30 |  | 
| 31 | 
            -
            # Updated Image  | 
| 32 | 
             
            pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
         | 
| 33 | 
             
            pipe = pipe.to("cuda")
         | 
| 34 |  | 
| @@ -75,7 +75,7 @@ def image_generation(query): | |
| 75 | 
             
                image = pipe(
         | 
| 76 | 
             
                    query,
         | 
| 77 | 
             
                    negative_prompt="",
         | 
| 78 | 
            -
                    num_inference_steps= | 
| 79 | 
             
                    guidance_scale=7.0,
         | 
| 80 | 
             
                ).images[0]
         | 
| 81 | 
             
                image.save("output.jpg")
         | 
| @@ -111,12 +111,11 @@ def handle_input(user_prompt, image=None, audio=None, websearch=False): | |
| 111 | 
             
                    messages = [{"role": "user", "content": [image, user_prompt]}]
         | 
| 112 | 
             
                    response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
         | 
| 113 | 
             
                else:
         | 
| 114 | 
            -
                    # Modify this part to check if a tool is required or if a direct answer suffices
         | 
| 115 | 
             
                    response = agent.chat(user_prompt)
         | 
| 116 |  | 
| 117 | 
             
                # Extract the content from AgentChatResponse to return as a string
         | 
| 118 | 
             
                if isinstance(response, AgentChatResponse):
         | 
| 119 | 
            -
                    response = response. | 
| 120 |  | 
| 121 | 
             
                return response
         | 
| 122 |  | 
| @@ -189,4 +188,4 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websea | |
| 189 |  | 
| 190 | 
             
            # Launch the UI
         | 
| 191 | 
             
            demo = create_ui()
         | 
| 192 | 
            -
            demo.launch()
         | 
|  | |
| 5 | 
             
            from groq import Groq
         | 
| 6 | 
             
            import spaces
         | 
| 7 | 
             
            from transformers import AutoModel, AutoTokenizer
         | 
| 8 | 
            +
            from diffusers import StableDiffusion3Pipeline
         | 
| 9 | 
             
            from parler_tts import ParlerTTSForConditionalGeneration
         | 
| 10 | 
             
            import soundfile as sf
         | 
| 11 | 
             
            from llama_index.core.agent import ReActAgent
         | 
|  | |
| 16 | 
             
            import requests
         | 
| 17 | 
             
            from huggingface_hub import hf_hub_download
         | 
| 18 | 
             
            from safetensors.torch import load_file
         | 
|  | |
| 19 |  | 
| 20 | 
             
            # Initialize models and clients
         | 
| 21 | 
             
            MODEL = 'llama3-groq-70b-8192-tool-use-preview'
         | 
| 22 | 
             
            client = Groq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
         | 
| 23 |  | 
| 24 | 
             
            vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
         | 
| 25 | 
            +
                                                  device_map="auto", torch_dtype=torch.bfloat16)
         | 
| 26 | 
             
            tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
         | 
| 27 |  | 
| 28 | 
             
            tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
         | 
| 29 | 
             
            tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
         | 
| 30 |  | 
| 31 | 
            +
            # Updated Image generation model
         | 
| 32 | 
             
            pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
         | 
| 33 | 
             
            pipe = pipe.to("cuda")
         | 
| 34 |  | 
|  | |
| 75 | 
             
                image = pipe(
         | 
| 76 | 
             
                    query,
         | 
| 77 | 
             
                    negative_prompt="",
         | 
| 78 | 
            +
                    num_inference_steps=15,
         | 
| 79 | 
             
                    guidance_scale=7.0,
         | 
| 80 | 
             
                ).images[0]
         | 
| 81 | 
             
                image.save("output.jpg")
         | 
|  | |
| 111 | 
             
                    messages = [{"role": "user", "content": [image, user_prompt]}]
         | 
| 112 | 
             
                    response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
         | 
| 113 | 
             
                else:
         | 
|  | |
| 114 | 
             
                    response = agent.chat(user_prompt)
         | 
| 115 |  | 
| 116 | 
             
                # Extract the content from AgentChatResponse to return as a string
         | 
| 117 | 
             
                if isinstance(response, AgentChatResponse):
         | 
| 118 | 
            +
                    response = response.final_response # Use 'final_response' to access the text response
         | 
| 119 |  | 
| 120 | 
             
                return response
         | 
| 121 |  | 
|  | |
| 188 |  | 
| 189 | 
             
            # Launch the UI
         | 
| 190 | 
             
            demo = create_ui()
         | 
| 191 | 
            +
            demo.launch()
         | 
