ruslanmv committed on
Commit
91c2468
1 Parent(s): d6028f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -6
app.py CHANGED
@@ -53,11 +53,7 @@ def predict(image, text):
53
 
54
  # Decode the output to return the final response
55
  response = processor.decode(outputs[0], skip_special_tokens=True)
56
-
57
- # Format the conversation for a better appearance without repetition
58
- formatted_response = f"User: {text}\n\nAssistant: {response}"
59
-
60
- return formatted_response
61
 
62
  # Define the Gradio interface
63
  interface = gr.Interface(
@@ -66,7 +62,7 @@ interface = gr.Interface(
66
  gr.Image(type="pil", label="Image Input"), # Image input with label
67
  gr.Textbox(label="Text Input") # Textbox input with label
68
  ],
69
- outputs=gr.Textbox(label="Conversation"), # Output with a more descriptive label
70
  title="Llama 3.2 11B Vision Instruct Demo", # Title of the interface
71
  description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.", # Short description
72
  theme="compact" # Using a compact theme for a cleaner look
 
53
 
54
  # Decode the output to return the final response
55
  response = processor.decode(outputs[0], skip_special_tokens=True)
56
+ return response
 
 
 
 
57
 
58
  # Define the Gradio interface
59
  interface = gr.Interface(
 
62
  gr.Image(type="pil", label="Image Input"), # Image input with label
63
  gr.Textbox(label="Text Input") # Textbox input with label
64
  ],
65
+ outputs=gr.Textbox(label="Generated Response"), # Output with a more descriptive label
66
  title="Llama 3.2 11B Vision Instruct Demo", # Title of the interface
67
  description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.", # Short description
68
  theme="compact" # Using a compact theme for a cleaner look