Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -48,7 +48,7 @@ from utils.lua_converter import LuaConverter
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import torch
-from
+from lua2lrt import lua_to_lrtemplate
 from huggingface_hub import snapshot_download
 import spaces

@@ -670,8 +670,6 @@ def get_llm_response_with_custom_prompt_stream(image_path, user_prompt, max_new_
     except Exception as e:
         yield f"❌ Error during inference: {str(e)}"

-def process_upload(file):
-    return file

 def compact_text(text):
     """
@@ -697,10 +695,20 @@ def get_box_coordinates(annotated_image_dict, prompt_original):
     and format the bounding box coordinates.
     """
     global local_dict
-    if annotated_image_dict and annotated_image_dict["boxes"]:
+    if annotated_image_dict and annotated_image_dict.get("boxes") and len(annotated_image_dict["boxes"]) > 0:
         # Get the last drawn box
         input_image = annotated_image_dict["image"]
-
+
+        # Handle both PIL Image and file path cases
+        if isinstance(input_image, str):
+            # If it's a file path
+            pil_image = Image.open(input_image)
+            image_key = input_image
+        else:
+            # If it's a PIL Image object
+            pil_image = input_image
+            image_key = str(input_image)  # Use string representation as key
+
         last_box = annotated_image_dict["boxes"][-1]
         width, height = pil_image.width, pil_image.height

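Note: the value dict this handler receives from the annotator component has the shape the hunk above reads, an "image" entry (either a file path string or a PIL image) plus a "boxes" list in pixel coordinates. A minimal illustrative sketch with made-up values, not part of the commit:

from PIL import Image

# Illustrative only: the shape of annotated_image_dict as get_box_coordinates reads it.
annotated_image_dict = {
    "image": Image.new("RGB", (640, 480)),   # may also be a path string such as "photo.png"
    "boxes": [
        {"xmin": 64, "ymin": 48, "xmax": 320, "ymax": 240, "label": "region"},
    ],
}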
@@ -708,13 +716,46 @@ def get_box_coordinates(annotated_image_dict, prompt_original):
         ymin = last_box["ymin"] / height
         xmax = last_box["xmax"] / width
         ymax = last_box["ymax"] / height
-
-        local_dict[
+
+        local_dict[image_key] = [xmin, ymin, xmax, ymax]
         # Format the coordinates into a string

-        return str([xmin, ymin, xmax, ymax]), "
+        return str([xmin, ymin, xmax, ymax]), f"In the region <box>{str([xmin, ymin, xmax, ymax])}</box>, {prompt_original}"
     return "No box drawn", prompt_original

+def get_box_coordinates_simple(annotated_image_dict):
+    """
+    Simplified version that matches test1.py pattern - only returns coordinates
+    """
+    global local_dict
+    if annotated_image_dict and annotated_image_dict.get("boxes") and len(annotated_image_dict["boxes"]) > 0:
+        # Get the last drawn box
+        input_image = annotated_image_dict["image"]
+
+        # Handle both PIL Image and file path cases
+        if isinstance(input_image, str):
+            # If it's a file path
+            pil_image = Image.open(input_image)
+            image_key = input_image
+        else:
+            # If it's a PIL Image object
+            pil_image = input_image
+            image_key = str(input_image)  # Use string representation as key
+
+        last_box = annotated_image_dict["boxes"][-1]
+        width, height = pil_image.width, pil_image.height
+
+        xmin = last_box["xmin"] / width
+        ymin = last_box["ymin"] / height
+        xmax = last_box["xmax"] / width
+        ymax = last_box["ymax"] / height
+
+        local_dict[image_key] = [xmin, ymin, xmax, ymax]
+        # Format the coordinates into a string
+
+        return str([xmin, ymin, xmax, ymax])
+    return "No bounding box drawn yet."
+
 @spaces.GPU
 def process_analysis_pipeline_stream(image_dict, user_prompt, max_new_tokens, top_k, top_p, temperature):
     """
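Note: a stand-alone sketch of the normalization both helpers perform (pixel box divided by image width and height) and the cache entry they leave in local_dict; the helper name and values are illustrative, not part of the commit:

from PIL import Image

local_dict = {}  # stand-in for the module-level cache in app.py

def normalize_last_box(value):
    # Same arithmetic as get_box_coordinates_simple: divide pixel coordinates by image size.
    pil_image = value["image"]
    box = value["boxes"][-1]
    coords = [box["xmin"] / pil_image.width, box["ymin"] / pil_image.height,
              box["xmax"] / pil_image.width, box["ymax"] / pil_image.height]
    local_dict[str(pil_image)] = coords
    return coords

value = {"image": Image.new("RGB", (640, 480)),
         "boxes": [{"xmin": 64, "ymin": 48, "xmax": 320, "ymax": 240}]}
print(normalize_last_box(value))  # [0.1, 0.1, 0.5, 0.5]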
@@ -731,16 +772,29 @@ def process_analysis_pipeline_stream(image_dict, user_prompt, max_new_tokens, to
     Yields:
         list: Updated chat_history for Gradio UI updates (messages format)
     """
-    if image_dict is None:
+    if image_dict is None or image_dict.get('image') is None:
         yield [
             {"role": "user", "content": "Please upload an image first! 📸"},
             {"role": "assistant", "content": "I need an image to analyze before I can provide editing recommendations."}
-        ]
+        ], None
         return
+
+    # Extract image from the image_dict
     image = image_dict['image']
+
+    # Handle the case where image is a PIL Image object - need to save it temporarily
+    if not isinstance(image, str):
+        import tempfile
+        import os
+        # Save PIL image to temporary file
+        temp_dir = tempfile.gettempdir()
+        temp_path = os.path.join(temp_dir, f"temp_image_{hash(str(image))}.png")
+        image.save(temp_path)
+        image = temp_path
+
     if not user_prompt.strip():
         user_prompt = default_user_prompt
-    elif len(local_dict) > 0 and local_dict[image][0] != local_dict[image][2]:
+    elif len(local_dict) > 0 and image in local_dict and local_dict[image][0] != local_dict[image][2]:
         user_prompt = user_prompt.replace('<box></box>', f'<box>{str(local_dict[image])}</box>')


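Note: a small sketch of the <box></box> substitution guarded by the new `image in local_dict` check; the path and cached coordinates below are hypothetical:

# Hypothetical cache entry and prompt; mirrors the replace() call in the hunk above.
local_dict = {"/tmp/temp_image_123.png": [0.1, 0.1, 0.5, 0.5]}
image = "/tmp/temp_image_123.png"
user_prompt = "Brighten the area in <box></box> and keep the rest untouched."

if len(local_dict) > 0 and image in local_dict and local_dict[image][0] != local_dict[image][2]:
    user_prompt = user_prompt.replace('<box></box>', f'<box>{str(local_dict[image])}</box>')

print(user_prompt)
# Brighten the area in <box>[0.1, 0.1, 0.5, 0.5]</box> and keep the rest untouched.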
@@ -1121,11 +1175,8 @@ def create_interface():
         # Input image upload component
         input_image = image_annotator(
             label="📸 Upload Your Image & Draw Bounding Box",
-
-
-            single_box=True,
-            show_label=True,
-            height=400
+            label_list=["region"],  # add a label list
+            use_default_label=True  # automatically use the first label as the default
         )

         # Prompt input
@@ -1280,17 +1331,12 @@ def create_interface():
             outputs=user_prompt
         )

-        # Event binding
+        # Event binding - simplified to match test1.py working pattern

         input_image.change(
-            fn=
-            inputs=
-            outputs=
-        )
-        input_image.upload(
-            fn=process_upload,
-            inputs=[input_image],
-            outputs=[input_image]
+            fn=get_box_coordinates_simple,
+            inputs=input_image,
+            outputs=coordinates_output
         )
         # Main processing button - streaming output, pass all parameters
         process_btn.click(
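Note: a minimal stand-alone Blocks demo of the annotator-to-textbox wiring that input_image.change() now uses; the component parameters come from the hunks above, the handler is simplified, and the demo is illustrative rather than part of app.py:

import gradio as gr
from gradio_image_annotation import image_annotator

def read_last_box(value):
    # value is the annotator dict: {"image": ..., "boxes": [...]}
    if value and value.get("boxes"):
        box = value["boxes"][-1]
        return str([box["xmin"], box["ymin"], box["xmax"], box["ymax"]])
    return "No bounding box drawn yet."

with gr.Blocks() as demo:
    annotator = image_annotator(
        label="📸 Upload Your Image & Draw Bounding Box",
        label_list=["region"],
        use_default_label=True,
    )
    coordinates_output = gr.Textbox(label="Box coordinates")
    annotator.change(fn=read_last_box, inputs=annotator, outputs=coordinates_output)

if __name__ == "__main__":
    demo.launch()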