Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,8 +14,8 @@ MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"
|
|
| 14 |
MODEL_ENHANCED_NAME = "Latex2Layout-Enhanced"
|
| 15 |
MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-RL"
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
MODEL_MIXING_NAME = "Mixing (
|
| 19 |
MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME, MODEL_MIXING_NAME]
|
| 20 |
|
| 21 |
|
|
@@ -46,8 +46,6 @@ DEFAULT_PROMPT = (
|
|
| 46 |
# --- 2. Load Models and Processor ---
|
| 47 |
print("Loading models, this will take some time and VRAM...")
|
| 48 |
try:
|
| 49 |
-
# WARNING: Loading two 3B models without quantization requires a large amount of VRAM (>12 GB).
|
| 50 |
-
# This may fail on hardware with insufficient memory.
|
| 51 |
print(f"Loading {MODEL_BASE_NAME}...")
|
| 52 |
model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 53 |
MODEL_BASE_ID,
|
|
@@ -62,42 +60,84 @@ try:
|
|
| 62 |
device_map="auto"
|
| 63 |
)
|
| 64 |
|
| 65 |
-
# Processor is the same for both models
|
| 66 |
processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
|
| 67 |
print("All models loaded successfully!")
|
| 68 |
except Exception as e:
|
| 69 |
print(f"Error loading models: {e}")
|
| 70 |
exit()
|
| 71 |
|
| 72 |
-
# ---
|
| 73 |
def calculate_iou(boxA, boxB):
|
| 74 |
"""Calculate Intersection over Union (IoU) of two bounding boxes."""
|
| 75 |
-
# Determine the coordinates of the intersection rectangle
|
| 76 |
xA = max(boxA[0], boxB[0])
|
| 77 |
yA = max(boxA[1], boxB[1])
|
| 78 |
xB = min(boxA[2], boxB[2])
|
| 79 |
yB = min(boxA[3], boxB[3])
|
| 80 |
|
| 81 |
-
# Compute the area of intersection
|
| 82 |
interArea = max(0, xB - xA) * max(0, yB - yA)
|
| 83 |
-
|
| 84 |
-
# Compute the area of both bounding boxes
|
| 85 |
boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
|
| 86 |
boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
|
| 87 |
-
|
| 88 |
-
# Compute the area of union
|
| 89 |
unionArea = float(boxAArea + boxBArea - interArea)
|
| 90 |
-
|
| 91 |
-
# Return the IoU
|
| 92 |
return interArea / unionArea if unionArea > 0 else 0
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
"""
|
| 98 |
-
|
| 99 |
-
|
| 100 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
if input_image is None:
|
| 102 |
return None, "Please upload an image first."
|
| 103 |
|
|
@@ -105,7 +145,6 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
|
|
| 105 |
image_resized = input_image.resize(TARGET_SIZE)
|
| 106 |
image_rgba = image_resized.convert("RGBA")
|
| 107 |
|
| 108 |
-
# --- Nested helper function to run inference on a given model ---
|
| 109 |
def run_inference(model_to_run, model_name_desc):
|
| 110 |
progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
|
| 111 |
messages = [{"role": "user", "content": [{"type": "image", "image": image_rgba}, {"type": "text", "text": prompt}]}]
|
|
@@ -123,20 +162,17 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
|
|
| 123 |
json_str = json_match.group(1).strip() if json_match else raw_text.strip()
|
| 124 |
return json.loads(json_str), raw_text
|
| 125 |
except (json.JSONDecodeError, AttributeError):
|
| 126 |
-
return None, raw_text
|
| 127 |
|
| 128 |
-
# --- Main logic: single model or mixing ---
|
| 129 |
if selected_model_name == MODEL_MIXING_NAME:
|
| 130 |
base_results, raw_text_base = run_inference(model_base, "Base Model")
|
| 131 |
enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
|
| 132 |
-
|
| 133 |
output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"
|
| 134 |
|
| 135 |
if base_results is None or enhanced_results is None:
|
| 136 |
return image_rgba.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"
|
| 137 |
|
| 138 |
-
|
| 139 |
-
progress(0.8, desc="Merging results from both models...")
|
| 140 |
merged_results = list(base_results)
|
| 141 |
base_bboxes = [item['bbox_2d'] for item in base_results if 'bbox_2d' in item]
|
| 142 |
|
|
@@ -154,14 +190,18 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
|
|
| 154 |
|
| 155 |
results = merged_results
|
| 156 |
else:
|
| 157 |
-
# Run a single model
|
| 158 |
model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
|
| 159 |
results, output_text = run_inference(model, selected_model_name)
|
| 160 |
if results is None:
|
| 161 |
return image_rgba.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
# --- Visualization ---
|
| 164 |
-
progress(0.9, desc="
|
| 165 |
overlay = Image.new('RGBA', image_rgba.size, (255, 255, 255, 0))
|
| 166 |
draw = ImageDraw.Draw(overlay)
|
| 167 |
|
|
@@ -219,7 +259,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
|
|
| 219 |
choices=MODEL_CHOICES,
|
| 220 |
value=MODEL_BASE_NAME,
|
| 221 |
label="Select Model",
|
| 222 |
-
info="Choose which model to use for inference.
|
| 223 |
)
|
| 224 |
prompt_textbox = gr.Textbox(
|
| 225 |
label="Prompt",
|
|
|
|
| 14 |
MODEL_ENHANCED_NAME = "Latex2Layout-Enhanced"
|
| 15 |
MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-RL"
|
| 16 |
|
| 17 |
+
# Add a name for the Mixing mode
|
| 18 |
+
MODEL_MIXING_NAME = "Mixing Beta Version(Powerful Mode)"
|
| 19 |
MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME, MODEL_MIXING_NAME]
|
| 20 |
|
| 21 |
|
|
|
|
| 46 |
# --- 2. Load Models and Processor ---
|
| 47 |
print("Loading models, this will take some time and VRAM...")
|
| 48 |
try:
|
|
|
|
|
|
|
| 49 |
print(f"Loading {MODEL_BASE_NAME}...")
|
| 50 |
model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 51 |
MODEL_BASE_ID,
|
|
|
|
| 60 |
device_map="auto"
|
| 61 |
)
|
| 62 |
|
|
|
|
| 63 |
processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
|
| 64 |
print("All models loaded successfully!")
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error loading models: {e}")
|
| 67 |
exit()
|
| 68 |
|
| 69 |
+
# --- Helper functions for geometric calculations ---
|
| 70 |
def calculate_iou(boxA, boxB):
    """Calculate Intersection over Union (IoU) of two bounding boxes.

    Boxes are (x1, y1, x2, y2) corner coordinates. Returns 0 when the
    union area is empty (e.g. both boxes are degenerate).
    """
    # Corners of the intersection rectangle.
    ix1, iy1 = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    ix2, iy2 = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    # Clamp to zero so disjoint boxes contribute no overlap.
    overlap = max(0, ix2 - ix1) * max(0, iy2 - iy1)

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = float(area_a + area_b - overlap)

    if union > 0:
        return overlap / union
    return 0
|
| 82 |
|
| 83 |
+
def calculate_intersection_area(boxA, boxB):
    """Calculate the absolute intersection area of two bounding boxes.

    Boxes are (x1, y1, x2, y2); returns 0 for disjoint boxes.
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    return max(0, xB - xA) * max(0, yB - yA)


# --- Post-processing: remove nested elements of the same type ---
def remove_nested_elements(results):
    """Remove smaller elements heavily nested inside larger same-label elements.

    An element is considered nested when more than 80% of its area lies
    inside another element carrying the same label. Items without a usable
    "bbox_2d", or with zero/negative area, are left untouched.

    Args:
        results: list of dicts with optional "label" and "bbox_2d" keys,
            where bbox_2d is [x1, y1, x2, y2].

    Returns:
        A new list containing only the elements not marked for removal.
    """
    indices_to_remove = set()
    # Visit each unordered pair exactly once. The previous version iterated
    # over ordered pairs, processing every pair twice; for two same-label
    # boxes of EQUAL area with >80% mutual overlap (e.g. exact duplicates)
    # that removed BOTH boxes, so duplicates vanished entirely instead of
    # being deduplicated down to one.
    for i in range(len(results)):
        for j in range(i + 1, len(results)):
            item_i = results[i]
            item_j = results[j]

            # Rule only applies to elements with the same label.
            if item_i.get("label") != item_j.get("label"):
                continue

            bbox_i = item_i.get("bbox_2d")
            bbox_j = item_j.get("bbox_2d")
            if not bbox_i or not bbox_j:
                continue

            area_i = (bbox_i[2] - bbox_i[0]) * (bbox_i[3] - bbox_i[1])
            area_j = (bbox_j[2] - bbox_j[0]) * (bbox_j[3] - bbox_j[1])
            # <= 0 (not == 0) also skips malformed boxes with inverted
            # coordinates, whose negative area would flip the ratio test.
            if area_i <= 0 or area_j <= 0:
                continue

            # The smaller box is the removal candidate; on an area tie keep
            # the earlier element so duplicates dedupe to one survivor.
            if area_i < area_j:
                smaller_box, larger_box, smaller_area, smaller_idx = bbox_i, bbox_j, area_i, i
            else:
                smaller_box, larger_box, smaller_area, smaller_idx = bbox_j, bbox_i, area_j, j

            intersection = calculate_intersection_area(smaller_box, larger_box)

            # If the smaller box is >80% contained in the larger one, drop it.
            if (intersection / smaller_area) > 0.8:
                indices_to_remove.add(smaller_idx)

    # New list with only the elements that were not marked for removal.
    return [item for idx, item in enumerate(results) if idx not in indices_to_remove]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# --- 3. Core Inference and Visualization Function ---
|
| 139 |
+
@GPU
|
| 140 |
+
def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, progress=gr.Progress(track_tqdm=True)):
|
| 141 |
if input_image is None:
|
| 142 |
return None, "Please upload an image first."
|
| 143 |
|
|
|
|
| 145 |
image_resized = input_image.resize(TARGET_SIZE)
|
| 146 |
image_rgba = image_resized.convert("RGBA")
|
| 147 |
|
|
|
|
| 148 |
def run_inference(model_to_run, model_name_desc):
|
| 149 |
progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
|
| 150 |
messages = [{"role": "user", "content": [{"type": "image", "image": image_rgba}, {"type": "text", "text": prompt}]}]
|
|
|
|
| 162 |
json_str = json_match.group(1).strip() if json_match else raw_text.strip()
|
| 163 |
return json.loads(json_str), raw_text
|
| 164 |
except (json.JSONDecodeError, AttributeError):
|
| 165 |
+
return None, raw_text
|
| 166 |
|
|
|
|
| 167 |
if selected_model_name == MODEL_MIXING_NAME:
|
| 168 |
base_results, raw_text_base = run_inference(model_base, "Base Model")
|
| 169 |
enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
|
|
|
|
| 170 |
output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"
|
| 171 |
|
| 172 |
if base_results is None or enhanced_results is None:
|
| 173 |
return image_rgba.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"
|
| 174 |
|
| 175 |
+
progress(0.8, desc="Merging results based on IoU...")
|
|
|
|
| 176 |
merged_results = list(base_results)
|
| 177 |
base_bboxes = [item['bbox_2d'] for item in base_results if 'bbox_2d' in item]
|
| 178 |
|
|
|
|
| 190 |
|
| 191 |
results = merged_results
|
| 192 |
else:
|
|
|
|
| 193 |
model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
|
| 194 |
results, output_text = run_inference(model, selected_model_name)
|
| 195 |
if results is None:
|
| 196 |
return image_rgba.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
|
| 197 |
|
| 198 |
+
# --- NEW: Apply the final post-processing step to remove nested elements ---
|
| 199 |
+
progress(0.85, desc="Cleaning up nested elements...")
|
| 200 |
+
results = remove_nested_elements(results)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
# --- Visualization ---
|
| 204 |
+
progress(0.9, desc="Visualizing final results...")
|
| 205 |
overlay = Image.new('RGBA', image_rgba.size, (255, 255, 255, 0))
|
| 206 |
draw = ImageDraw.Draw(overlay)
|
| 207 |
|
|
|
|
| 259 |
choices=MODEL_CHOICES,
|
| 260 |
value=MODEL_BASE_NAME,
|
| 261 |
label="Select Model",
|
| 262 |
+
info="Choose which model to use for inference. "
|
| 263 |
)
|
| 264 |
prompt_textbox = gr.Textbox(
|
| 265 |
label="Prompt",
|