ChaseHan committed
Commit de9cba4 · verified · 1 Parent(s): 56e25bb

Update app.py
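This commit replaces the enhanced model's load-and-merge LoRA pipeline (ZelongWang base model + GRPO adapter applied via PEFT at startup) with a single pre-merged checkpoint, ChaseHan/Latex2Layout-RL, drops the greedy/temperature/top-p sampling controls in favor of always-greedy decoding, and tidies the Gradio UI (Base model as the default selection, expanded Advanced Settings, revised welcome text).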

Files changed (1): app.py (+63, −89)
app.py CHANGED
@@ -5,30 +5,28 @@ from PIL import Image, ImageDraw, ImageFont
 import json
 import re
 from spaces import GPU
-from peft import PeftModel
 
 # --- 1. Configurations and Constants ---
 # Define user-facing names and Hugging Face IDs for the models
 MODEL_BASE_NAME = "Latex2Layout-Base"
 MODEL_BASE_ID = "ChaseHan/Latex2Layout-2000-sync"
 
-MODEL_ENHANCED_NAME = "Qwen2.5-VL + GRPO LoRA (Merged)"
-MODEL_ENHANCED_BASE_ID = "ZelongWang/Qwen2.5-VL-3B-Instruct-DocOD-2"
-MODEL_ENHANCED_LORA_ID = "ZelongWang/Qwen2.5-VL-3B-GRPO-lora-pdf-v3"
-LORA_CHECKPOINT_FOLDER = "checkpoint-525"  # Subfolder containing the adapter
+MODEL_ENHANCED_NAME = "Latex2Layout-Enhanced"
+MODEL_ENHANCED_ID = "ChaseHan/Latex2Layout-RL"
 
 # --- NEW: Add a name for the Mixing mode ---
 MODEL_MIXING_NAME = "Mixing (Base + Enhanced)"
-
 MODEL_CHOICES = [MODEL_BASE_NAME, MODEL_ENHANCED_NAME, MODEL_MIXING_NAME]
 
+
 # Target image size for model input
 TARGET_SIZE = (924, 1204)
 
 # Visualization Style Constants
 OUTLINE_WIDTH = 3
+# Color mapping for different layout regions (RGBA for transparency)
 LABEL_COLORS = {
-    "title": (255, 82, 82, 90),     # Red
+    "title": (255, 82, 82, 90),     # Red
     "abstract": (46, 204, 113, 90), # Green
     "heading": (52, 152, 219, 90),  # Blue
     "footnote": (241, 196, 15, 90), # Yellow
@@ -48,46 +46,30 @@ DEFAULT_PROMPT = (
 # --- 2. Load Models and Processor ---
 print("Loading models, this will take some time and VRAM...")
 try:
-    # Load the original base model
-    print(f"Loading base model: {MODEL_BASE_NAME}...")
+    # WARNING: Loading two 3B models without quantization requires a large amount of VRAM (>12 GB).
+    # This may fail on hardware with insufficient memory.
+    print(f"Loading {MODEL_BASE_NAME}...")
     model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_BASE_ID,
         torch_dtype=torch.float16,
         device_map="auto"
     )
 
-    # Load and merge the new enhanced model directly from the Hub
-    print(f"Loading enhanced model base: {MODEL_ENHANCED_BASE_ID}...")
-    # Step 1: Load the new base model
+    print(f"Loading {MODEL_ENHANCED_NAME}...")
     model_enhanced = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ENHANCED_BASE_ID,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-    )
-
-    print(f"Loading LoRA adapter online from: {MODEL_ENHANCED_LORA_ID}...")
-    # Step 2: Load Peft adapter directly from the Hub, specifying the subfolder
-    model_enhanced = PeftModel.from_pretrained(
-        model_enhanced,
-        MODEL_ENHANCED_LORA_ID,
-        subfolder=LORA_CHECKPOINT_FOLDER,
+        MODEL_ENHANCED_ID,
+        torch_dtype=torch.float16,
         device_map="auto"
     )
-
-    # Step 3: Merge the adapter weights and unload the PeftModel
-    print("Merging LoRA adapter...")
-    model_enhanced = model_enhanced.merge_and_unload()
-    print(f"Successfully loaded and merged model: {MODEL_ENHANCED_NAME}")
 
-    # Load processor
+    # Processor is the same for both models
     processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
-    print("All models and processor loaded successfully!")
+    print("All models loaded successfully!")
 except Exception as e:
     print(f"Error loading models: {e}")
     exit()
 
-# --- 3. Core Inference, Merging, and Visualization ---
-
+# --- NEW: Helper function to calculate Intersection over Union ---
 def calculate_iou(boxA, boxB):
     """Calculate Intersection over Union (IoU) of two bounding boxes."""
     # Determine the coordinates of the intersection rectangle
109
  # Return the IoU
110
  return interArea / unionArea if unionArea > 0 else 0
111
 
 
112
  @GPU
113
- def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, use_greedy: bool, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
114
  """
115
  Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
116
  Supports running a single model or mixing results from two models.
@@ -119,50 +102,40 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         return None, "Please upload an image first."
 
     progress(0, desc="Resizing image...")
-    image = input_image.resize(TARGET_SIZE).convert("RGBA")
+    image_resized = input_image.resize(TARGET_SIZE)
+    image_rgba = image_resized.convert("RGBA")
 
-    # --- Nested function to run inference on a given model ---
+    # --- Nested helper function to run inference on a given model ---
     def run_inference(model_to_run, model_name_desc):
         progress(0.1, desc=f"Preparing inputs for {model_name_desc}...")
-        messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
+        messages = [{"role": "user", "content": [{"type": "image", "image": image_rgba}, {"type": "text", "text": prompt}]}]
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model_to_run.device)
-
-        gen_kwargs = {"max_new_tokens": 4096}
-        if use_greedy:
-            gen_kwargs["do_sample"] = False
-        else:
-            gen_kwargs["do_sample"] = True
-            gen_kwargs["temperature"] = temperature
-            gen_kwargs["top_p"] = top_p
-
+        inputs = processor(text=[text], images=[image_rgba], padding=True, return_tensors="pt").to(model_to_run.device)
+
         progress(0.5, desc=f"Generating layout data with {model_name_desc}...")
         with torch.no_grad():
-            output_ids = model_to_run.generate(**inputs, **gen_kwargs)
+            output_ids = model_to_run.generate(**inputs, max_new_tokens=4096, do_sample=False)
 
         raw_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
 
         try:
            json_match = re.search(r"```json(.*?)```", raw_text, re.DOTALL)
            json_str = json_match.group(1).strip() if json_match else raw_text.strip()
-            parsed_results = json.loads(json_str)
-            return parsed_results, raw_text
+            return json.loads(json_str), raw_text
         except (json.JSONDecodeError, AttributeError):
-            # Return raw text on failure for debugging
-            return None, raw_text
+            return None, raw_text  # Return raw text on failure for debugging
 
     # --- Main logic: single model or mixing ---
     if selected_model_name == MODEL_MIXING_NAME:
-        # Run both models
         base_results, raw_text_base = run_inference(model_base, "Base Model")
         enhanced_results, raw_text_enhanced = run_inference(model_enhanced, "Enhanced Model")
 
         output_text = f"--- Base Model Output ---\n{raw_text_base}\n\n--- Enhanced Model Output ---\n{raw_text_enhanced}"
 
         if base_results is None or enhanced_results is None:
-            return image.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"
+            return image_rgba.convert("RGB"), f"Failed to parse JSON from one or both models:\n\n{output_text}"
 
-        # Merge results
+        # Merge results based on IoU
         progress(0.8, desc="Merging results from both models...")
        merged_results = list(base_results)
        base_bboxes = [item['bbox_2d'] for item in base_results if 'bbox_2d' in item]
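The fenced-JSON parsing kept in `run_inference` is easy to exercise in isolation. A minimal sketch of the same extract-then-fallback logic (the sample model reply is made up for illustration):

````python
import json
import re

def extract_json(raw_text: str):
    """Pull a ```json ...``` fenced block out of model output, falling back to the whole string."""
    match = re.search(r"```json(.*?)```", raw_text, re.DOTALL)
    json_str = match.group(1).strip() if match else raw_text.strip()
    try:
        return json.loads(json_str)
    except (json.JSONDecodeError, AttributeError):
        return None  # caller keeps raw_text around for debugging

reply = 'Here is the layout:\n```json\n[{"label": "title", "bbox_2d": [40, 30, 880, 90], "order": 1}]\n```'
print(extract_json(reply))  # [{'label': 'title', 'bbox_2d': [40, 30, 880, 90], 'order': 1}]
````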
@@ -172,8 +145,7 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
 
             is_duplicate = False
             for base_bbox in base_bboxes:
-                iou = calculate_iou(enhanced_item['bbox_2d'], base_bbox)
-                if iou > 0.5:  # IoU threshold for duplication
+                if calculate_iou(enhanced_item['bbox_2d'], base_bbox) > 0.5:
                     is_duplicate = True
                     break
 
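The merge rule above keeps an enhanced-model detection only when it overlaps every base-model box by 0.5 IoU or less. Since the diff elides most of `calculate_iou`'s body, here is a self-contained sketch using the standard IoU formulation together with the dedup rule on toy boxes:

```python
def calculate_iou(boxA, boxB):
    """IoU of two [x1, y1, x2, y2] boxes (standard formulation; the app's own body is elided in the diff)."""
    xA, yA = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    xB, yB = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])
    inter = max(0, xB - xA) * max(0, yB - yA)
    areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = areaA + areaB - inter
    return inter / union if union > 0 else 0

base = [[0, 0, 100, 100]]
enhanced = [[5, 5, 105, 105], [200, 200, 300, 300]]
merged = list(base)
for box in enhanced:
    if all(calculate_iou(box, b) <= 0.5 for b in base):  # not a duplicate of any base box
        merged.append(box)
print(merged)  # the near-identical box (IoU ≈ 0.82) is dropped; the distant one is kept
```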
@@ -181,17 +153,16 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
                 merged_results.append(enhanced_item)
 
         results = merged_results
-
     else:
         # Run a single model
         model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
         results, output_text = run_inference(model, selected_model_name)
         if results is None:
-            return image.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
+            return image_rgba.convert("RGB"), f"Failed to parse JSON from model output:\n\n{output_text}"
 
     # --- Visualization ---
-    progress(0.9, desc="Visualizing final results...")
-    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
+    progress(0.9, desc="Parsing and visualizing final results...")
+    overlay = Image.new('RGBA', image_rgba.size, (255, 255, 255, 0))
     draw = ImageDraw.Draw(overlay)
 
     try:
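The visualization path explains why the image stays in RGBA until the very end: translucent fills must be drawn on a fully transparent overlay and alpha-composited onto the page before the final RGB conversion. A minimal sketch of the same PIL pattern (box coordinates and colors are illustrative):

```python
from PIL import Image, ImageDraw

page = Image.new("RGBA", (924, 1204), "white")               # stands in for the resized input page
overlay = Image.new("RGBA", page.size, (255, 255, 255, 0))   # fully transparent drawing layer
draw = ImageDraw.Draw(overlay)

# Translucent fill (alpha 90) plus an opaque outline, mirroring LABEL_COLORS / OUTLINE_WIDTH
draw.rectangle([40, 30, 880, 90], fill=(255, 82, 82, 90), outline=(255, 82, 82, 255), width=3)

# alpha_composite requires two RGBA images; convert to RGB only at the end
result = Image.alpha_composite(page, overlay).convert("RGB")
```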
@@ -200,9 +171,7 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         font = ImageFont.load_default()
 
     for item in sorted(results, key=lambda x: x.get("order", 999)):
-        bbox = item.get("bbox_2d")
-        label = item.get("label", "other")
-        order = item.get("order", "")
+        bbox, label, order = item.get("bbox_2d"), item.get("label", "other"), item.get("order", "")
         if not bbox or len(bbox) != 4: continue
 
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
@@ -216,20 +185,23 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         draw.rectangle(tag_bg_box, fill=solid_color_rgb)
         draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
 
-    return Image.alpha_composite(image, overlay).convert("RGB"), output_text
+    visualized_image = Image.alpha_composite(image_rgba, overlay).convert("RGB")
+    return visualized_image, output_text
+
 
 def clear_outputs():
+    """Helper function to clear the output fields."""
     return None, None
 
-def toggle_sampling_params(use_greedy):
-    """Updates visibility of temperature and top-p sliders."""
-    is_visible = not use_greedy
-    return gr.update(visible=is_visible)
-
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
+
     gr.Markdown("# 📄 Academic Paper Layout Detection")
-    gr.Markdown("Welcome! This tool uses Qwen2.5-VL models to detect layout components in academic papers. You can choose the **Latex2Layout** model, an **Enhanced** version, or **Mix** the results of both.")
+    gr.Markdown(
+        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
+        "Upload a document image to begin."
+        "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
+    )
     gr.Markdown("<hr>")
 
     with gr.Row():
@@ -241,38 +213,40 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
     with gr.Row():
         analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
 
+    # --- Advanced Settings Panel ---
     with gr.Accordion("Advanced Settings", open=False):
         model_selector = gr.Radio(
-            choices=MODEL_CHOICES,
-            value=MODEL_MIXING_NAME,  # Default to the new mixing mode
-            label="Select Model"
+            choices=MODEL_CHOICES,
+            value=MODEL_BASE_NAME,
+            label="Select Model",
+            info="Choose which model to use for inference. 'Mixing' combines the results of both."
+        )
+        prompt_textbox = gr.Textbox(
+            label="Prompt",
+            value=DEFAULT_PROMPT,
+            lines=5,
+            info="The prompt used to instruct the model."
         )
-        prompt_textbox = gr.Textbox(label="Prompt", value=DEFAULT_PROMPT, lines=5)
-
-        greedy_checkbox = gr.Checkbox(label="Use Greedy Decoding", value=True, info="Faster and deterministic. Uncheck to enable Temperature and Top-p.")
-
-        temp_slider = gr.Slider(minimum=0.0, maximum=2.0, step=0.05, value=0.7, label="Temperature")
-        top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Top-p")
 
-    output_text = gr.Textbox(label="Model Raw Output", lines=10, interactive=False, visible=True)
-    gr.Examples(examples=[["1.png"], ["2.png"], ["12.png"], ["13.png"], ["14.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]], inputs=[input_image], label="Examples (Click to Run)")
-    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset by Feijiang Han</p>")
+    output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
+
+    gr.Examples(
+        examples=[["1.png"], ["2.png"], ["12.png"], ["13.png"], ["14.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]],
+        inputs=[input_image],
+        label="Examples (Click to Run)",
+    )
+
+    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
 
     # --- Event Handlers ---
     analyze_btn.click(
         fn=analyze_and_visualize_layout,
-        inputs=[input_image, model_selector, prompt_textbox, greedy_checkbox, temp_slider, top_p_slider],
+        inputs=[input_image, model_selector, prompt_textbox],
         outputs=[output_image, output_text]
     )
 
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
-
-    greedy_checkbox.change(
-        fn=toggle_sampling_params,
-        inputs=greedy_checkbox,
-        outputs=[sampling_params]
-    )
 
 # --- 5. Launch the Application ---
 if __name__ == "__main__":
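One more detail in the removed code: the `greedy_checkbox.change` handler targeted `outputs=[sampling_params]`, a name that is never defined in any of the removed UI lines shown in this diff, and `toggle_sampling_params` returned a single update for what were two separate sliders. Had the sampling controls stayed, a working version of that toggle would look roughly like this (a sketch; it updates the two sliders the old code actually defined):

```python
def toggle_sampling_params(use_greedy):
    """Show the temperature and top-p sliders only when sampling is enabled."""
    is_visible = not use_greedy
    # One gr.update per output component
    return gr.update(visible=is_visible), gr.update(visible=is_visible)

greedy_checkbox.change(
    fn=toggle_sampling_params,
    inputs=greedy_checkbox,
    outputs=[temp_slider, top_p_slider],  # the sliders defined in the old Accordion
)
```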
 