Spaces:

ChaseHan
/

Latex2Layout-Qwen2.5VL

Running on Zero

App Files Files Community

ChaseHan committed on Jul 15

Commit

12c272a

verified ·

1 Parent(s): 7c41462

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -63

app.py CHANGED Viewed

@@ -44,7 +44,6 @@ DEFAULT_PROMPT = (
 print("Loading models, this will take some time and VRAM...")
 try:
     # WARNING: Loading two 3B models without quantization requires a large amount of VRAM (>12 GB).
-    # This may fail on hardware with insufficient memory.
     print(f"Loading {MODEL_BASE_NAME}...")
     model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_BASE_ID,
@@ -59,7 +58,6 @@ try:
         device_map="auto"
     )
-    # Processor is the same for both models
     processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
     print("All models loaded successfully!")
 except Exception as e:
@@ -68,7 +66,7 @@ except Exception as e:
 # --- 3. Core Inference and Visualization Function ---
 @GPU
-def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, progress=gr.Progress(track_tqdm=True)):
     """
     Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
     """
@@ -76,39 +74,31 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         return None, "Please upload an image first."
     # Select the model based on user's choice
-    if selected_model_name == MODEL_BASE_NAME:
-        model = model_base
-    else:
-        model = model_enhanced
     progress(0, desc=f"Resizing image for {selected_model_name}...")
-    image = input_image.resize(TARGET_SIZE)
-    image = image.convert("RGBA")
-    messages = [
-        {"role": "user", "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": prompt}
-        ]}
-    ]
     progress(0.2, desc="Preparing model inputs...")
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
     progress(0.5, desc="Generating layout data...")
     with torch.no_grad():
-        # Use greedy search (do_sample=False) for faster, deterministic output
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=4096,
-            do_sample=False
-        )
-    output_text = processor.batch_decode(
-        output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
-    )[0]
     progress(0.8, desc="Parsing and visualizing results...")
     try:
@@ -127,41 +117,34 @@ def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name:
         font = ImageFont.load_default()
     for item in sorted(results, key=lambda x: x.get("order", 999)):
-        bbox = item.get("bbox_2d")
-        label = item.get("label", "other")
-        order = item.get("order", "")
         if not bbox or len(bbox) != 4: continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
         solid_color_rgb = fill_color_rgba[:3]
         draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)
         tag_text = f"{order}: {label}"
         tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
         tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
         tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
         draw.rectangle(tag_bg_box, fill=solid_color_rgb)
         draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
-    visualized_image = Image.alpha_composite(image, overlay).convert("RGB")
-    return visualized_image, output_text
 def clear_outputs():
-    """Helper function to clear the output fields."""
     return None, None
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
-    gr.Markdown(
-        "Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model fine-tuned on our Latex2Layout annotated layout dataset to identify layout regions in academic papers. "
-        "Upload a document image to begin."
-        "\n> **Please note:** All uploaded images are automatically resized to 924x1204 pixels to meet the model's input requirements."
-    )
     gr.Markdown("<hr>")
     with gr.Row():
@@ -173,40 +156,38 @@ with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection")
     with gr.Row():
          analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
-    # --- Advanced Settings Panel ---
     with gr.Accordion("Advanced Settings", open=False):
-        model_selector = gr.Radio(
-            choices=MODEL_CHOICES,
-            value=MODEL_BASE_NAME,
-            label="Select Model",
-            info="Choose which model to use for inference."
-        )
-        prompt_textbox = gr.Textbox(
-            label="Prompt",
-            value=DEFAULT_PROMPT,
-            lines=5,
-            info="The prompt used to instruct the model."
-        )
     output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
-    gr.Examples(
-        examples=[["1.png"], ["2.png"], ["12.png"], ["13.png"], ["14.png"], ["11.png"], ["3.png"], ["7.png"], ["8.png"]],
-        inputs=[input_image],
-        label="Examples (Click to Run)",
-    )
-    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset generated by Feijiang Han</p>")
     # --- Event Handlers ---
     analyze_btn.click(
         fn=analyze_and_visualize_layout,
-        inputs=[input_image, model_selector, prompt_textbox],
         outputs=[output_image, output_text]
     )
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
 # --- 5. Launch the Application ---
 if __name__ == "__main__":

 print("Loading models, this will take some time and VRAM...")
 try:
     # WARNING: Loading two 3B models without quantization requires a large amount of VRAM (>12 GB).
     print(f"Loading {MODEL_BASE_NAME}...")
     model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_BASE_ID,
         device_map="auto"
     )
     processor = AutoProcessor.from_pretrained(MODEL_BASE_ID)
     print("All models loaded successfully!")
 except Exception as e:
 # --- 3. Core Inference and Visualization Function ---
 @GPU
+def analyze_and_visualize_layout(input_image: Image.Image, selected_model_name: str, prompt: str, use_greedy: bool, temperature: float, top_p: float, progress=gr.Progress(track_tqdm=True)):
     """
     Takes an image and model parameters, runs inference, and returns a visualized image and raw text output.
     """
         return None, "Please upload an image first."
     # Select the model based on user's choice
+    model = model_base if selected_model_name == MODEL_BASE_NAME else model_enhanced
     progress(0, desc=f"Resizing image for {selected_model_name}...")
+    image = input_image.resize(TARGET_SIZE).convert("RGBA")
+    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
     progress(0.2, desc="Preparing model inputs...")
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(model.device)
+    # Dynamically build generation arguments based on user's choice
+    gen_kwargs = {"max_new_tokens": 4096}
+    if use_greedy:
+        gen_kwargs["do_sample"] = False
+    else:
+        gen_kwargs["do_sample"] = True
+        gen_kwargs["temperature"] = temperature
+        gen_kwargs["top_p"] = top_p
     progress(0.5, desc="Generating layout data...")
     with torch.no_grad():
+        output_ids = model.generate(**inputs, **gen_kwargs)
+    output_text = processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
     progress(0.8, desc="Parsing and visualizing results...")
     try:
         font = ImageFont.load_default()
     for item in sorted(results, key=lambda x: x.get("order", 999)):
+        bbox, label, order = item.get("bbox_2d"), item.get("label", "other"), item.get("order", "")
         if not bbox or len(bbox) != 4: continue
         fill_color_rgba = LABEL_COLORS.get(label, LABEL_COLORS["other"])
         solid_color_rgb = fill_color_rgba[:3]
         draw.rectangle(bbox, fill=fill_color_rgba, outline=solid_color_rgb, width=OUTLINE_WIDTH)
         tag_text = f"{order}: {label}"
         tag_bbox = draw.textbbox((0, 0), tag_text, font=font)
         tag_w, tag_h = tag_bbox[2] - tag_bbox[0], tag_bbox[3] - tag_bbox[1]
         tag_bg_box = [bbox[0], bbox[1], bbox[0] + tag_w + 10, bbox[1] + tag_h + 6]
         draw.rectangle(tag_bg_box, fill=solid_color_rgb)
         draw.text((bbox[0] + 5, bbox[1] + 3), tag_text, font=font, fill="white")
+    return Image.alpha_composite(image, overlay).convert("RGB"), output_text
 def clear_outputs():
     """Return a ``(None, None)`` pair, one ``None`` for each output bound to this handler."""
     # The upload/clear handlers bind this to [output_image, output_text].
     cleared_image = cleared_text = None
     return cleared_image, cleared_text
+def toggle_sampling_params(use_greedy):
+    """Show or hide the row holding the Temperature and Top-p sliders.
+
+    The checkbox change handler binds this function to a single output (the
+    ``sampling_params`` row), so exactly one update is returned. Returning two
+    values would not match that one-output binding.
+
+    Args:
+        use_greedy: Current state of the "Use Greedy Decoding" checkbox.
+
+    Returns:
+        A ``gr.update`` that makes the row visible only when greedy decoding
+        is turned off.
+    """
+    return gr.update(visible=not use_greedy)
 # --- 4. Gradio User Interface ---
 with gr.Blocks(theme=gr.themes.Glass(), title="Academic Paper Layout Detection") as demo:
     gr.Markdown("# 📄 Academic Paper Layout Detection")
+    gr.Markdown("Welcome! This tool uses a Qwen2.5-VL-3B-Instruct model...") # Truncated for brevity
     gr.Markdown("<hr>")
     with gr.Row():
     with gr.Row():
          analyze_btn = gr.Button("✨ Analyze Layout", variant="primary", scale=1)
     with gr.Accordion("Advanced Settings", open=False):
+        model_selector = gr.Radio(choices=MODEL_CHOICES, value=MODEL_BASE_NAME, label="Select Model")
+        prompt_textbox = gr.Textbox(label="Prompt", value=DEFAULT_PROMPT, lines=5)
+        # NEW: Checkbox to toggle between greedy and sampling
+        greedy_checkbox = gr.Checkbox(label="Use Greedy Decoding", value=True, info="Faster and deterministic. Uncheck to enable Temperature and Top-p.")
+        # NEW: Sliders are initially hidden
+        with gr.Row(visible=False) as sampling_params:
+            temp_slider = gr.Slider(minimum=0.0, maximum=2.0, step=0.05, value=0.7, label="Temperature")
+            top_p_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Top-p")
     output_text = gr.Textbox(label="Model Raw Output", lines=8, interactive=False, visible=True)
+    gr.Examples(examples=[["1.png"], ["2.png"], ["10.png"]], inputs=[input_image], label="Examples (Click to Run)")
+    gr.Markdown("<p style='text-align:center; color:grey;'>Powered by the Latex2Layout dataset by Feijiang Han</p>")
     # --- Event Handlers ---
     analyze_btn.click(
         fn=analyze_and_visualize_layout,
+        inputs=[input_image, model_selector, prompt_textbox, greedy_checkbox, temp_slider, top_p_slider],
         outputs=[output_image, output_text]
     )
     input_image.upload(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
     input_image.clear(fn=clear_outputs, inputs=None, outputs=[output_image, output_text])
+    # NEW: Event handler to show/hide sliders
+    greedy_checkbox.change(
+        fn=toggle_sampling_params,
+        inputs=greedy_checkbox,
+        outputs=[sampling_params]
+    )
 # --- 5. Launch the Application ---
 if __name__ == "__main__":