|
import gradio as gr |
|
import cv2 |
|
import numpy as np |
|
import os |
|
import json |
|
from PIL import Image |
|
import io |
|
import base64 |
|
from openai import OpenAI |
|
from ultralytics import YOLO |
|
|
|
|
|
# Path to the Latex2Layout YOLOv8 weights, resolved against the working directory.
model_path = "latex2layout_object_detection_yolov8.pt"

# Fail fast with a clear message when the weights file is missing.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

# Load the detector once at import time; detect_layout() reuses this instance.
try:
    model = YOLO(model_path)
except Exception as e:
    # Chain explicitly so the loader's own traceback is preserved as __cause__.
    raise RuntimeError(f"Failed to load Latex2Layout model: {e}") from e
|
|
|
|
# Base URL handed to the OpenAI client: DashScope's OpenAI-compatible endpoint.
QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# Display name shown in the UI dropdown -> model id sent to the API.
# Qwen2.5-VL ships in 3B/7B/32B/72B sizes (there is no 14B variant); verify ids
# against the DashScope model list when adding entries.
QWEN_MODELS = {
    "Qwen2.5-VL-3B-Instruct": "qwen2.5-vl-3b-instruct",
    "Qwen2.5-VL-7B-Instruct": "qwen2.5-vl-7b-instruct",
    "Qwen2.5-VL-32B-Instruct": "qwen2.5-vl-32b-instruct",
    "Qwen2.5-VL-72B-Instruct": "qwen2.5-vl-72b-instruct",
}

# Default system prompt template. The literal "{layout_info}" placeholder is
# substituted with the detection results before the prompt is sent.
default_system_prompt = """You are an assistant specialized in document layout analysis.
The following layout elements were detected in the image (confidence >= 0.5):
{layout_info}
Use this information and the image to answer layout-related questions."""
|
|
def encode_image(image_array):
    """
    Convert a numpy array image to a base64-encoded PNG string.

    Args:
        image_array: Numpy array representing the image, in any layout that
            ``PIL.Image.fromarray`` accepts.

    Returns:
        str: Base64 text of the PNG-encoded image (no ``data:`` URI prefix).

    Raises:
        ValueError: If the array cannot be converted or encoded. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # The buffer only needs to live long enough to collect the PNG bytes.
        with io.BytesIO() as png_buffer:
            Image.fromarray(image_array).save(png_buffer, format="PNG")
            png_bytes = png_buffer.getvalue()
        return base64.b64encode(png_bytes).decode("utf-8")
    except Exception as e:
        # Explicit chaining keeps the original failure visible to callers.
        raise ValueError(f"Failed to encode image: {e}") from e
|
|
def detect_layout(image, confidence_threshold=0.5):
    """
    Detect layout elements in an image using the Latex2Layout model.

    Args:
        image: Image as a numpy array. Callers in this file pass RGB data
            (``detect_example_image`` converts to RGB explicitly; the Gradio
            image component is assumed to deliver RGB as well).
        confidence_threshold: Minimum confidence score to retain detections
            (default: 0.5).

    Returns:
        tuple: (annotated_image, layout_info_str)
            - annotated_image: Copy of ``image`` with a box and label drawn for
              every retained detection, or None on failure.
            - layout_info_str: JSON list of ``{"bbox", "class", "confidence"}``
              entries; a plain message when nothing reaches the threshold or
              when an error occurs.
    """
    if image is None or not isinstance(image, np.ndarray):
        return None, "Error: No image uploaded or invalid image format."

    try:
        # Ultralytics interprets numpy inputs as BGR (OpenCV convention), while
        # this app works in RGB, so swap channels for inference only. Drawing
        # still happens on the RGB copy that is shown to the user.
        if image.ndim == 3 and image.shape[2] == 3:
            inference_input = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        else:
            inference_input = image

        result = model(inference_input)[0]
        annotated_image = image.copy()
        layout_info = []

        for box in result.boxes:
            conf = float(box.conf[0])
            if conf < confidence_threshold:
                continue

            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
            cls_id = int(box.cls[0])
            cls_name = result.names[cls_id]

            _draw_detection(
                annotated_image,
                (x1, y1, x2, y2),
                f"{cls_name} {conf:.2f}",
                _class_color(cls_id),
            )

            layout_info.append({
                "bbox": [x1, y1, x2, y2],
                "class": cls_name,
                "confidence": conf,
            })

        if layout_info:
            layout_info_str = json.dumps(layout_info, indent=2)
        else:
            # Report the threshold actually applied, not a hard-coded 0.5.
            layout_info_str = (
                f"No layout elements detected with confidence >= {confidence_threshold}."
            )
        return annotated_image, layout_info_str

    except Exception as e:
        return None, f"Error during layout detection: {str(e)}"


def _class_color(cls_id):
    """Return a stable color tuple for a class id, so one class always gets one color."""
    # Seeding with the class id makes colors reproducible across calls; the cap
    # at 200 keeps the white label text readable on the filled label background.
    return tuple(np.random.default_rng(cls_id).integers(0, 200, size=3).tolist())


def _draw_detection(canvas, bbox, label, color):
    """Draw one bounding box with a filled, captioned label onto ``canvas`` in place."""
    x1, y1, x2, y2 = bbox
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)

    (label_width, label_height), _ = cv2.getTextSize(label, font, 0.5, 1)
    # Prefer the strip just above the box; when the box touches the top edge
    # that strip would fall outside the image, so place the label inside instead.
    label_top = y1 - label_height - 5
    if label_top < 0:
        label_top = y1
    label_bottom = label_top + label_height + 5

    cv2.rectangle(canvas, (x1, label_top), (x1 + label_width, label_bottom), color, -1)
    cv2.putText(canvas, label, (x1, label_bottom - 5), font, 0.5, (255, 255, 255), 1)
|
|
def detect_example_image():
    """
    Run layout detection on the bundled example image (./image1.png).

    Returns:
        tuple: (example_image, annotated_image, layout_info_str)
            - example_image: The example image in RGB order, or None on failure.
            - annotated_image: Annotated image returned by ``detect_layout``,
              or None on failure.
            - layout_info_str: Text returned by ``detect_layout``, or an error
              message when the example cannot be found, read or processed.
    """
    sample_path = "./image1.png"
    if not os.path.exists(sample_path):
        return None, None, "Error: Example image not found."

    try:
        decoded = cv2.imread(sample_path)
        if decoded is None:
            # imread signals an unreadable file with None instead of raising.
            return None, None, "Error: Failed to load example image."

        # OpenCV decodes to BGR; convert so the image matches the RGB arrays
        # used everywhere else in the app.
        example_rgb = cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)
        boxed_image, detections_text = detect_layout(example_rgb)
    except Exception as e:
        return None, None, f"Error processing example image: {str(e)}"

    return example_rgb, boxed_image, detections_text
|
|
def qa_about_layout(image, question, layout_info, api_key, model_name, system_prompt_template):
    """
    Answer layout-related questions using the Qwen API with an editable system prompt.

    Args:
        image: Uploaded image as a numpy array.
        question: User's question about the layout. Surrounding whitespace is
            ignored; a whitespace-only question counts as missing.
        layout_info: JSON string of layout detection results.
        api_key: User's Qwen API key. Surrounding whitespace is ignored.
        model_name: Selected Qwen model name (a key of ``QWEN_MODELS``).
        system_prompt_template: Editable system prompt template; every literal
            ``{layout_info}`` in it is replaced with ``layout_info``.

    Returns:
        str: Qwen's response to the question, or an ``Error: ...`` message when
        an input is missing or the request fails.
    """
    if image is None or not isinstance(image, np.ndarray):
        return "Error: Please upload a valid image."

    # Pasted text often carries stray spaces or newlines. Trim them so that a
    # blank question is rejected and a padded key does not break authentication.
    if isinstance(question, str):
        question = question.strip()
    if isinstance(api_key, str):
        api_key = api_key.strip()

    if not question:
        return "Error: Please enter a question."
    if not api_key:
        return "Error: Please provide a Qwen API key."
    if not layout_info:
        return "Error: No layout information available. Detect layout first."

    try:
        # Validate the cheap thing first so a bad selection does not pay for
        # PNG-encoding the image.
        model_id = QWEN_MODELS.get(model_name)
        if not model_id:
            return "Error: Invalid Qwen model selected."

        base64_image = encode_image(image)

        # str.replace (not str.format) so other braces in the user-edited
        # template or in the JSON layout text are left untouched.
        system_prompt = system_prompt_template.replace("{layout_info}", layout_info)

        client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)

        messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                    {"type": "text", "text": question},
                ],
            },
        ]

        completion = client.chat.completions.create(model=model_id, messages=messages)
        return completion.choices[0].message.content

    except Exception as e:
        return f"Error during QA: {str(e)}"
|
|
|
|
# Gradio UI: components are laid out in creation order inside their Row/Column.
with gr.Blocks(title="Latex2Layout QA System") as demo:
    gr.Markdown("# Latex2Layout QA System")
    gr.Markdown("Upload an image or use the example to detect layout elements and ask questions using Qwen models.")

    # Top row: image input and controls on the left, detection results on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="numpy")
            detect_btn = gr.Button("Detect Layout")
            example_btn = gr.Button("Detect Example Image")
            gr.Markdown("**Tip**: Use clear images for best results.")

        with gr.Column(scale=1):
            output_image = gr.Image(label="Detected Layout")
            layout_info = gr.Textbox(label="Layout Information", lines=10, interactive=False)

    # Bottom row: Qwen settings and the question on the left, the answer on the right.
    with gr.Row():
        with gr.Column(scale=1):
            api_key_input = gr.Textbox(label="Qwen API Key", placeholder="Enter your Qwen API key", type="password")
            model_select = gr.Dropdown(
                label="Select Qwen Model",
                choices=list(QWEN_MODELS),
                value="Qwen2.5-VL-3B-Instruct",
            )
            gr.Markdown("**System Prompt Template**: Edit the prompt sent to Qwen. Include `{layout_info}` to insert detection results.")
            system_prompt_input = gr.Textbox(
                label="System Prompt Template",
                value=default_system_prompt,
                lines=5,
                placeholder="Edit the system prompt here. Keep {layout_info} to include detection results.",
            )
            question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
            qa_btn = gr.Button("Ask Question")

        with gr.Column(scale=1):
            answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Event wiring: each button feeds the listed components to its handler and
    # writes the handler's return values to the listed outputs, in order.
    detect_btn.click(detect_layout, inputs=[input_image], outputs=[output_image, layout_info])
    # The example handler also fills the input image so follow-up questions use it.
    example_btn.click(detect_example_image, inputs=[], outputs=[input_image, output_image, layout_info])
    qa_btn.click(
        qa_about_layout,
        inputs=[input_image, question_input, layout_info, api_key_input, model_select, system_prompt_input],
        outputs=[answer_output],
    )


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|