Spaces:

ElektrikSpark
/

VLM-playground

Running

VLM-playground / src /vlm_playground /preview_app.py

trevorpfiz

fix: unexpected keyword argument 'file_name'

4aa9a45 21 days ago

28.9 kB

	import gc
	import hashlib
	import json
	import math
	import os
	import re
	from io import BytesIO
	from typing import Any, Dict, List, Optional, Tuple

	import fitz # PyMuPDF
	import gradio as gr
	import requests
	import torch
	from huggingface_hub import snapshot_download
	from PIL import Image, ImageDraw, ImageFont
	from qwen_vl_utils import process_vision_info
	from transformers import AutoModelForCausalLM, AutoProcessor

	from .utils.constants import IMAGE_FACTOR, MAX_PIXELS, MIN_PIXELS
	from .utils.prompts import dict_promptmode_to_prompt

	# ============================
	# Constants and configuration
	# ============================
	APP_TITLE = "PreviewSpace — VLM Playground"
	TMP_DIR = "/tmp/previewspace"
	MODELS_DIR = os.path.join(TMP_DIR, "models")
	DOTS_REPO_ID = "rednote-hilab/dots.ocr"
	DOTS_LOCAL_DIR = os.path.join(MODELS_DIR, "dots.ocr")

	DEFAULT_PROMPT = dict_promptmode_to_prompt.get(
	"prompt_layout_all_en",
	(
	"Please output the layout information from the PDF page image. For each element, return: "
	'bbox: [x1, y1, x2, y2], category from {"title","header","paragraph","table","figure","footnote"}, and text. '
	'Return JSON: {"elements": [{"bbox": [..], "category": "..", "text": ".."}], "page": <number>}'
	),
	)


	os.makedirs(TMP_DIR, exist_ok=True)
	os.makedirs(MODELS_DIR, exist_ok=True)


	# ===========
	# Utilities
	# ===========
	def round_by_factor(number: int, factor: int) -> int:
	return round(number / factor) * factor


	def smart_resize(
	height: int,
	width: int,
	factor: int = IMAGE_FACTOR,
	min_pixels: int = MIN_PIXELS,
	max_pixels: int = MAX_PIXELS,
	) -> Tuple[int, int]:
	if max(height, width) / min(height, width) > 200:
	raise ValueError("absolute aspect ratio must be smaller than 200")
	h_bar = max(factor, round_by_factor(height, factor))
	w_bar = max(factor, round_by_factor(width, factor))

	if h_bar * w_bar > max_pixels:
	beta = math.sqrt((height * width) / max_pixels)
	h_bar = round_by_factor(height / beta, factor)
	w_bar = round_by_factor(width / beta, factor)
	elif h_bar * w_bar < min_pixels:
	beta = math.sqrt(min_pixels / (height * width))
	h_bar = round_by_factor(height * beta, factor)
	w_bar = round_by_factor(width * beta, factor)
	return int(h_bar), int(w_bar)


	def fetch_image(
	image_input: Any,
	min_pixels: Optional[int] = None,
	max_pixels: Optional[int] = None,
	) -> Image.Image:
	if isinstance(image_input, str):
	if image_input.startswith(("http://", "https://")):
	response = requests.get(image_input, timeout=60)
	image = Image.open(BytesIO(response.content)).convert("RGB")
	else:
	image = Image.open(image_input).convert("RGB")
	elif isinstance(image_input, Image.Image):
	image = image_input.convert("RGB")
	else:
	raise ValueError(f"Invalid image input type: {type(image_input)}")

	if min_pixels is not None or max_pixels is not None:
	min_pixels = min_pixels or MIN_PIXELS
	max_pixels = max_pixels or MAX_PIXELS
	new_h, new_w = smart_resize(
	image.height,
	image.width,
	factor=IMAGE_FACTOR,
	min_pixels=min_pixels,
	max_pixels=max_pixels,
	)
	image = image.resize((new_w, new_h), Image.LANCZOS)
	return image


	def load_images_from_pdf(pdf_path: str) -> List[Image.Image]:
	images: List[Image.Image] = []
	pdf_document = fitz.open(pdf_path)
	try:
	for page_idx in range(len(pdf_document)):
	page = pdf_document.load_page(page_idx)
	pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
	img_data = pix.tobytes("ppm")
	image = Image.open(BytesIO(img_data)).convert("RGB")
	images.append(image)
	finally:
	pdf_document.close()
	return images


	def file_checksum(path: str, chunk_size: int = 1 << 20) -> str:
	hasher = hashlib.sha256()
	with open(path, "rb") as f:
	while True:
	chunk = f.read(chunk_size)
	if not chunk:
	break
	hasher.update(chunk)
	return hasher.hexdigest()


	def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.Image:
	img = image.copy()
	draw = ImageDraw.Draw(img)
	colors = {
	"Caption": "#FF6B6B",
	"Footnote": "#4ECDC4",
	"Formula": "#45B7D1",
	"List-item": "#96CEB4",
	"Page-footer": "#FFEAA7",
	"Page-header": "#DDA0DD",
	"Picture": "#FFD93D",
	"Section-header": "#6C5CE7",
	"Table": "#FD79A8",
	"Text": "#74B9FF",
	"Title": "#E17055",
	}

	try:
	try:
	font = ImageFont.truetype(
	"/System/Library/Fonts/Supplemental/Arial Bold.ttf", 12
	)
	except Exception:
	try:
	font = ImageFont.truetype(
	"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 12
	)
	except Exception:
	font = ImageFont.load_default()

	for item in layout_data:
	bbox = item.get("bbox")
	category = item.get("category")
	if not bbox or not category:
	continue
	color = colors.get(category, "#000000")
	draw.rectangle(bbox, outline=color, width=2)
	label = str(category)
	label_bbox = draw.textbbox((0, 0), label, font=font)
	label_w = label_bbox[2] - label_bbox[0]
	label_h = label_bbox[3] - label_bbox[1]
	x1, y1 = int(bbox[0]), int(bbox[1])
	lx = x1
	ly = max(0, y1 - label_h - 2)
	draw.rectangle([lx, ly, lx + label_w + 4, ly + label_h + 2], fill=color)
	draw.text((lx + 2, ly + 1), label, fill="white", font=font)
	except Exception:
	pass
	return img


	def is_arabic_text(text: str) -> bool:
	if not text:
	return False
	header_pattern = r"^#{1,6}\s+(.+)$"
	paragraph_pattern = r"^(?!#{1,6}\s\|!\[\|```\|\\|\|\s[-+]\s\|\s*\d+\.\s)(.+)$"
	content_lines: List[str] = []
	for line in text.split("\n"):
	s = line.strip()
	if not s:
	continue
	m = re.match(header_pattern, s)
	if m:
	content_lines.append(m.group(1))
	continue
	if re.match(paragraph_pattern, s):
	content_lines.append(s)
	if not content_lines:
	return False
	combined = " ".join(content_lines)
	arabic = 0
	total = 0
	for ch in combined:
	if ch.isalpha():
	total += 1
	if (
	("\u0600" <= ch <= "\u06ff")
	or ("\u0750" <= ch <= "\u077f")
	or ("\u08a0" <= ch <= "\u08ff")
	):
	arabic += 1
	if total == 0:
	return False
	return (arabic / total) > 0.5


	def extract_json(text: str) -> Optional[Dict[str, Any]]:
	if not text:
	return None
	try:
	return json.loads(text)
	except Exception:
	pass
	# Try to extract JSON block
	brace_start = text.find("{")
	brace_end = text.rfind("}")
	if 0 <= brace_start < brace_end:
	snippet = text[brace_start : brace_end + 1]
	try:
	return json.loads(snippet)
	except Exception:
	pass
	fenced = re.findall(r"```json\s([\s\S]?)\s*```", text)
	for block in fenced:
	try:
	return json.loads(block)
	except Exception:
	continue
	return None


	def layoutjson2md(
	image: Image.Image, layout_data: List[Dict], text_key: str = "text"
	) -> str:
	lines: List[str] = []
	try:
	items = sorted(
	layout_data,
	key=lambda x: (
	x.get("bbox", [0, 0, 0, 0])[1],
	x.get("bbox", [0, 0, 0, 0])[0],
	),
	)
	for item in items:
	category = item.get("category", "")
	text = item.get(text_key, "")
	if category == "Title" and text:
	lines.append(f"# {text}\n")
	elif category == "Section-header" and text:
	lines.append(f"## {text}\n")
	elif category == "List-item" and text:
	lines.append(f"- {text}\n")
	elif category == "Table" and text:
	if text.strip().startswith("<"):
	lines.append(text + "\n")
	else:
	lines.append(f"Table: {text}\n")
	elif category == "Formula" and text:
	if text.strip().startswith("$") or "\\" in text:
	lines.append(f"$$\n{text}\n$$\n")
	else:
	lines.append(f"Formula: {text}\n")
	elif category == "Caption" and text:
	lines.append(f"{text}\n")
	elif category in ["Page-header", "Page-footer"]:
	continue
	elif category == "Picture":
	# Skip embedding image fragments in markdown for now
	continue
	elif text:
	lines.append(f"{text}\n")
	lines.append("")
	except Exception:
	return json.dumps(layout_data, ensure_ascii=False)
	return "\n".join(lines)


	# =====================
	# Model initialization
	# =====================
	model: Optional[AutoModelForCausalLM] = None
	processor: Optional[AutoProcessor] = None
	device = (
	"cuda"
	if torch.cuda.is_available()
	else ("mps" if torch.backends.mps.is_available() else "cpu")
	)


	def get_torch_dtype() -> torch.dtype:
	if device == "cuda":
	return torch.bfloat16
	if device == "mps":
	return torch.float16
	return torch.float32


	def ensure_model_loaded() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
	global model, processor
	if model is not None and processor is not None:
	return model, processor

	os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
	snapshot_download(
	repo_id=DOTS_REPO_ID,
	local_dir=DOTS_LOCAL_DIR,
	local_dir_use_symlinks=False,
	)

	dtype = get_torch_dtype()

	model = AutoModelForCausalLM.from_pretrained(
	DOTS_LOCAL_DIR,
	torch_dtype=dtype,
	device_map="auto",
	trust_remote_code=True,
	)
	proc = AutoProcessor.from_pretrained(DOTS_LOCAL_DIR, trust_remote_code=True)
	processor = proc
	return model, processor


	def run_inference(
	image: Image.Image, prompt_text: str, max_new_tokens: int = 24000
	) -> str:
	mdl, proc = ensure_model_loaded()
	messages = [
	{
	"role": "user",
	"content": [
	{"type": "image", "image": image},
	{"type": "text", "text": prompt_text},
	],
	}
	]
	text = proc.apply_chat_template(
	messages, tokenize=False, add_generation_prompt=True
	)
	image_inputs, video_inputs = process_vision_info(messages)
	inputs = proc(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
	with torch.no_grad():
	generated_ids = mdl.generate(
	**inputs,
	max_new_tokens=int(max_new_tokens),
	do_sample=False,
	temperature=0.1,
	)
	trimmed = [
	out_ids[len(in_ids) :]
	for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
	]
	output_text = processor.batch_decode(
	trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	)
	return output_text[0] if output_text else ""


	def process_single_image(
	image: Image.Image,
	prompt_text: str,
	min_pixels: Optional[int],
	max_pixels: Optional[int],
	max_new_tokens: int,
	) -> Dict[str, Any]:
	img = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
	raw = run_inference(img, prompt_text, max_new_tokens=max_new_tokens)
	result: Dict[str, Any] = {
	"original_image": img,
	"processed_image": img,
	"raw_output": raw,
	"layout_result": None,
	"markdown": None,
	}
	data = extract_json(raw)
	if isinstance(data, dict):
	result["layout_result"] = data
	items = data.get("elements", data.get("elements_list", data.get("content", [])))
	if isinstance(items, list):
	result["processed_image"] = draw_layout_on_image(img, items)
	result["markdown"] = layoutjson2md(img, items)
	if result["markdown"] is None:
	result["markdown"] = raw
	return result


	# =================
	# Gradio Interface
	# =================
	def create_blocks_app():
	css = """
	.main-container { max-width: 1500px; margin: 0 auto; }
	.header-text { text-align: center; color: #1f2937; margin-bottom: 12px; }
	.page-info { text-align: center; padding: 8px 16px; border-radius: 20px; font-weight: 600; }
	.process-button { border: none !important; color: white !important; font-weight: 700 !important; }
	"""

	with gr.Blocks(theme=gr.themes.Soft(), css=css, title=APP_TITLE) as demo:
	# App state
	doc_state = gr.State(
	{
	"images": [],
	"current_page": 0,
	"total_pages": 0,
	"file_type": None,
	"checksum": None,
	"results": [],
	"parsed": False,
	}
	)

	cache_state = gr.State({}) # (checksum, page, prompt_hash) -> result

	gr.HTML(
	"""
	<div class=\"header-text\">
	<h2>VLM Playground — dots.ocr</h2>
	<p>Upload a PDF or image, preview pages, and parse with a layout-extraction prompt.</p>
	</div>
	"""
	)

	with gr.Row(elem_classes=["main-container"]):
	# Left: upload + controls
	with gr.Column(scale=4):
	file_input = gr.File(
	label="Upload PDF or Image",
	file_types=[
	".pdf",
	".png",
	".jpg",
	".jpeg",
	".bmp",
	".tiff",
	".webp",
	],
	type="filepath",
	)

	with gr.Group():
	template = gr.Dropdown(
	label="Prompt Template",
	choices=["Layout Extraction"],
	value="Layout Extraction",
	)
	prompt_text = gr.Textbox(
	label="Current Prompt",
	value=DEFAULT_PROMPT,
	lines=6,
	)

	with gr.Row():
	parse_button = gr.Button(
	"Parse", variant="primary", elem_classes=["process-button"]
	)
	clear_button = gr.Button("Clear")

	with gr.Accordion("Advanced", open=False):
	max_new_tokens = gr.Slider(
	minimum=512,
	maximum=32000,
	value=24000,
	step=256,
	label="Max new tokens",
	)
	min_pixels_in = gr.Number(value=MIN_PIXELS, label="Min pixels")
	max_pixels_in = gr.Number(value=MAX_PIXELS, label="Max pixels")
	page_range = gr.Textbox(
	label="Page selection",
	placeholder="e.g., 1-3,5 (blank = current page, 'all' = all pages)",
	)

	# Center: page preview + nav
	with gr.Column(scale=5):
	preview_image = gr.Image(label="Page Preview", type="pil", height=520)
	with gr.Row():
	prev_btn = gr.Button("◀ Prev")
	page_info = gr.HTML('<div class="page-info">No file</div>')
	next_btn = gr.Button("Next ▶")
	with gr.Row():
	page_jump = gr.Number(value=1, label="Page #", precision=0)
	jump_btn = gr.Button("Go")

	# Right: results
	with gr.Column(scale=6):
	with gr.Tabs():
	with gr.Tab("Markdown Render"):
	md_render = gr.Markdown(
	value="Upload and parse to view results", height=520
	)
	with gr.Tab("Raw Markdown"):
	md_raw = gr.Textbox(value="", lines=20)
	with gr.Tab("Current Page JSON"):
	json_view = gr.JSON(value=None)
	with gr.Tab("Processed Image"):
	processed_view = gr.Image(type="pil", height=520)

	with gr.Row():
	download_jsonl = gr.DownloadButton(label="Download JSONL")
	download_markdown = gr.DownloadButton(label="Download Markdown")

	# ===== Handlers =====
	def on_template_change(choice: str) -> str:
	return DEFAULT_PROMPT

	def on_file_change(path: Optional[str]):
	if not path or not os.path.exists(path):
	return (
	{
	"images": [],
	"current_page": 0,
	"total_pages": 0,
	"file_type": None,
	"checksum": None,
	"results": [],
	"parsed": False,
	},
	None,
	'<div class="page-info">No file</div>',
	)
	checksum = file_checksum(path)
	ext = os.path.splitext(path)[1].lower()
	if ext == ".pdf":
	images = load_images_from_pdf(path)
	state = {
	"images": images,
	"current_page": 0,
	"total_pages": len(images),
	"file_type": "pdf",
	"checksum": checksum,
	"results": [None] * len(images),
	"parsed": False,
	}
	return (
	state,
	images[0] if images else None,
	f'<div class="page-info">Page 1 / {len(images)}</div>',
	)
	else:
	image = Image.open(path).convert("RGB")
	state = {
	"images": [image],
	"current_page": 0,
	"total_pages": 1,
	"file_type": "image",
	"checksum": checksum,
	"results": [None],
	"parsed": False,
	}
	return state, image, '<div class="page-info">Page 1 / 1</div>'

	def nav_page(state: Dict[str, Any], direction: str):
	if not state.get("images"):
	return (
	state,
	None,
	'<div class="page-info">No file</div>',
	"No results",
	"",
	None,
	None,
	)
	if direction == "prev":
	state["current_page"] = max(0, state["current_page"] - 1)
	elif direction == "next":
	state["current_page"] = min(
	state["total_pages"] - 1, state["current_page"] + 1
	)
	idx = state["current_page"]
	img = state["images"][idx]
	info = (
	f'<div class="page-info">Page {idx + 1} / {state["total_pages"]}</div>'
	)
	result = (
	state["results"][idx]
	if state.get("parsed") and idx < len(state["results"])
	else None
	)
	md = result.get("markdown") if result else "Page not processed yet"
	md_out = gr.update(value=md, rtl=True) if is_arabic_text(md) else md
	md_raw_text = md
	proc_img = result.get("processed_image") if result else None
	js = result.get("layout_result") if result else None
	return state, img, info, md_out, md_raw_text, proc_img, js

	def jump_to_page(state: Dict[str, Any], page_num: Any):
	if not state.get("images"):
	return (
	state,
	None,
	'<div class="page-info">No file</div>',
	"No results",
	"",
	None,
	None,
	)
	try:
	n = int(page_num)
	except Exception:
	n = 1
	n = max(1, min(state["total_pages"], n))
	state["current_page"] = n - 1
	return nav_page(state, direction="stay")

	def parse_pages(
	state: Dict[str, Any],
	prompt: str,
	max_tokens: int,
	min_pix: Optional[float],
	max_pix: Optional[float],
	selection: Optional[str],
	):
	if not state.get("images"):
	return state, None, "No file", "No content", "", None, None

	# Determine pages to process
	indices: List[int] = []
	if not selection or selection.strip() == "":
	indices = [state["current_page"]]
	elif selection.strip().lower() == "all":
	indices = list(range(state["total_pages"]))
	else:
	# parse like 1-3,5
	parts = [p.strip() for p in selection.split(",") if p.strip()]
	for p in parts:
	if "-" in p:
	a, b = p.split("-", 1)
	try:
	a_i = max(1, int(a))
	b_i = min(state["total_pages"], int(b))
	for i in range(a_i - 1, b_i):
	indices.append(i)
	except Exception:
	continue
	else:
	try:
	i = max(1, min(state["total_pages"], int(p)))
	indices.append(i - 1)
	except Exception:
	continue
	indices = sorted(
	set([i for i in indices if 0 <= i < state["total_pages"]])
	)

	# Process sequentially for stability
	results = state.get("results") or [None] * state["total_pages"]
	for i in indices:
	img = state["images"][i]
	prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:16]
	cache_key = (
	state["checksum"],
	i,
	prompt_hash,
	int(min_pix or 0),
	int(max_pix or 0),
	int(max_tokens),
	)
	cached = cache_state.value.get(cache_key)
	if cached:
	results[i] = cached
	continue
	res = process_single_image(
	img,
	prompt_text=prompt,
	min_pixels=int(min_pix) if min_pix else None,
	max_pixels=int(max_pix) if max_pix else None,
	max_new_tokens=int(max_tokens),
	)
	results[i] = res
	cache_state.value[cache_key] = res
	state["results"] = results
	state["parsed"] = True

	# Return current page outputs
	idx = state["current_page"]
	curr = results[idx]
	md = curr.get("markdown") if curr else "No content"
	md_out = gr.update(value=md, rtl=True) if is_arabic_text(md) else md
	md_raw_text = md
	proc_img = curr.get("processed_image") if curr else None
	js = curr.get("layout_result") if curr else None
	info = (
	f'<div class="page-info">Page {idx + 1} / {state["total_pages"]}</div>'
	)
	prev = state["images"][idx]
	return state, prev, info, md_out, md_raw_text, proc_img, js

	def clear_all():
	gc.collect()
	return (
	{
	"images": [],
	"current_page": 0,
	"total_pages": 0,
	"file_type": None,
	"checksum": None,
	"results": [],
	"parsed": False,
	},
	None,
	'<div class="page-info">No file</div>',
	"Upload and parse to view results",
	"",
	None,
	None,
	)

	def download_current_jsonl(state: Dict[str, Any]):
	if not state.get("parsed"):
	return gr.DownloadButton.update(value=b"")
	lines: List[str] = []
	for i, res in enumerate(state.get("results", [])):
	if res and res.get("layout_result") is not None:
	obj = {"page": i + 1, "layout": res["layout_result"]}
	lines.append(json.dumps(obj, ensure_ascii=False))
	content = "\n".join(lines) if lines else ""
	out_path = os.path.join(TMP_DIR, "results.jsonl")
	with open(out_path, "w", encoding="utf-8") as f:
	f.write(content)
	return gr.DownloadButton.update(value=out_path)

	def download_current_markdown(state: Dict[str, Any]):
	if not state.get("parsed"):
	return gr.DownloadButton.update(value=b"")
	chunks: List[str] = []
	for i, res in enumerate(state.get("results", [])):
	if res and res.get("markdown"):
	chunks.append(f"## Page {i + 1}\n\n{res['markdown']}")
	content = "\n\n---\n\n".join(chunks) if chunks else ""
	out_path = os.path.join(TMP_DIR, "results.md")
	with open(out_path, "w", encoding="utf-8") as f:
	f.write(content)
	return gr.DownloadButton.update(value=out_path)

	# Wire events
	template.change(on_template_change, inputs=[template], outputs=[prompt_text])
	file_input.change(
	on_file_change,
	inputs=[file_input],
	outputs=[doc_state, preview_image, page_info],
	)
	prev_btn.click(
	lambda s: nav_page(s, "prev"),
	inputs=[doc_state],
	outputs=[
	doc_state,
	preview_image,
	page_info,
	md_render,
	md_raw,
	processed_view,
	json_view,
	],
	)
	next_btn.click(
	lambda s: nav_page(s, "next"),
	inputs=[doc_state],
	outputs=[
	doc_state,
	preview_image,
	page_info,
	md_render,
	md_raw,
	processed_view,
	json_view,
	],
	)
	jump_btn.click(
	jump_to_page,
	inputs=[doc_state, page_jump],
	outputs=[
	doc_state,
	preview_image,
	page_info,
	md_render,
	md_raw,
	processed_view,
	json_view,
	],
	)
	parse_button.click(
	parse_pages,
	inputs=[
	doc_state,
	prompt_text,
	max_new_tokens,
	min_pixels_in,
	max_pixels_in,
	page_range,
	],
	outputs=[
	doc_state,
	preview_image,
	page_info,
	md_render,
	md_raw,
	processed_view,
	json_view,
	],
	)
	clear_button.click(
	clear_all,
	outputs=[
	doc_state,
	preview_image,
	page_info,
	md_render,
	md_raw,
	processed_view,
	json_view,
	],
	)

	download_jsonl.click(
	download_current_jsonl, inputs=[doc_state], outputs=[download_jsonl]
	)
	download_markdown.click(
	download_current_markdown, inputs=[doc_state], outputs=[download_markdown]
	)

	return demo