my-smoldocling-demo

Running

App Files Files Community

my-smoldocling-demo / app.py

bharatcoder

Update app.py

fcf0972 verified 10 days ago

raw

history blame contribute delete

3.71 kB

	import gradio as gr
	from transformers import AutoProcessor, AutoModelForImageTextToText
	from PIL import Image

	# The processor and the model are both fetched from the same checkpoint id.
	# They are created once at import time and reused as module-level globals
	# by the inference function below.
	MODEL_ID = "ds4sd/SmolDocling-256M-preview"

	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

	def smoldocling_readimage(image, prompt_text):
	    """
	    Extract text and structured content from a document image using SmolDocling.

	    The image and the prompt are wrapped in a single-turn chat message, passed
	    through the SmolDocling-256M-preview model, and only the newly generated
	    tokens (everything after the prompt) are decoded and returned.

	    Args:
	        image (PIL.Image.Image): The input document image to process, e.g. a
	            scanned page, a screenshot, or a PDF page rendered to an image.
	        prompt_text (str): The instruction that guides the model's output
	            format. A blank or missing prompt falls back to
	            "Convert this page to docling.". Supported prompts include:

	            Content Conversion:
	            - "Convert this page to docling." - Full conversion to DocTags representation
	            - "Convert chart to table." - Convert charts to table format
	            - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
	            - "Convert code to text." - Convert code blocks to readable text
	            - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)

	            OCR and Location-based Actions:
	            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
	            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
	            - "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
	            - "Detect footer elements on the page." - Identify footer content

	    Returns:
	        str: The text generated by the model for the image, with the
	        "<end_of_utterance>" marker removed and leading/trailing whitespace
	        stripped. Other special tokens are kept in the output. The format
	        depends on the prompt_text provided.

	    Raises:
	        ValueError: If image is None (for example, nothing was uploaded).

	    Example:
	        >>> from PIL import Image
	        >>> img = Image.open("document.png")
	        >>> result = smoldocling_readimage(img, "Convert this page to docling.")
	        >>> print(result)  # Returns structured document content

	    Note:
	        - Processing time depends on image size and complexity
	        - Maximum output length is limited to 1024 new tokens
	    """
	    # Fail early with a readable message instead of an opaque processor error
	    # when no image was supplied.
	    if image is None:
	        raise ValueError("No image provided: upload or pass a document image.")

	    # The prompt textbox only carries a placeholder, so an untouched field
	    # arrives empty; use the full-page conversion prompt in that case.
	    default_prompt = "Convert this page to docling."
	    if not prompt_text or not prompt_text.strip():
	        prompt_text = default_prompt

	    messages = [
	        {
	            "role": "user",
	            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
	        }
	    ]
	    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	    inputs = processor(text=prompt, images=[image], return_tensors="pt")
	    outputs = model.generate(**inputs, max_new_tokens=1024)

	    # generate() returns prompt + continuation; slice off the prompt tokens so
	    # only the model's answer is decoded.
	    prompt_length = inputs.input_ids.shape[1]
	    generated = outputs[:, prompt_length:]

	    # Special tokens are kept on purpose (presumably because the structured
	    # markup is emitted as special tokens - confirm against the model card);
	    # only the end-of-utterance marker is removed by hand.
	    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
	    return result.replace("<end_of_utterance>", "").strip()

	# Gradio UI: one image upload and one single-line prompt box feed the
	# inference function; its string result is shown as plain text.
	image_input = gr.Image(type="pil", label="Upload Image")
	prompt_input = gr.Textbox(
	    lines=1,
	    placeholder="Enter prompt (e.g. Convert to docling)",
	    label="Prompt",
	)

	demo = gr.Interface(
	    fn=smoldocling_readimage,
	    inputs=[image_input, prompt_input],
	    outputs="text",
	    title="SmolDocling Web App",
	    description="Upload a document image and convert it to structured docling format.",
	)

	# mcp_server=True is forwarded to Gradio's launch(); per the Gradio docs it
	# additionally serves the app over MCP alongside the web UI.
	demo.launch(mcp_server=True)