wealthcoders
/

qwen3-vl-2B

Image-Text-to-Text

Model card Files Files and versions

qwen3-vl-2B / handler.py

wealthcoders's picture

Update handler.py

f2d4b69 verified 3 months ago

history blame contribute delete

3.19 kB

	from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
	from typing import Dict, List, Any
	import torch
	import io
	from PIL import Image
	import base64
	import time
	import uuid

	# Fixed instruction text that EndpointHandler.__call__ sends to the model
	# together with every image: it asks for an exhaustive, verbatim Markdown
	# transcription of the document with no added commentary or translation.
	prompt = """Task:
	Analyze this document image exhaustively and output in Markdown format.
	Rules:
	- Do not add any comments, provide content only;
	- Extract ALL visible text exactly as written;
	- Preserve possible additional languages;
	- Maintain line breaks, indentation, and spacing;
	- Never translate non-English text.
	- Do not add unnecessary or additional information. Do not add any links or images. Do not add Chinese symbols.
	Important: the output format must be Markdown (use bold text, headlines, so on)."""

	class EndpointHandler:
	    """Endpoint handler that transcribes one document image to Markdown.

	    The processor and model are loaded once in ``__init__``. Each call decodes
	    a base64 image from the request, sends it with the module-level ``prompt``
	    to the model, and returns an OpenAI-style ``chat.completion`` dict.
	    """

	    # Cap on generated tokens per request; also used to detect truncation.
	    MAX_NEW_TOKENS = 2048

	    def __init__(self, path: str = "unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit"):
	        """Load the processor and model from ``path``.

	        Args:
	            path: Identifier or directory handed to ``from_pretrained``.
	        """
	        # Remember what was actually loaded so responses report the real
	        # checkpoint instead of a hard-coded name.
	        self.model_id = path
	        self.processor = AutoProcessor.from_pretrained(path)
	        self.model = Qwen3VLForConditionalGeneration.from_pretrained(path, device_map="auto")
	        self.model.eval()

	    @staticmethod
	    def _load_image(data: Dict[str, Any]) -> "Image.Image":
	        """Validate the request payload and return its image as RGB."""
	        inputs = data.get("inputs") if isinstance(data, dict) else None
	        encoded = inputs.get("base64") if isinstance(inputs, dict) else None
	        if not encoded:
	            raise ValueError(
	                'Request must look like {"inputs": {"base64": "<base64-encoded image>"}}'
	            )

	        # Accept data URLs ("data:image/png;base64,AAAA...") by dropping the
	        # header. b64decode does not reject the header by default; it would
	        # silently decode it into garbage bytes ahead of the real image.
	        if isinstance(encoded, str) and encoded.startswith("data:"):
	            encoded = encoded.partition(",")[2]

	        img_bytes = base64.b64decode(encoded)
	        return Image.open(io.BytesIO(img_bytes)).convert("RGB")

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	        """Run the model on one image and wrap the text in a chat-completion dict.

	        Args:
	            data: Request payload; the image is read from
	                ``data["inputs"]["base64"]`` (plain base64 or a data URL).

	        Returns:
	            A dict with ``id``, ``object``, ``created``, ``model``, ``usage``
	            and a single-element ``choices`` list holding the generated text.

	        Raises:
	            ValueError: If the payload has no non-empty ``inputs.base64``
	                (invalid base64 also surfaces as a ``ValueError`` subclass).
	        """
	        pil_img = self._load_image(data)

	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": pil_img},  # pass PIL image directly
	                    {"type": "text", "text": prompt},
	                ],
	            }
	        ]

	        # Process the input and generate a response
	        inputs = self.processor.apply_chat_template(
	            messages,
	            tokenize=True,
	            add_generation_prompt=True,
	            return_dict=True,
	            return_tensors="pt",
	        )
	        inputs = inputs.to(self.model.device)

	        generated_ids = self.model.generate(**inputs, max_new_tokens=self.MAX_NEW_TOKENS)
	        # Drop the first len(in_ids) positions of every output row so only
	        # the newly generated tokens are decoded.
	        generated_ids_trimmed = [
	            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	        ]
	        output_text = self.processor.batch_decode(
	            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	        )

	        # Exactly one conversation is submitted, so row 0 is the whole batch.
	        prompt_tokens = len(inputs.input_ids[0])
	        completion_tokens = len(generated_ids_trimmed[0])
	        # Reaching the cap means the output was cut off rather than finished.
	        # NOTE(review): an end-of-sequence token landing exactly on the cap
	        # is also reported as "length".
	        finish_reason = "length" if completion_tokens >= self.MAX_NEW_TOKENS else "stop"

	        return {
	            "id": f"chatcmpl-{uuid.uuid4().hex}",
	            "object": "chat.completion",
	            "created": int(time.time()),
	            "model": self.model_id,
	            "usage": {
	                "prompt_tokens": prompt_tokens,
	                "completion_tokens": completion_tokens,
	                "total_tokens": prompt_tokens + completion_tokens,
	            },
	            "choices": [
	                {
	                    "message": {
	                        "role": "assistant",
	                        "content": output_text[0],
	                    },
	                    "finish_reason": finish_reason,
	                    "index": 0,
	                }
	            ],
	        }