import torch
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from PIL import Image
import numpy as np

# Lazily-initialized module-level singletons so the (large) pretrained model
# and processor are downloaded/loaded at most once per process.
processor = None
model = None


def get_document_ai_models():
    """Return the ``(processor, model)`` pair, loading each on first use.

    Uses the module-level globals as a cache: the first call triggers the
    (potentially slow) ``from_pretrained`` download/load; later calls are
    cheap lookups.

    Returns:
        Tuple of (LayoutLMv2Processor, LayoutLMv2ForTokenClassification).
    """
    global processor, model
    if processor is None:
        processor = LayoutLMv2Processor.from_pretrained(
            "microsoft/layoutlmv2-base-uncased"
        )
    if model is None:
        model = LayoutLMv2ForTokenClassification.from_pretrained(
            "microsoft/layoutlmv2-base-uncased"
        )
    return processor, model


def extract_text_and_layout(image):
    """
    Extract text and layout information using LayoutLMv2.

    Args:
        image: PIL Image object, or a numpy array convertible via
            ``Image.fromarray`` (e.g. HxWx3 uint8).

    Returns:
        Dictionary with:
            'words': list of words recovered from the processor's OCR output
                (special tokens such as [CLS]/[SEP] are excluded),
            'boxes': per-token bounding boxes as nested lists
                (LayoutLMv2 uses a 0-1000 normalized coordinate space),
            'encoding': the raw processor encoding, kept for future
                processing (e.g. running the classification model).
    """
    # Lazy-load the cached models; renamed locals so they do not shadow
    # the module-level globals of the same name.
    doc_processor, _doc_model = get_document_ai_models()

    # Normalize the input: accept numpy arrays, and unconditionally ensure
    # a 3-channel RGB image (the original only converted numpy inputs,
    # letting e.g. grayscale or RGBA PIL images through unchanged).
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    image = image.convert("RGB")

    # Run the processor (built-in OCR + tokenization + bbox extraction).
    encoding = doc_processor(image, return_tensors="pt")
    input_ids = encoding.input_ids

    # BUG FIX: decode with skip_special_tokens=True so [CLS]/[SEP]/[PAD]
    # markers do not leak into the returned word list, as they did with
    # convert_ids_to_tokens + convert_tokens_to_string on the full sequence.
    words = doc_processor.tokenizer.decode(
        input_ids[0], skip_special_tokens=True
    ).split()

    # Token-level bounding boxes for the first (only) item in the batch.
    bbox = encoding.bbox[0]

    return {
        'words': words,
        'boxes': bbox.tolist(),
        'encoding': encoding,  # Keep for future processing
    }