import base64
import os
import subprocess
import sys
from io import BytesIO

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from tokenizers import Tokenizer, pre_tokenizers


def install(package):
    """Install a package into the running interpreter's environment."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-warn-script-location", package])


class EndpointHandler:
    def __init__(self, path=""):
        # Install required packages. Note: installs done here only affect
        # modules not yet imported; upgrading an already-imported package
        # (e.g. transformers) takes effect only after a process restart.
        required_packages = ['timm', 'einops', 'flash-attn', 'Pillow', 'transformers==4.41.0.dev0']
        for package in required_packages:
            try:
                install(package)
                print(f"Successfully installed {package}")
            except Exception as e:
                print(f"Failed to install {package}: {e}")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load the model
        self.model_name = "arjunanand13/LADP_Florence-40e"
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name, trust_remote_code=True
        ).to(self.device)

        # Manually load the tokenizer with a whitespace pre-tokenizer
        # (kept alongside the processor, which handles tokenization at inference time)
        self.tokenizer = self.load_tokenizer()

        # Initialize the processor
        self.processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def load_tokenizer(self):
        """Manually load the tokenizer and add a whitespace pre-tokenizer."""
        try:
            tokenizer = Tokenizer.from_pretrained(self.model_name)
            tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
            print("[INFO] Whitespace pre-tokenizer added.")
            return tokenizer
        except Exception as e:
            print(f"[ERROR] Failed to load tokenizer: {e}")
            return None

    def process_image(self, image_data):
        """Load a PIL image from a file path or a base64-encoded string."""
        print("[DEBUG] Attempting to process image")
        try:
            # Short strings that point at an existing file are treated as paths;
            # anything else is assumed to be base64-encoded image bytes.
            if isinstance(image_data, str) and len(image_data) < 256 and os.path.exists(image_data):
                print("[DEBUG] Loading image from file path")
                image = Image.open(image_data)
                image.load()  # force a full read so the file handle can be released
            else:
                print("[DEBUG] Decoding base64 image data")
                image_bytes = base64.b64decode(image_data)
                image = Image.open(BytesIO(image_bytes))
            print("[DEBUG] Image opened:", image.format, image.size, image.mode)
            return image.convert("RGB")  # normalize mode so the processor gets consistent input
        except Exception as e:
            print(f"[ERROR] Error processing image: {e}")
            return None

    def __call__(self, data):
        """Process an inference request and generate model output."""
        try:
            inputs = data.pop("inputs", data)
            if isinstance(inputs, dict):
                image_path = inputs.get("image", None)
                text_input = inputs.get("text", "")
            else:
                # A bare string is treated as the image (path or base64),
                # paired with a default prompt.
                image_path = inputs
                text_input = "What is in this image?"

            print("[INFO] Image path:", image_path, "| Text input:", text_input)
            image = self.process_image(image_path) if image_path else None

            model_inputs = self.processor(
                images=image,
                text=text_input,
                return_tensors="pt"
            )
            # Move tensors to the model's device; leave any non-tensor entries as-is.
            model_inputs = {
                k: v.to(self.device) if isinstance(v, torch.Tensor) else v
                for k, v in model_inputs.items()
            }

            with torch.no_grad():
                outputs = self.model.generate(**model_inputs)

            decoded_outputs = self.processor.batch_decode(outputs, skip_special_tokens=True)
            print(f"[INFO] Generated text: {decoded_outputs[0]}")
            return {"generated_text": decoded_outputs[0]}
        except Exception as e:
            print(f"[ERROR] {e}")
            return {"error": str(e)}
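
# A minimal local smoke test, assuming this file serves as a Hugging Face
# Inference Endpoints handler.py. "sample.jpg" is a hypothetical path;
# substitute any image on disk, or pass a base64-encoded string instead.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {"inputs": {"image": "sample.jpg", "text": "What is in this image?"}}
    print(handler(payload))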