import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

# Prompt templates for each visual-analysis category.
visual_analysis = {
    "General Visual Analysis": "Create a detailed and cohesive analysis paragraph focusing exclusively on the visual characteristics, ensuring clarity and thorough examination.",
    "Form and Shape": "Provide a focused analysis that critically examines the form and shape of the object, highlighting its visual impact and structural elements.",
    "Symbolism and Iconography": "Explore the symbolism and iconography through an in-depth visual analysis, identifying significant symbols and their interpretative meanings.",
    "Composition": "Conduct a visual analysis that emphasizes the compositional elements, examining the arrangement and structural balance of the artwork.",
    "Light and Shadow": "Evaluate the effects of light and shadow through a detailed analysis, focusing on how these elements enhance the visual dynamics.",
    "Texture": "Conduct a visual analysis of texture, emphasizing the surface qualities and tactile illusions presented in the piece.",
    "Movement and Gesture": "Analyze the movement and gesture within the work, highlighting how these visual cues suggest motion and expression.",
    "Color Palette": "Examine the color palette through an exclusive visual analysis, focusing on color harmony and emotional tone.",
    "Line Quality": "Analyze the line quality, exploring the visual characteristics and expressiveness conveyed through line variation.",
    "Perspective": "Conduct a study of perspective, analyzing how depth and spatial relationships are visually represented.",
    "Scale and Proportion": "Evaluate the scale and proportion within the composition, analyzing how size relationships affect the visual coherence.",
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Local paths: base PaliGemma processor and the fine-tuned GemmArte weights.
model_id = "/paligemma-3b-pt-224"
model_path = "/GemmArte"

print("Loading model...")
processor = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_path)
print("Model loaded.")

print("Moving model to device...")
model.to(device)
model.eval()


def generate(image, category: str, max_new_tokens: int = 512) -> str:
    """Generate a visual analysis of `image` for the given category."""
    prompt = visual_analysis.get(category)
    if not prompt:
        # Default to general visual analysis for unknown categories.
        prompt = visual_analysis["General Visual Analysis"]

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode only the newly generated tokens, dropping the echoed prompt.
    return processor.decode(output[0][input_len:], skip_special_tokens=True).strip()
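

# Usage sketch (not part of the original script): a minimal example of calling
# generate() on a local artwork image. The file path "artwork.jpg" and the
# chosen category are illustrative assumptions.
if __name__ == "__main__":
    from PIL import Image

    image = Image.open("artwork.jpg").convert("RGB")
    analysis = generate(image, "Color Palette")
    print(analysis)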