import gradio as gr
# MllamaForConditionalGeneration is the vision-capable class for Llama 3.2 Vision
# checkpoints; AutoModelForCausalLM would load only the text stack and reject image inputs.
from transformers import MllamaForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch
import gc
import os

# Enable better CPU performance
torch.set_num_threads(4)
device = "cpu"


def load_model():
    model_name = "forestav/unsloth_vision_radiography_finetune"
    base_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct"  # Correct base model

    print("Loading tokenizer and processor...")
    # Load tokenizer from base model
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True
    )

    # Load processor from base model
    processor = AutoProcessor.from_pretrained(
        base_model_name,
        trust_remote_code=True
    )

    print("Loading model...")
    # Load model with CPU optimizations
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        offload_folder="offload",
        offload_state_dict=True,
        trust_remote_code=True
    )

    print("Quantizing model...")
    # Dynamic int8 quantization of the linear layers to reduce memory and speed up CPU inference
    model = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear},
        dtype=torch.qint8
    )

    return model, tokenizer, processor


# Create offload directory if it doesn't exist
os.makedirs("offload", exist_ok=True)

# Initialize model, tokenizer and processor globally
print("Starting model initialization...")
try:
    model, tokenizer, processor = load_model()
    print("Model loaded and quantized successfully!")
except Exception as e:
    print(f"Error loading model: {str(e)}")
    raise


def analyze_image(image, instruction):
    try:
        # Clear memory
        gc.collect()

        if instruction.strip() == "":
            instruction = "You are an expert radiographer. Describe accurately what you see in this image."

        # Prepare the messages
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction}
            ]}
        ]

        # Build the prompt string (tokenize=False keeps it as text for the processor),
        # then process the image and text together
        prompt = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )
        inputs = processor(
            images=image,
            text=prompt,
            return_tensors="pt"
        )

        # Generate with conservative settings for CPU
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=True,  # required for temperature/min_p to take effect
                temperature=1.0,
                min_p=0.1,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                num_beams=1
            )

        # Decode only the newly generated tokens so the prompt is not echoed back
        generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Clean up
        del outputs
        gc.collect()

        return response
    except Exception as e:
        return f"Error processing image: {str(e)}\nPlease try again with a smaller image or different settings."


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # Medical Image Analysis Assistant
    Upload a medical image and receive a professional description from an AI radiographer.
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type="pil",
                label="Upload Medical Image"
            )
            instruction_input = gr.Textbox(
                label="Custom Instruction (optional)",
                placeholder="You are an expert radiographer. Describe accurately what you see in this image.",
                lines=2
            )
            submit_btn = gr.Button("Analyze Image")

        with gr.Column():
            output_text = gr.Textbox(label="Analysis Result", lines=10)

    # Handle the submission
    submit_btn.click(
        fn=analyze_image,
        inputs=[image_input, instruction_input],
        outputs=output_text
    )

    gr.Markdown("""
    ### Notes:
    - The model runs on CPU and may take several minutes to process each image
    - For best results, upload images smaller than about 1.5 megapixels
    - Please be patient during processing
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()
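
# --- Optional local smoke test: a minimal sketch for exercising analyze_image()
# without starting the Gradio server. The file name "sample_xray.png" is a
# placeholder (not part of this repo) -- point it at any radiograph on disk.
# Uncomment and run in place of demo.launch():
#
#     from PIL import Image
#     test_image = Image.open("sample_xray.png").convert("RGB")
#     print(analyze_image(test_image, ""))  # empty instruction -> default radiographer prompt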