import logging
import os

from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.google_genai import GoogleGenAI

# Module-level logger; handlers/level are configured by the application
# (basicConfig is only called in the __main__ test harness below).
logger = logging.getLogger(__name__)


def load_prompt_from_file(filename: str = "../prompts/image_analyzer_prompt.txt") -> str:
    """Load the system prompt from a text file.

    The path is resolved relative to this script's directory (the default
    points at ../prompts/), so the current working directory does not matter.

    Args:
        filename: Path to the prompt file, relative to this script.

    Returns:
        The file contents, or a minimal fallback prompt if the file cannot
        be read (the error is logged, never raised).
    """
    try:
        script_dir = os.path.dirname(__file__)
        prompt_path = os.path.join(script_dir, filename)
        with open(prompt_path, "r", encoding="utf-8") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded system prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        # Fix: the original message logged the literal "(unknown)" instead of
        # interpolating the filename.
        logger.error(f"Prompt file '{filename}' not found at {prompt_path}. Using fallback prompt.")
        # Fallback minimal prompt
        return "You are an image analyzer. Describe the image factually."
    except Exception as e:
        logger.error(f"Error loading prompt file '{filename}': {e}", exc_info=True)
        return "You are an image analyzer. Describe the image factually."


def initialize_image_analyzer_agent() -> FunctionAgent:
    """
    Create an agent that orchestrates image analysis.

    Uses Gemini multimodal capabilities directly without explicit tools:
    the image is expected to be passed as an ImageBlock inside the
    ChatMessage at chat time. Configuration (model name, API key) is read
    from environment variables; the system prompt is loaded from a file.

    Returns:
        A configured FunctionAgent ready to analyze images.

    Raises:
        ValueError: If GEMINI_API_KEY is not set in the environment.
    """
    logger.info("Initializing ImageAnalyzerAgent...")

    # Configuration from environment variables
    llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
    gemini_api_key = os.getenv("GEMINI_API_KEY")

    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found in environment variables.")
        raise ValueError("GEMINI_API_KEY must be set")

    try:
        llm = GoogleGenAI(
            api_key=gemini_api_key,
            model=llm_model_name,
        )
        logger.info(f"Using LLM model: {llm_model_name}")

        # Load system prompt from file
        system_prompt = load_prompt_from_file()

        # NOTE: This FunctionAgent deliberately defines no tools. It relies
        # on the multimodal LLM honoring the system prompt and performing
        # the analysis when an image is passed in the ChatMessage blocks.
        agent = FunctionAgent(
            name="image_analyzer_agent",
            description=(
                "ImageAnalyzerAgent inspects image files using its multimodal capabilities, "
                "interpreting the visual content according to a detailed factual analysis prompt."
            ),
            llm=llm,
            system_prompt=system_prompt,
            can_handoff_to=[
                "planner_agent",
                "research_agent",
                "reasoning_agent",
                "figure_interpretation_agent",
            ],
        )
        logger.info("ImageAnalyzerAgent initialized successfully.")
        return agent
    except Exception as e:
        logger.error(f"Error during ImageAnalyzerAgent initialization: {e}", exc_info=True)
        raise


# Example usage (for testing if run directly)
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    logger.info("Running image_analyzer_agent.py directly for testing...")

    # Ensure API key is set for testing
    if not os.getenv("GEMINI_API_KEY"):
        print("Error: GEMINI_API_KEY environment variable not set. Cannot run test.")
    else:
        try:
            test_agent = initialize_image_analyzer_agent()
            print("Image Analyzer Agent initialized successfully for testing.")
            # To test further, you would need to construct a ChatMessage with an
            # ImageBlock and run agent.chat(message).
        except Exception as e:
            print(f"Error during testing: {e}")