File size: 6,758 Bytes
f0eb1da
 
 
 
 
 
 
fb62e9e
 
 
 
 
 
 
 
 
 
 
6138f05
f0eb1da
fb62e9e
6138f05
d10ae74
5359065
f0eb1da
fb62e9e
2ae392c
fb62e9e
 
 
 
 
6138f05
2ae392c
d10ae74
2f68dd7
b17b706
d10ae74
 
 
 
 
 
 
 
b17b706
 
 
 
 
3929291
d10ae74
f0eb1da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d10ae74
 
 
f0eb1da
d10ae74
 
 
f0eb1da
fb62e9e
 
 
 
 
 
 
 
 
d10ae74
 
6138f05
 
f0eb1da
d10ae74
 
6138f05
 
f0eb1da
2f68dd7
d10ae74
 
 
 
b17b706
d10ae74
2f68dd7
d10ae74
 
b17b706
d10ae74
fb62e9e
d10ae74
f0eb1da
 
 
d10ae74
f0eb1da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d10ae74
 
3929291
d10ae74
 
 
 
 
f0eb1da
d10ae74
 
 
 
f0eb1da
d10ae74
 
f0eb1da
d10ae74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0eb1da
d10ae74
 
f0eb1da
d10ae74
 
f0eb1da
d10ae74
 
 
 
 
 
 
 
 
f0eb1da
 
d10ae74
f0eb1da
 
 
d10ae74
f0eb1da
d10ae74
2ae392c
 
d10ae74
2ae392c
d10ae74
2ae392c
d10ae74
 
f0eb1da
d10ae74
fb62e9e
d10ae74
 
f0eb1da
d10ae74
f0eb1da
d10ae74
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import os
from typing import Any, List, Optional

from dotenv import load_dotenv
from PIL import Image
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    LiteLLMModel,
    PythonInterpreterTool,
    VisitWebpageTool,
)

from src.tools import (
    add,
    divide,
    modulus,
    multiply,
    read_excel_file,
    subtract,
    transcribe_audio_file,
    transcribe_from_youtube,
    wiki_search,
)


# Load environment variables (e.g. GEMINI_API_KEY) from a local .env file.
load_dotenv()

# GAIA-style formatting instructions appended to every question so the model
# ends its reply with an exact-match-friendly "FINAL ANSWER: ..." line.
SYSTEM_PROMPT = """
You are a helpful assistant tasked with answering questions using a set of tools. 
Now, I will ask you a question. Report your thoughts, and finish your answer with the following template: 
FINAL ANSWER: [YOUR FINAL ANSWER]. 
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
Your answer should only start with "FINAL ANSWER: ", then follows with the answer. 
"""


class CustomAgent:
    """Wrapper around smolagents' ``CodeAgent`` for GAIA-style Q&A.

    Bundles a fixed toolset (web search, Python interpreter, webpage
    visiting, plus the project's audio/Excel/wiki/math tools) with a
    LiteLLM-backed model, and post-processes answers so they survive
    exact-match grading.
    """

    def __init__(
        self,
        model_id: str = "gemini/gemini-2.0-flash",
        additional_imports: Optional[List[str]] = None,
        logging=False,
        max_steps=10,
        verbose: bool = False,
        executor_type: str = "local",
        timeout: int = 120,
    ):
        """
        Initialize the CustomAgent with a model and tools.
        If no model is provided, a default one is used.

        Args:
            model_id: LiteLLM model identifier.
            additional_imports: Extra module names the code agent may
                import, appended to the built-in allow-list.
            logging: Stored on the instance; not used internally here.
            max_steps: Maximum reasoning/tool-call steps per run.
            verbose: When True, print progress and raise agent verbosity.
            executor_type: smolagents executor backend (e.g. "local").
            timeout: LLM request timeout in seconds.
        """
        self.logging = logging
        self.verbose = verbose
        # Modules the sandboxed code executor is allowed to import.
        self.imports = [
            "pandas",
            "numpy",
            "io",
            "datetime",
            "json",
            "re",
            "math",
            "os",
            "requests",
            "csv",
            "urllib",
            "youtube-transcript-api",
            "SpeechRecognition",
            "pydub",
        ]
        if additional_imports:
            self.imports.extend(additional_imports)

        # Initialize tools
        self.tools = [
            DuckDuckGoSearchTool(),
            PythonInterpreterTool(),
            VisitWebpageTool(),
            wiki_search,
            transcribe_audio_file,
            transcribe_from_youtube,
            read_excel_file,
            multiply,
            add,
            subtract,
            divide,
            modulus,
        ]

        # Initialize the model (API key is read from the environment).
        model = LiteLLMModel(
            model_id=model_id,
            api_key=os.getenv("GEMINI_API_KEY"),
            timeout=timeout,
        )

        # Initialize the CodeAgent
        self.agent = CodeAgent(
            model=model,
            tools=self.tools,
            additional_authorized_imports=self.imports,
            executor_type=executor_type,
            max_steps=max_steps,
            verbosity_level=2 if verbose else 0,
        )
        if self.verbose:
            print("CustomAgent initialized.")

    def forward(self, question: str, file_path: Optional[str]) -> str:
        """Answer *question*, optionally grounded on the file at *file_path*.

        Images are handed to the agent directly; plain-text files are
        inlined into the prompt; any other file type is referenced by path
        so the agent's tools can open it themselves. Failures are returned
        as strings rather than raised so a batch run never aborts.
        """
        print(f"QUESTION: {question[:100]}...")
        try:
            full_prompt = f"""Question: {question}
                
            {SYSTEM_PROMPT}"""
            if file_path:
                file_path_ext = os.path.splitext(file_path)[1]
                if file_path_ext.lower() in [".jpg", ".jpeg", ".png"]:
                    # Vision input: pass the decoded image to the agent.
                    image = Image.open(file_path).convert("RGB")
                    answer = self.agent.run(full_prompt, images=[image])
                elif file_path_ext.lower() in [".txt", ".py"]:
                    # Plain text: inline the whole content into the prompt.
                    with open(file_path, "r") as f:
                        content = f.read()
                    full_prompt = f"""Question: {question}
                    File content: ```{content}```

                    {SYSTEM_PROMPT}"""
                    answer = self.agent.run(full_prompt)
                else:
                    # Other formats (audio, Excel, ...): the agent's tools
                    # will open the file from its path.
                    full_prompt = f"""Question: {question}
                    File path: {file_path}

                    {SYSTEM_PROMPT}"""
                    answer = self.agent.run(full_prompt)
            else:
                answer = self.agent.run(full_prompt)
            answer = self._clean_answer(answer)
            return answer

        except Exception as e:
            # Best-effort: report the failure as the answer string.
            error_msg = f"Error answering question: {e}"
            if self.verbose:
                print(error_msg)
            return error_msg

    def _clean_answer(self, answer: Any) -> str:
        """
        Clean up the answer to remove common prefixes and formatting
        that models often add but that can cause exact match failures.

        Args:
            answer: The raw answer from the model

        Returns:
            The cleaned answer as a string
        """
        # Convert non-string types to strings
        if not isinstance(answer, str):
            if isinstance(answer, float):
                # Integral floats render as plain integers ("5.0" -> "5").
                if answer.is_integer():
                    return str(int(answer))
                # BUG FIX: floats >= 1000 used to be formatted as
                # "$1,234.56", which violates SYSTEM_PROMPT's own rules
                # ("don't use comma ... neither use units such as $") and
                # breaks exact matching. Use the plain representation.
                return str(answer)
            return str(answer)

        # Normalize whitespace
        answer = answer.strip()

        # Remove common prefixes and formatting that models add
        prefixes_to_remove = [
            "The answer is ",
            "Answer: ",
            "Final answer: ",
            "The result is ",
            "To answer this question: ",
            "Based on the information provided, ",
            "According to the information: ",
        ]
        for prefix in prefixes_to_remove:
            if answer.startswith(prefix):
                answer = answer[len(prefix) :].strip()

        # Remove quotes if they wrap the entire answer
        if (answer.startswith('"') and answer.endswith('"')) or (
            answer.startswith("'") and answer.endswith("'")
        ):
            answer = answer[1:-1].strip()

        return answer


def get_config():
    """Return the default agent configuration as a dict.

    Keys mirror ``CustomAgent.__init__`` keyword arguments; callers may
    override individual entries before constructing the agent.
    """
    defaults = dict(
        model_id="gemini/gemini-2.5-flash-preview-04-17",
        logging=False,
        max_steps=10,
        verbose=False,
        executor_type="local",
        timeout=120,
    )
    return defaults