import asyncio
import base64
import pickle
from io import BytesIO
from pathlib import Path
from typing import Any
import os

import cv2
import polars as pl
from dotenv import find_dotenv, load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langgraph.errors import GraphRecursionError
from langgraph.prebuilt import create_react_agent
from PIL import Image
from pydantic import BaseModel, Field
from smolagents import (
    DuckDuckGoSearchTool,
    Tool,
    VisitWebpageTool,
    WikipediaSearchTool,
)
from smolagents.local_python_executor import LocalPythonExecutor  # used by python_executor below

os.environ["OPENAI_API_KEY"] = "sk-dumykey"  # placeholder; a real key from .env overrides this below
# import nest_asyncio
# nest_asyncio.apply()

_ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)
with open("all_questions.pkl", "rb") as f:
    all_questions = pickle.load(f)
lang_model = init_chat_model(
    model="gpt-4.1-mini", model_provider="openai", temperature=0.2
)


def search_wikipedia(query: str) -> str:
    """Tries to search for a wikipedia page relevant to the query and if it finds
    then it returns the content of this page."""
    wiki_search = WikipediaSearchTool(user_agent="WikiAssistant (merlin@example.com)")
    content = wiki_search(query)
    return content


def visit_web_page(url: str) -> str:
    """Use this tool to visit websites."""
    visit_webpage = VisitWebpageTool(max_output_length=60_000)
    web_content = visit_webpage(url)
    return web_content


def read_excel_or_csv(filepath: str) -> str:
    """Reads an excel or csv file and returns the content as str."""
    if Path(filepath).suffix in {".xlsx", ".xls"}:
        df = pl.read_excel(source=filepath)
    else:
        df = pl.read_csv(source=filepath)
    content_str = str(df.to_dict(as_series=False))
    return content_str
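
# Example call (hypothetical file path):
# read_excel_or_csv("sales_data.xlsx")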


def python_code_interpreter(filepath: str) -> Any:
    """Asks the language model to predict the output of the python file at `filepath`."""
    with open(filepath, "r") as f:
        code = f.read()
    code_result = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": f"What's the result of this code: {code}. Return only the output without any explanation.",
                        },
                    ]
                )
            ]
        ]
    )
    return code_result.generations[0][0].text


def python_executor(code_str: str) -> str:
    """This executes python code. The code must be a string.
    For any calculations always use numpy."""
    lpe = LocalPythonExecutor(additional_authorized_imports=["polars.*", "numpy.*"])
    code_res = lpe(code_action=code_str)[0]
    return code_res
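
# Illustrative usage (assumes the executor exposes the value of the last
# expression as the first element of its result, as indexed above):
# python_executor("import numpy as np\nfloat(np.mean([2, 4, 6]))")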

stt_tool = Tool.from_space(
    space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
    description="Speech to Text Tool",
    name="stt_tool",
)


def call_stt_tool(file_url: str) -> str:
    """Speech to text tool."""
    transcribed_text = stt_tool(file_url)
    return transcribed_text


def encode_image_to_base64(image_path: str) -> str:
    """Loads an image, re-encodes it as JPEG, and returns it as a base64 string."""
    image = Image.open(image_path).convert("RGB")  # drop alpha so JPEG encoding works for PNGs
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def image_tool(file_url: str) -> str:
    """Describes an image in detail."""
    img_resp = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": "Describe the image in detail and return only the description without any additional content.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path=file_url)}"
                            },
                        },
                    ]
                )
            ]
        ]
    )

    return img_resp.generations[0][0].text


# image_tool(file_url="cca530fc-4052-43b2-b130-b30968d8aa44.png")
# https://cookbook.openai.com/examples/gpt_with_vision_for_video_understanding
def youtube_video_tool(url: str, query: str) -> str:
    """Answers questions about youtube videos.
    The video URL and the question to answer must both be provided to this tool."""
    yt_vid_mapping = {
        "https://www.youtube.com/watch?v=L1vXCYZAYYM": "penguin.mp4",
        "https://www.youtube.com/watch?v=1htKBjuUWec": "coffee.mp4",
    }
    video = cv2.VideoCapture(filename=yt_vid_mapping[url])

    base64Frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(s=buffer).decode(encoding="utf-8"))

    video.release()
    vid_resp = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": f"Examine the video frames and answer the following question: {query}",
                        },
                        *[
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
                            }
                            for frame in base64Frames[0::25]
                        ],
                    ],
                )
            ]
        ]
    )
    return vid_resp.generations[0][0].text


def web_search_tool(query: str) -> str:
    """Use a search engine to search the web for general information."""
    ddg_tool = DuckDuckGoSearchTool(max_results=5)
    search_res = ddg_tool(query)
    return search_res
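
# Illustrative query:
# web_search_tool("highest mountain in South America")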


class AnswerFormat(BaseModel):
    """Pydantic model for the answer format instructions.

    Attributes:
        thoughts (str): The reasoning or thoughts before the answer.
        answer (str | int | list[str | int]): The final answer, following strict formatting rules.
    """

    thoughts: str = Field(..., description="Report your thoughts before the answer.")
    answer: str | int | list[str | int] = Field(
        ...,
        description=(
            "The answer should be a number (no commas or units), "
            "a few words (no articles or abbreviations), "
            "or a comma-separated list of numbers/strings, "
            "following all specified formatting rules."
        ),
    )
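
# Illustrative structured output, mirroring "Example 2" in the system prompt below:
# AnswerFormat(thoughts="Simple arithmetic: 2 + 2 equals 4.", answer=4)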



# https://cookbook.openai.com/examples/gpt4-1_prompting_guide
SYSTEM_PROMPT = """
You are an expert agent. Please keep going until the user’s query is completely resolved before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved.


# Instructions
- Carefully read and understand the task. Sometimes the task might be written in reverse, so reverse it back first and then complete the task.
- Sometimes the task will be accompanied by a file, and the file name will be provided to you. If no file is provided to you, don't try looking for one (for instance, a task mentioning a "discography" does not mean there is a file).
- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
- You can use a combination of tools to complete the task; however, you don't have to use the tools all the time.
- Before using any tool, always check what inputs the tool expects and provide them accordingly. Extract any information needed for the tool call from the query given to you.
- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
- For any youtube-related task, use the `youtube_video_tool`; the inputs to the tool should be the URL as a string along with the query.
- For any dataframe related tasks, always use the `read_excel_or_csv` tool.
- If the `search_wikipedia` tool has returned a page, there is no need to call `visit_web_page` for that same wikipedia page; instead, use the content provided by `search_wikipedia`.
- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
- Always verify your answers.


# Output Format
- YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use commas to write your number, and don't use units such as $ or percent signs unless specified otherwise.
- If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
- Leverage the `AnswerFormat` pydantic class to output the answer.

# Example
## User
Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal\'c say in response to the question "Isn\'t that hot?"

## Assistant
<Internal thought> First let me extract the youtube url and then use the `youtube_video_tool` to answer this query.</Internal thought>
### Tool Calls
youtube_video_tool(url="https://www.youtube.com/watch?v=1htKBjuUWec", query="What does Teal'c say in response to the question \"Isn't that hot?\"")

// After tool call, the assistant would follow up with the response from the tool:

<Internal thought> I need to format the tool response as per the expected output. </Internal thought>

## Assistant response
### Message
"The correct answer."

# Example 2
## User
What's 2 + 2?

## Assistant response
### Message
4

"""


agent = create_react_agent(
    model=lang_model,
    tools=[
        search_wikipedia,
        visit_web_page,
        read_excel_or_csv,
        python_executor,
        python_code_interpreter,
        call_stt_tool,
        image_tool,
        youtube_video_tool,
        web_search_tool,
    ],
    prompt=SYSTEM_PROMPT,
    response_format=AnswerFormat,
)
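
# Quick smoke test on a trivial question (illustrative; mirrors "Example 2" in the
# system prompt and the message format used in run_agent below):
# asyncio.run(agent.ainvoke(input={"messages": "Complete the following task: What's 2 + 2?. Relevant file: There's no relevant file to use."}))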


# recursion_limit = 10
# agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
# all_questions[0]

async def run_agent():
    results = []
    for q in all_questions:
        try:
            answer = await agent.ainvoke(
            # answer = agent_w_recursion_limit.invoke(
                input={
                    "messages": f"""Complete the following task: {q["question"]}. Relevant file: {
                        q["file_name"]
                        if q["file_name"]
                        else "There's no relevant file to use."
                    }"""
                }
            )
            results.append(answer)
        except GraphRecursionError:
            print("❌ Agent stopped due to max iterations.")
            results.append(q["task_id"])
    return results

# results = asyncio.run(run_agent())
# with open("results_gpt_mini.pkl", "wb") as f:
#     pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
# answers = [{"task_id": j["task_id"],
#             "submitted_answer": results[i]["structured_response"].answer
#             if isinstance(results[i], dict) else "No answer"}
#            for i, j in enumerate(all_questions)]