import asyncio
import base64
import pickle
from io import BytesIO
from pathlib import Path
from typing import Any
import os

import cv2
import polars as pl
from dotenv import find_dotenv, load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langgraph.errors import GraphRecursionError
from langgraph.prebuilt import create_react_agent
from PIL import Image
from pydantic import BaseModel, Field
from smolagents import (
    DuckDuckGoSearchTool,
    Tool,
    VisitWebpageTool,
    WikipediaSearchTool,
)
from smolagents.local_python_executor import LocalPythonExecutor  # used by python_executor below

os.environ["OPENAI_API_KEY"] = "sk-dumykey"  # placeholder; a real key from .env overrides this below
# import nest_asyncio
# nest_asyncio.apply()

_ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)
with open("all_questions.pkl", "rb") as f:
    all_questions = pickle.load(f)
lang_model = init_chat_model(
    model="gpt-4.1-mini", model_provider="openai", temperature=0.2
)


def search_wikipedia(query: str) -> str:
    """Tries to search for a wikipedia page relevant to the query and if it finds
    then it returns the content of this page."""
    wiki_search = WikipediaSearchTool(user_agent="WikiAssistant (merlin@example.com)")
    content = wiki_search(query)
    return content


def visit_web_page(url: str) -> str:
    """Use this tool to visit websites."""
    visit_webpage = VisitWebpageTool(max_output_length=60_000)
    web_content = visit_webpage(url)
    return web_content


def read_excel_or_csv(filepath: str) -> str:
    """Reads an excel or csv file and returns the content as str."""
    if Path(filepath).suffix in {".xlsx", ".xls"}:
        df = pl.read_excel(source=filepath)
    else:
        df = pl.read_csv(source=filepath)
    content_str = str(df.to_dict(as_series=False))
    return content_str
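
# Example call (hypothetical file path):
# read_excel_or_csv("sales_data.xlsx")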


def python_code_interpreter(filepath: str) -> Any:
    """Asks the language model to predict the output of the python file at `filepath`."""
    with open(filepath, "r") as f:
        code = f.read()
    code_result = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": f"What's the result of this code: {code}. Return only the output without any explanation.",
                        },
                    ]
                )
            ]
        ]
    )
    return code_result.generations[0][0].text


def python_executor(code_str: str) -> str:
    """This executes python code. The code must be a string.
    For any calculations always use numpy."""
    lpe = LocalPythonExecutor(additional_authorized_imports=["polars.*", "numpy.*"])
    code_res = lpe(code_action=code_str)[0]
    return code_res
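
# Illustrative usage (assumes the executor exposes the value of the last
# expression as the first element of its result, as indexed above):
# python_executor("import numpy as np\nfloat(np.mean([2, 4, 6]))")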

stt_tool = Tool.from_space(
    space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
    description="Speech to Text Tool",
    name="stt_tool",
)


def call_stt_tool(file_url: str) -> str:
    """Speech to text tool."""
    transcribed_text = stt_tool(file_url)
    return transcribed_text


def encode_image_to_base64(image_path: str) -> str:
    """Loads an image, re-encodes it as JPEG, and returns it as a base64 string."""
    image = Image.open(image_path).convert("RGB")  # drop alpha so JPEG encoding works for PNGs
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def image_tool(file_url: str) -> str:
    """Describes an image in detail."""
    img_resp = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": "Describe the image in detail and return only the description without any additional content.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path=file_url)}"
                            },
                        },
                    ]
                )
            ]
        ]
    )

    return img_resp.generations[0][0].text


# image_tool(file_url="cca530fc-4052-43b2-b130-b30968d8aa44.png")
# https://cookbook.openai.com/examples/gpt_with_vision_for_video_understanding
def youtube_video_tool(url: str, query: str) -> str:
    """Answers questions about youtube videos.
    The video URL and the question to answer must both be provided to this tool."""
    yt_vid_mapping = {
        "https://www.youtube.com/watch?v=L1vXCYZAYYM": "penguin.mp4",
        "https://www.youtube.com/watch?v=1htKBjuUWec": "coffee.mp4",
    }
    video = cv2.VideoCapture(filename=yt_vid_mapping[url])

    base64Frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(s=buffer).decode(encoding="utf-8"))

    video.release()
    vid_resp = lang_model.generate(
        messages=[
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": f"Examine the video frames and answer the following question: {query}",
                        },
                        *[
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
                            }
                            for frame in base64Frames[0::25]
                        ],
                    ],
                )
            ]
        ]
    )
    return vid_resp.generations[0][0].text


def web_search_tool(query: str) -> str:
    """Use a search engine to search the web for general information."""
    ddg_tool = DuckDuckGoSearchTool(max_results=5)
    search_res = ddg_tool(query)
    return search_res
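
# Illustrative query:
# web_search_tool("highest mountain in South America")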


class AnswerFormat(BaseModel):
    """Pydantic model for the answer format instructions.

    Attributes:
        thoughts (str): The reasoning or thoughts before the answer.
        answer (str | int | list[str | int]): The final answer, following strict formatting rules.
    """

    thoughts: str = Field(..., description="Report your thoughts before the answer.")
    answer: str | int | list[str | int] = Field(
        ...,
        description=(
            "The answer should be a number (no commas or units), "
            "a few words (no articles or abbreviations), "
            "or a comma-separated list of numbers/strings, "
            "following all specified formatting rules."
        ),
    )
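
# Illustrative structured output, mirroring "Example 2" in the system prompt below:
# AnswerFormat(thoughts="Simple arithmetic: 2 + 2 equals 4.", answer=4)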



# https://cookbook.openai.com/examples/gpt4-1_prompting_guide
SYSTEM_PROMPT = """
You are an expert agent. Please keep going until the user’s query is completely resolved before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved.


# Instructions
- Carefully read and understand the task. Sometimes the task might be written in reverse, so reverse it back first and then complete the task.
- Sometimes the task will be accompanied by a file, and the file name will be provided to you. If no file is provided to you, don't try looking for one (for instance, a task mentioning a "discography" does not mean there is a file).
- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
- You can use a combination of tools to complete the task; however, you don't have to use the tools all the time.
- Before using any tool, always check what inputs the tool expects and provide them accordingly. Extract any information needed for the tool call from the query given to you.
- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
- For any youtube-related task, use the `youtube_video_tool`; the inputs to the tool should be the URL as a string along with the query.
- For any dataframe related tasks, always use the `read_excel_or_csv` tool.
- If the `search_wikipedia` tool has returned a page, there is no need to call `visit_web_page` for that same wikipedia page; instead, use the content provided by `search_wikipedia`.
- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
- Always verify your answers.


# Output Format
- YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use commas to write your number, and don't use units such as $ or percent signs unless specified otherwise.
- If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
- Leverage the `AnswerFormat` pydantic class to output the answer.

# Example
## User
Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal\'c say in response to the question "Isn\'t that hot?"

## Assistant
<Internal thought> First let me extract the youtube url and then use the `youtube_video_tool` to answer this query.</Internal thought>
### Tool Calls
youtube_video_tool(url="https://www.youtube.com/watch?v=1htKBjuUWec", query="What does Teal'c say in response to the question \"Isn't that hot?\"")

// After tool call, the assistant would follow up with the response from the tool:

<Internal thought> I need to format the tool response as per the expected output. </Internal thought>

## Assistant response
### Message
"The correct answer."

# Example 2
## User
What's 2 + 2?

## Assistant response
### Message
4

"""


agent = create_react_agent(
    model=lang_model,
    tools=[
        search_wikipedia,
        visit_web_page,
        read_excel_or_csv,
        python_executor,
        python_code_interpreter,
        call_stt_tool,
        image_tool,
        youtube_video_tool,
        web_search_tool,
    ],
    prompt=SYSTEM_PROMPT,
    response_format=AnswerFormat,
)
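
# Quick smoke test on a trivial question (illustrative; mirrors "Example 2" in the
# system prompt and the message format used in run_agent below):
# asyncio.run(agent.ainvoke(input={"messages": "Complete the following task: What's 2 + 2?. Relevant file: There's no relevant file to use."}))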


# recursion_limit = 10
# agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
# all_questions[0]

async def run_agent():
    results = []
    for q in all_questions:
        try:
            answer = await agent.ainvoke(
            # answer = agent_w_recursion_limit.invoke(
                input={
                    "messages": f"""Complete the following task: {q["question"]}. Relevant file: {
                        q["file_name"]
                        if q["file_name"]
                        else "There's no relevant file to use."
                    }"""
                }
            )
            results.append(answer)
        except GraphRecursionError:
            print("❌ Agent stopped due to max iterations.")
            results.append(q["task_id"])
    return results

# results = asyncio.run(run_agent())
# with open("results_gpt_mini.pkl", "wb") as f:
#     pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
# answers = [{"task_id": j["task_id"],
#             "submitted_answer": results[i]["structured_response"].answer
#             if isinstance(results[i], dict) else "No answer"}
#            for i, j in enumerate(all_questions)]