import asyncio
import base64
import pickle
from io import BytesIO
from pathlib import Path
from typing import Any
import os
import cv2
import polars as pl
from dotenv import find_dotenv, load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langgraph.errors import GraphRecursionError
from langgraph.prebuilt import create_react_agent
from PIL import Image
from pydantic import BaseModel, Field
from smolagents import (
DuckDuckGoSearchTool,
Tool,
VisitWebpageTool,
WikipediaSearchTool,
)
from smolagents.local_python_executor import LocalPythonExecutor
os.environ["OPENAI_API_KEY"] = "sk-dumykey"
# import nest_asyncio
# nest_asyncio.apply()
_ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)
with open("all_questions.pkl", "rb") as f:
all_questions = pickle.load(f)
lang_model = init_chat_model(
model="gpt-4.1-mini", model_provider="openai", temperature=0.2
)
def search_wikipedia(query: str) -> str:
    """Searches for a wikipedia page relevant to the query and, if one is found,
    returns the content of that page."""
wiki_search = WikipediaSearchTool(user_agent="WikiAssistant (merlin@example.com)")
content = wiki_search(query)
return content
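# Hedged usage sketch (hypothetical query; the output is the raw page text from WikipediaSearchTool):
# print(search_wikipedia("Python (programming language)"))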
def visit_web_page(url: str) -> str:
"""Use this tool to visit websites."""
visit_webpage = VisitWebpageTool(max_output_length=60_000)
web_content = visit_webpage(url)
return web_content
def read_excel_or_csv(filepath: str) -> str:
    """Reads an excel or csv file and returns the content as str."""
    if Path(filepath).suffix in {".xlsx", ".xls"}:
        df = pl.read_excel(source=filepath)
    else:
        df = pl.read_csv(source=filepath)
    content_str = str(df.to_dict(as_series=False))
    return content_str
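# Hedged usage sketch (hypothetical file name): the returned string is polars' dict-of-columns
# representation, e.g. "{'col_a': [1, 2], 'col_b': ['x', 'y']}".
# print(read_excel_or_csv("example.xlsx"))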
def python_code_interpreter(filepath: str) -> Any:
    """Reads a python file and returns the output of the code as predicted by the language model."""
    with open(filepath, "r") as f:
        code = f.read()
code_result = lang_model.generate(
messages=[
[
HumanMessage(
content=[
{
"type": "text",
"text": f"What's the result of this code: {code}. Return only the output without any explanation.",
},
]
)
]
]
)
return code_result.generations[0][0].text
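# Hedged usage sketch (hypothetical file name): the model is asked to predict the script's output.
# print(python_code_interpreter("some_script.py"))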
def python_executor(code_str: str) -> str:
    """Executes python code. The code must be passed in as a string.
    For any calculations always use numpy."""
    lpe = LocalPythonExecutor(additional_authorized_imports=["polars.*", "numpy.*"])
    lpe.send_tools({})  # make the base python tools available inside the sandbox
    code_res = lpe(code_action=code_str)[0]  # first element of the result is the evaluated output
    return code_res
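# Hedged usage sketch (assumes the value of the last expression is what the executor reports back):
# print(python_executor("import numpy as np\nnp.mean([2, 4, 6])"))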
stt_tool = Tool.from_space(
space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
description="Speech to Text Tool",
name="stt_tool",
)
def call_stt_tool(file_url: str) -> str:
"""Speech to text tool."""
transcribed_text = stt_tool(file_url)
return transcribed_text
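# Hedged usage sketch (hypothetical file): the Space transcribes the audio and returns plain text.
# print(call_stt_tool("recording.mp3"))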
def encode_image_to_base64(image_path: str) -> str:
    image = Image.open(image_path).convert("RGB")  # JPEG cannot store an alpha channel
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def image_tool(file_url: str) -> str:
"""Describes an image in detail."""
img_resp = lang_model.generate(
messages=[
[
HumanMessage(
content=[
{
"type": "text",
"text": "Describe the image in detail and return only the description without any additional content.",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path=file_url)}"
},
},
]
)
]
]
)
return img_resp.generations[0][0].text
# image_tool(file_url="cca530fc-4052-43b2-b130-b30968d8aa44.png")
# https://cookbook.openai.com/examples/gpt_with_vision_for_video_understanding
def youtube_video_tool(url: str) -> str:
"""Answers questions about youtube videos.
URLs must be provided to this tool."""
yt_vid_mapping = {
"https://www.youtube.com/watch?v=L1vXCYZAYYM": "penguin.mp4",
"https://www.youtube.com/watch?v=1htKBjuUWec": "coffee.mp4",
}
video = cv2.VideoCapture(filename=yt_vid_mapping[url])
base64Frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64Frames.append(base64.b64encode(s=buffer).decode(encoding="utf-8"))
video.release()
vid_resp = lang_model.generate(
messages=[
[
HumanMessage(
content=[
{
"type": "text",
"text": (
"""Examine the video.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\""""
),
},
*[
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{frame}"},
}
for frame in base64Frames[0::25]
],
],
)
]
]
)
return vid_resp.generations[0][0].text
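# Hedged usage sketch: only the two URLs present in yt_vid_mapping are supported, and every 25th
# frame is sent to the vision model.
# print(youtube_video_tool("https://www.youtube.com/watch?v=1htKBjuUWec"))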
def web_search_tool(query: str) -> str:
"""Use a search engine to search the web for general information."""
ddg_tool = DuckDuckGoSearchTool(max_results=5)
search_res = ddg_tool(query)
return search_res
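# Hedged usage sketch (hypothetical query): returns up to 5 DuckDuckGo results as formatted text.
# print(web_search_tool("current population of Malta"))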
class AnswerFormat(BaseModel):
"""Pydantic model for the answer format instructions.
Attributes:
thoughts (str): The reasoning or thoughts before the answer.
answer (str | int | list[str | int]): The final answer, following strict formatting rules.
"""
thoughts: str = Field(..., description="Report your thoughts before the answer.")
answer: str | int | list[str | int] = Field(
...,
description=(
"The answer should be a number (no commas or units), "
"a few words (no articles or abbreviations), "
"or a comma-separated list of numbers/strings, "
"following all specified formatting rules."
),
)
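# Hedged sketch of the structured output the agent is expected to emit (values are illustrative):
# AnswerFormat(thoughts="2 + 2 evaluates to 4.", answer=4)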
# https://cookbook.openai.com/examples/gpt4-1_prompting_guide
SYSTEM_PROMPT = """
You are an expert agent - please keep going until the user’s query is completely resolved, before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved.
# Instructions
- Carefully read and understand the task. Sometimes the task might be a reversed sentence, so reverse it back first and then complete the task.
- Sometimes the task will be accompanied by a file, and the file name will be provided to you. If no file is provided, don't go looking for one based on words in the task (for instance, "discography").
- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
- You can use a combination of tools to complete the task; however, you don't have to use tools every time.
- Before using any tool, always check what input(s) the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
- For any youtube-related task use the `youtube_video_tool`; the input to the tool should be the URL as a string, along with the query.
- For any dataframe related tasks, always use the `read_excel_or_csv` tool.
- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
- Always verify your answers.
# Output Format
- YOUR ANSWER should be a number OR as few words as possible OR a comma-separated list of numbers and/or strings.
- If you are asked for a number, don't use commas to write your number, nor units such as $ or percent signs, unless specified otherwise.
- If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write digits in plain text unless specified otherwise.
- If you are asked for a comma-separated list, apply the above rules depending on whether each element of the list is a number or a string.
- Leverage the `AnswerFormat` pydantic class to output the answer.
# Example
## User
Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal\'c say in response to the question "Isn\'t that hot?"
## Assistant
<Internal thought> First let me extract the youtube url and then use the `youtube_video_tool` to answer this query.</Internal thought>
### Tool Calls
youtube_video_tool(https://www.youtube.com/watch?v=1htKBjuUWec)
// After tool call, the assistant would follow up with the response from the tool:
<Internal thought> I need to format the tool response as per the expected output. </Internal thought>
## Assistant response
### Message
"The correct answer."
# Example 2
## User
What's 2 + 2?
## Assistant response
### Message
4
"""
agent = create_react_agent(
model=lang_model,
tools=[
search_wikipedia,
visit_web_page,
read_excel_or_csv,
python_executor,
python_code_interpreter,
call_stt_tool,
image_tool,
youtube_video_tool,
web_search_tool,
],
prompt=SYSTEM_PROMPT,
response_format=AnswerFormat,
)
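# Hedged single-question smoke test (assumes the structured answer is returned under the
# "structured_response" key, as used further below):
# resp = await agent.ainvoke({"messages": "Complete the following task: What's 2 + 2?"})
# print(resp["structured_response"].answer)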
# recursion_limit = 10
# agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
# all_questions[0]
async def run_agent():
results = []
for q in all_questions:
try:
answer = await agent.ainvoke(
# answer = agent_w_recursion_limit.invoke(
input={
"messages": f"""Complete the following task: {q["question"]}. Relevant file: {
q["file_name"]
if q["file_name"]
else "There's no relevant file to use."
}"""
}
)
results.append(answer)
except GraphRecursionError:
print("❌ Agent stopped due to max iterations.")
results.append(q["task_id"])
return results
# with open("results_gpt_mini.pkl", "wb") as f:
# pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
# results = asyncio.run(run_agent())
# answers = [{"task_id":j['task_id'],
# "submitted_answer": results[i]["structured_response"].answer
# if isinstance(results[i], dict) else "No answer"}
# for i,j in enumerate(all_questions)]