# hermes/tests/test_agent_loop_tool_calling.py
"""Integration tests for HermesAgentLoop tool calling.
Tests the full agent loop with real LLM calls via OpenRouter.
Uses stepfun/step-3.5-flash:free by default (zero cost), falling back to
google/gemini-2.0-flash-001 and then anthropic/claude-sonnet-4 when earlier
models are unavailable or rate limited.
These tests verify:
1. Single tool call: model calls a tool, gets result, responds
2. Multi-tool call: model calls multiple tools in one turn
3. Multi-turn: model calls tools across multiple turns
4. Unknown tool rejection: model calling a non-existent tool gets an error
5. Max turns: loop stops when max_turns is reached
6. No tools: model responds without calling any tools
7. Tool error handling: tool execution errors are captured
Run:
pytest tests/test_agent_loop_tool_calling.py -v
pytest tests/test_agent_loop_tool_calling.py -v -k "single" # run one test
"""
import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Set
from unittest.mock import patch
import pytest
# Skip the entire module by default: these are live-API integration tests
# against OpenRouter and can hang in unattended batch runs. Remove or
# comment out this mark to run them interactively.
pytestmark = pytest.mark.skip(reason="Live API integration test — hangs in batch runs")
# Ensure repo root is importable
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))
# Import the code under test; skip the whole module cleanly when the
# project dependencies are not installed in this environment.
try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer # noqa: F401
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)
# =========================================================================
# Test infrastructure
# =========================================================================
# Models to try, in order of preference (free first). _try_models walks this
# list and falls through to the next entry on rate-limit errors.
_MODELS = [
    "stepfun/step-3.5-flash:free",
    "google/gemini-2.0-flash-001",
    "anthropic/claude-sonnet-4",
]
def _get_api_key():
key = os.getenv("OPENROUTER_API_KEY", "")
if not key:
pytest.skip("OPENROUTER_API_KEY not set")
return key
def _make_server(model: str = None):
    """Build an OpenAIServer pointed at the OpenRouter endpoint.

    Falls back to the first (free) entry of _MODELS when no model name is
    given. Imports are function-local so the module still loads when
    atroposlib internals move.
    """
    from atroposlib.envs.server_handling.openai_server import OpenAIServer
    from atroposlib.envs.server_handling.server_manager import APIServerConfig

    chosen = model or _MODELS[0]
    cfg = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name=chosen,
        server_type="openai",
        api_key=_get_api_key(),
        health_check=False,
    )
    return OpenAIServer(cfg)
async def _try_models(test_fn):
    """Run *test_fn(server, model)* against each model in _MODELS.

    Rate-limit errors fall through to the next model; any other exception
    propagates immediately. Skips the test if every model was rate limited.
    """
    last_error = None
    for candidate in _MODELS:
        try:
            server = _make_server(candidate)
            return await test_fn(server, candidate)
        except Exception as err:
            last_error = err
            message = str(err).lower()
            if "rate" not in message and "limit" not in message:
                raise  # Real error
            # Rate limited — move on to the next candidate model.
    pytest.skip(f"All models failed. Last error: {last_error}")
# =========================================================================
# Fake tools for testing
# =========================================================================
# OpenAI-format tool schemas advertised to the model under test. Actual
# execution is faked by _fake_tool_handler below — no tool does real work.
# Simple calculator tool
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calculate",
        "description": "Calculate a math expression. Returns the numeric result.",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Math expression to evaluate, e.g. '2 + 3'"
                }
            },
            "required": ["expression"],
        },
    },
}
# Weather lookup tool (fake handler returns canned Tokyo-style weather)
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city. Returns temperature and conditions.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name, e.g. 'Tokyo'"
                }
            },
            "required": ["city"],
        },
    },
}
# Lookup tool (always succeeds)
LOOKUP_TOOL = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a fact. Returns a short answer string.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "What to look up"
                }
            },
            "required": ["query"],
        },
    },
}
# Error tool (always fails — used to exercise tool-error capture)
ERROR_TOOL = {
    "type": "function",
    "function": {
        "name": "failing_tool",
        "description": "A tool that always fails with an error.",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {"type": "string"}
            },
            "required": ["input"],
        },
    },
}
def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
"""Handle fake tool calls for testing."""
if tool_name == "calculate":
expr = args.get("expression", "0")
try:
# Safe eval for simple math
result = eval(expr, {"__builtins__": {}}, {})
return json.dumps({"result": result})
except Exception as e:
return json.dumps({"error": str(e)})
elif tool_name == "get_weather":
city = args.get("city", "Unknown")
# Return canned weather
return json.dumps({
"city": city,
"temperature": 22,
"conditions": "sunny",
"humidity": 45,
})
elif tool_name == "lookup":
query = args.get("query", "")
return json.dumps({"answer": f"The answer to '{query}' is 42."})
elif tool_name == "failing_tool":
raise RuntimeError("This tool always fails!")
return json.dumps({"error": f"Unknown tool: {tool_name}"})
# =========================================================================
# Tests
# =========================================================================
@pytest.mark.asyncio
async def test_single_tool_call():
    """Model should call a single tool, get the result, and respond."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )
        prompt = "What's the weather in Tokyo? Use the get_weather tool."
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([{"role": "user", "content": prompt}])

        assert isinstance(result, AgentResult)
        assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}"

        # Collect every get_weather call the model made; each must carry a city.
        weather_calls = [
            tc
            for msg in result.messages
            if msg.get("role") == "assistant"
            for tc in (msg.get("tool_calls") or [])
            if tc["function"]["name"] == "get_weather"
        ]
        assert weather_calls, "Model should have called get_weather"
        for tc in weather_calls:
            assert "city" in json.loads(tc["function"]["arguments"])

        # The tool result must appear in the conversation transcript.
        tool_results = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_results) >= 1, "Should have at least one tool result"

        # And the model must close with a plain assistant message.
        final_msg = result.messages[-1]
        assert final_msg["role"] == "assistant"
        assert final_msg["content"], "Final response should have content"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_multi_tool_single_turn():
    """Model should call multiple tools in a single turn."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL, CALC_TOOL],
            valid_tool_names={"get_weather", "calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )
        prompt = (
            "I need two things at once: "
            "1) What's the weather in Paris? Use get_weather. "
            "2) What is 15 * 7? Use calculate. "
            "Call BOTH tools in a single response."
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([{"role": "user", "content": prompt}])

        # Distinct tool names the model invoked across the whole run.
        tools_called = {
            tc["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant"
            for tc in (msg.get("tool_calls") or [])
        }
        # At minimum, both tools should have been called (maybe in different turns)
        assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_multi_turn_conversation():
    """Agent should handle multiple turns of tool calls."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
            valid_tool_names={"lookup", "calculate"},
            max_turns=10,
            temperature=0.0,
            max_tokens=500,
        )
        prompt = (
            "First, use the lookup tool to look up 'meaning of life'. "
            "Then use calculate to compute 6 * 7. "
            "Do these in separate tool calls, one at a time."
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([{"role": "user", "content": prompt}])

        # Both tools should have been used at some point in the run.
        tools_called = {
            tc["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant"
            for tc in (msg.get("tool_calls") or [])
        }
        assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"
        # Should finish naturally
        assert result.finished_naturally, "Should finish naturally after answering"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_unknown_tool_rejected():
    """If the model calls a tool not in valid_tool_names, it gets an error."""
    async def _scenario(server, model):
        # Advertise both schemas but whitelist only "calculate".
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL, WEATHER_TOOL],
            valid_tool_names={"calculate"},  # weather NOT allowed
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([
                {"role": "user", "content": "What's the weather in London? Use get_weather."},
            ])

        # If the model took the bait, the call must have been rejected.
        if result.tool_errors:
            rejected = [e for e in result.tool_errors if e.tool_name == "get_weather"]
            assert len(rejected) > 0, "get_weather should have been rejected"
            assert "Unknown tool" in rejected[0].error
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_max_turns_limit():
    """Agent should stop after max_turns even if model keeps calling tools."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL],
            valid_tool_names={"lookup"},
            max_turns=2,  # Very low limit
            temperature=0.0,
            max_tokens=500,
        )
        prompt = (
            "Keep looking up facts. Look up 'fact 1', then 'fact 2', "
            "then 'fact 3', then 'fact 4'. Do them one at a time."
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([{"role": "user", "content": prompt}])

        assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
        assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_no_tools_direct_response():
    """When no tools are useful, model should respond directly."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=200,
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([
                {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
            ])

        assert result.finished_naturally, "Should finish naturally with a direct response"
        assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"
        final = result.messages[-1]
        assert final["role"] == "assistant"
        assert final["content"], "Should have text content"
        assert "4" in final["content"], "Should contain the answer '4'"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_tool_error_handling():
    """Tool execution errors should be captured and reported to the model."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[ERROR_TOOL],
            valid_tool_names={"failing_tool"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([
                {"role": "user", "content": "Please call the failing_tool with input 'test'."},
            ])

        # The tool error should be recorded
        assert len(result.tool_errors) >= 1, "Should have at least one tool error"
        first_error = result.tool_errors[0].error
        assert "RuntimeError" in first_error or "always fails" in first_error
        # The error should be in the conversation as a tool result
        tool_msgs = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_msgs) >= 1
        assert "error" in json.loads(tool_msgs[0]["content"])
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_agent_result_structure():
    """Verify the AgentResult has all expected fields populated."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=300,
        )
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run([
                {"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
            ])

        # Structural checks
        assert isinstance(result, AgentResult)
        assert isinstance(result.messages, list)
        assert len(result.messages) >= 3, "Should have user + assistant(tool) + tool_result + assistant(final)"
        assert isinstance(result.turns_used, int)
        assert result.turns_used > 0
        assert isinstance(result.finished_naturally, bool)
        assert isinstance(result.tool_errors, list)
        assert isinstance(result.reasoning_per_turn, list)
        # Messages should follow OpenAI format
        valid_roles = ("system", "user", "assistant", "tool")
        for msg in result.messages:
            assert "role" in msg, f"Message missing 'role': {msg}"
            assert msg["role"] in valid_roles, f"Invalid role: {msg['role']}"
        return result

    await _try_models(_scenario)
@pytest.mark.asyncio
async def test_conversation_history_preserved():
    """The full conversation history should be in result.messages."""
    async def _scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )
        history = [
            {"role": "system", "content": "You are a helpful weather assistant."},
            {"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
        ]
        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(history)

        first, second = result.messages[0], result.messages[1]
        # System message should be preserved
        assert first["role"] == "system"
        assert "weather assistant" in first["content"]
        # User message should be preserved
        assert second["role"] == "user"
        assert "Berlin" in second["content"]
        # Should have assistant + tool + assistant sequence
        assert "tool" in [m["role"] for m in result.messages], "Should have tool results in conversation"
        return result

    await _try_models(_scenario)