| """Integration tests for HermesAgentLoop tool calling. |
| |
| Tests the full agent loop with real LLM calls via OpenRouter. |
Uses stepfun/step-3.5-flash:free by default (zero cost), then falls back
through google/gemini-2.0-flash-001 to anthropic/claude-sonnet-4 if the
cheaper models are unavailable or rate-limited.
| |
These tests verify:
1. Single tool call: model calls a tool, gets result, responds
2. Multi-tool call: model calls multiple tools in one turn
3. Multi-turn: model calls tools across multiple turns
4. Unknown tool rejection: model calling a non-existent tool gets an error
5. Max turns: loop stops when max_turns is reached
6. No tools: model responds without calling any tools
7. Tool error handling: tool execution errors are captured
8. AgentResult structure: all result fields are populated and well-formed
9. Conversation history: the full message history is preserved in the result
| |
| Run: |
| pytest tests/test_agent_loop_tool_calling.py -v |
| pytest tests/test_agent_loop_tool_calling.py -v -k "single" # run one test |
| """ |
|
|
import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
from unittest.mock import patch

import pytest
|
|
| pytestmark = pytest.mark.skip(reason="Live API integration test — hangs in batch runs") |
|
|
| |
# Make the repository root importable so `environments` and `atroposlib`
# resolve regardless of the directory pytest is launched from.
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))
|
|
# Skip the entire module (rather than erroring at collection time) when the
# project dependencies are not installed in the current environment.
try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)
|
|
|
|
| |
| |
| |
|
|
| |
# Candidate models, tried in order by _try_models: free/cheap options first,
# with a paid model as the last-resort fallback.
_MODELS = [
    "stepfun/step-3.5-flash:free",
    "google/gemini-2.0-flash-001",
    "anthropic/claude-sonnet-4",
]
|
|
| def _get_api_key(): |
| key = os.getenv("OPENROUTER_API_KEY", "") |
| if not key: |
| pytest.skip("OPENROUTER_API_KEY not set") |
| return key |
|
|
|
|
def _make_server(model: Optional[str] = None):
    """Create an OpenAIServer configured to talk to OpenRouter.

    Args:
        model: OpenRouter model slug. Defaults to the first (cheapest)
            entry in ``_MODELS`` when not provided.

    Returns:
        An ``OpenAIServer`` with health checks disabled, so construction
        does not hit the network.

    Note:
        The original annotation was ``model: str = None``, which is an
        implicit-Optional violation of PEP 484; the default and behavior
        are unchanged.
    """
    # Imported lazily so module collection succeeds even when atroposlib
    # is only partially installed.
    from atroposlib.envs.server_handling.openai_server import OpenAIServer
    from atroposlib.envs.server_handling.server_manager import APIServerConfig

    config = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name=model or _MODELS[0],
        server_type="openai",
        api_key=_get_api_key(),  # may pytest.skip when the env var is absent
        health_check=False,
    )
    return OpenAIServer(config)
|
|
|
|
async def _try_models(test_fn):
    """Run *test_fn* against each model in ``_MODELS`` until one succeeds.

    Rate-limit-looking failures ("rate"/"limit" in the message) fall through
    to the next model; any other exception propagates immediately. If every
    model fails, the test is skipped with the last error attached.
    """
    last_error = None
    for candidate in _MODELS:
        try:
            return await test_fn(_make_server(candidate), candidate)
        except Exception as exc:
            last_error = exc
            message = str(exc).lower()
            if "rate" not in message and "limit" not in message:
                raise
    pytest.skip(f"All models failed. Last error: {last_error}")
|
|
|
|
| |
| |
| |
|
|
| |
# ---------------------------------------------------------------------------
# OpenAI-style function schemas used as fake tools in these tests. Execution
# is handled entirely by _fake_tool_handler (patched over handle_function_call),
# so no real services are contacted.
# ---------------------------------------------------------------------------

# Math tool: _fake_tool_handler evaluates the expression and returns JSON.
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calculate",
        "description": "Calculate a math expression. Returns the numeric result.",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Math expression to evaluate, e.g. '2 + 3'"
                }
            },
            "required": ["expression"],
        },
    },
}


# Weather tool: the handler returns a fixed canned forecast for any city.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city. Returns temperature and conditions.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name, e.g. 'Tokyo'"
                }
            },
            "required": ["city"],
        },
    },
}


# Lookup tool: the handler answers every query with a fixed "42" response.
LOOKUP_TOOL = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a fact. Returns a short answer string.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "What to look up"
                }
            },
            "required": ["query"],
        },
    },
}


# Failure tool: the handler raises RuntimeError to exercise error capture.
ERROR_TOOL = {
    "type": "function",
    "function": {
        "name": "failing_tool",
        "description": "A tool that always fails with an error.",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {"type": "string"}
            },
            "required": ["input"],
        },
    },
}
|
|
|
|
| def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str: |
| """Handle fake tool calls for testing.""" |
| if tool_name == "calculate": |
| expr = args.get("expression", "0") |
| try: |
| |
| result = eval(expr, {"__builtins__": {}}, {}) |
| return json.dumps({"result": result}) |
| except Exception as e: |
| return json.dumps({"error": str(e)}) |
|
|
| elif tool_name == "get_weather": |
| city = args.get("city", "Unknown") |
| |
| return json.dumps({ |
| "city": city, |
| "temperature": 22, |
| "conditions": "sunny", |
| "humidity": 45, |
| }) |
|
|
| elif tool_name == "lookup": |
| query = args.get("query", "") |
| return json.dumps({"answer": f"The answer to '{query}' is 42."}) |
|
|
| elif tool_name == "failing_tool": |
| raise RuntimeError("This tool always fails!") |
|
|
| return json.dumps({"error": f"Unknown tool: {tool_name}"}) |
|
|
|
|
| |
| |
| |
|
|
@pytest.mark.asyncio
async def test_single_tool_call():
    """Model should call a single tool, get the result, and respond."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert isinstance(result, AgentResult)
        assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}"

        # Every get_weather invocation the model made, across all turns.
        weather_calls = [
            tc
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for tc in msg["tool_calls"]
            if tc["function"]["name"] == "get_weather"
        ]
        for tc in weather_calls:
            assert "city" in json.loads(tc["function"]["arguments"])
        assert weather_calls, "Model should have called get_weather"

        # The tool's output must appear in the history as a tool message.
        assert any(m.get("role") == "tool" for m in result.messages), "Should have at least one tool result"

        # The conversation ends with a plain assistant reply.
        closing = result.messages[-1]
        assert closing["role"] == "assistant"
        assert closing["content"], "Final response should have content"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_multi_tool_single_turn():
    """Model should call multiple tools in a single turn."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL, CALC_TOOL],
            valid_tool_names={"get_weather", "calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "I need two things at once: "
            "1) What's the weather in Paris? Use get_weather. "
            "2) What is 15 * 7? Use calculate. "
            "Call BOTH tools in a single response."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Names of every tool the model invoked, across all turns.
        tools_called = {
            tc["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for tc in msg["tool_calls"]
        }

        assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_multi_turn_conversation():
    """Agent should handle multiple turns of tool calls."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
            valid_tool_names={"lookup", "calculate"},
            max_turns=10,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "First, use the lookup tool to look up 'meaning of life'. "
            "Then use calculate to compute 6 * 7. "
            "Do these in separate tool calls, one at a time."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Gather every tool name the model invoked across the conversation.
        tools_called = {
            tc["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for tc in msg["tool_calls"]
        }

        assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        # After both tool calls, the model should wrap up on its own.
        assert result.finished_naturally, "Should finish naturally after answering"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_unknown_tool_rejected():
    """If the model calls a tool not in valid_tool_names, it gets an error."""

    async def _run(server, model):
        # get_weather is advertised in the schemas but deliberately excluded
        # from valid_tool_names, so any call to it must be rejected.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL, WEATHER_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in London? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # BUG FIX: these assertions were previously wrapped in
        # `if result.tool_errors:`, so the test passed vacuously whenever no
        # error was recorded at all. The prompt explicitly forces a
        # get_weather call, so a rejection must be present.
        assert result.tool_errors, "Expected a tool error for the rejected get_weather call"
        weather_errors = [e for e in result.tool_errors if e.tool_name == "get_weather"]
        assert len(weather_errors) > 0, "get_weather should have been rejected"
        assert "Unknown tool" in weather_errors[0].error

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_max_turns_limit():
    """Agent should stop after max_turns even if model keeps calling tools."""

    async def _run(server, model):
        # Deliberately tiny turn budget; the prompt asks for four lookups.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL],
            valid_tool_names={"lookup"},
            max_turns=2,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "Keep looking up facts. Look up 'fact 1', then 'fact 2', "
            "then 'fact 3', then 'fact 4'. Do them one at a time."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
        assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_no_tools_direct_response():
    """When no tools are useful, model should respond directly."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=200,
        )

        messages = [
            {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.finished_naturally, "Should finish naturally with a direct response"
        assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"

        # The last (and only) assistant message is the direct answer.
        answer = result.messages[-1]
        assert answer["role"] == "assistant"
        assert answer["content"], "Should have text content"
        assert "4" in answer["content"], "Should contain the answer '4'"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_tool_error_handling():
    """Tool execution errors should be captured and reported to the model."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[ERROR_TOOL],
            valid_tool_names={"failing_tool"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "Please call the failing_tool with input 'test'."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # The RuntimeError raised by the tool must be recorded as an error.
        assert len(result.tool_errors) >= 1, "Should have at least one tool error"
        first_error = result.tool_errors[0].error
        assert "RuntimeError" in first_error or "always fails" in first_error

        # The error must also be surfaced to the model as a tool message.
        tool_results = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_results) >= 1
        assert "error" in json.loads(tool_results[0]["content"])

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_agent_result_structure():
    """Verify the AgentResult has all expected fields populated."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=300,
        )

        messages = [
            {"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Field types and basic invariants.
        assert isinstance(result, AgentResult)
        assert isinstance(result.messages, list)
        # FIX: the old assert message described 4 messages while the check
        # requires only 3; the message now matches the enforced minimum
        # (user + assistant tool call + tool result; a final assistant
        # reply would make it 4).
        assert len(result.messages) >= 3, "Should have at least user + assistant(tool call) + tool result"
        assert isinstance(result.turns_used, int)
        assert result.turns_used > 0
        assert isinstance(result.finished_naturally, bool)
        assert isinstance(result.tool_errors, list)
        assert isinstance(result.reasoning_per_turn, list)

        # Every message must be well-formed with a recognized role.
        for msg in result.messages:
            assert "role" in msg, f"Message missing 'role': {msg}"
            assert msg["role"] in ("system", "user", "assistant", "tool"), f"Invalid role: {msg['role']}"

        return result

    await _try_models(_run)
|
|
|
|
@pytest.mark.asyncio
async def test_conversation_history_preserved():
    """The full conversation history should be in result.messages."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "system", "content": "You are a helpful weather assistant."},
            {"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        system_msg, user_msg = result.messages[0], result.messages[1]

        # The original system prompt survives at position 0.
        assert system_msg["role"] == "system"
        assert "weather assistant" in system_msg["content"]

        # The original user prompt survives at position 1.
        assert user_msg["role"] == "user"
        assert "Berlin" in user_msg["content"]

        # Tool results were appended to the same history.
        assert "tool" in [m["role"] for m in result.messages], "Should have tool results in conversation"

        return result

    await _try_models(_run)
|
|