"""
Integration tests for ai-me agent.
Tests the complete setup including vectorstore, agent configuration, and agent responses.
"""
import pytest
import pytest_asyncio
import re
import sys
import os
import logging
from datetime import datetime
from unittest.mock import AsyncMock, patch
# Something about these tests makes me feel yucky. Big, brittle, and slow. BBS?
# In the future we should run inference locally with docker-compose models.
# Set temperature and seed for deterministic test results
os.environ["TEMPERATURE"] = "0"
os.environ["SEED"] = "42"
# Point our RAG to the tests/data directory
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
test_data_dir = os.path.join(project_root, "tests", "data")
os.environ["DOC_ROOT"] = test_data_dir
os.environ["LOCAL_DOCS"] = "**/*.md"
from config import setup_logger, Config
from agent import AIMeAgent
from data import DataManager, DataManagerConfig
logger = setup_logger(__name__)
# ============================================================================
# SHARED CACHING - Initialize on first use, then reuse
# ============================================================================
_config = None
_vectorstore = None
_data_manager = None
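# These module-level caches live for the whole pytest session (the module is
# imported once), so expensive setup like vectorstore indexing runs only once.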
def _get_shared_config():
"""Lazy initialization of shared config."""
global _config
if _config is None:
_config = Config() # type: ignore
logger.info(f"Initialized shared config: {_config.bot_full_name}")
return _config
def _get_shared_vectorstore():
"""Lazy initialization of shared vectorstore."""
global _vectorstore, _data_manager
if _vectorstore is None:
logger.info("Initializing shared vectorstore (first test)...")
        # Reuse the module-level tests/data path configured above
        _data_config = DataManagerConfig(
            doc_root=test_data_dir
        )
_data_manager = DataManager(config=_data_config)
_vectorstore = _data_manager.setup_vectorstore()
logger.info(f"Shared vectorstore ready: {_vectorstore._collection.count()} documents")
return _vectorstore
@pytest_asyncio.fixture(scope="function")
async def ai_me_agent():
"""
Setup fixture for ai-me agent with vectorstore and MCP servers.
CRITICAL: Function-scoped fixture prevents hanging/blocking issues.
Each test gets its own agent instance with proper cleanup.
Reuses shared config and vectorstore (lazy-initialized on first use).
This fixture:
- Reuses shared config and vectorstore
- Creates agent WITH real subprocess MCP servers (GitHub, Time, Memory)
- Yields agent for test
- Cleans up MCP servers after test completes
"""
config = _get_shared_config()
vectorstore = _get_shared_vectorstore()
# Initialize agent config with shared vectorstore
aime_agent = AIMeAgent(
bot_full_name=config.bot_full_name,
model=config.model,
vectorstore=vectorstore,
github_token=config.github_token,
session_id="test-session"
)
# Create the agent WITH MCP servers enabled
logger.info("Creating ai-me agent with MCP servers...")
assert aime_agent.session_id is not None, "session_id should be set"
await aime_agent.create_ai_me_agent(
mcp_params=[
aime_agent.mcp_github_params,
aime_agent.mcp_time_params,
aime_agent.get_mcp_memory_params(aime_agent.session_id),
]
)
logger.info("Agent created successfully with MCP servers")
logger.info(f"Temperature set to {config.temperature}")
logger.info(f"Seed set to {config.seed}")
# Yield the agent for the test
yield aime_agent
# CRITICAL: Cleanup after test completes to prevent hanging
logger.info("Cleaning up MCP servers after test...")
await aime_agent.cleanup()
logger.info("Cleanup complete")
@pytest.mark.asyncio
async def test_github_documents_load():
"""Tests FR-002: GitHub document loading with source metadata."""
config = Config() # type: ignore
# Load GitHub documents directly
github_config = DataManagerConfig(
doc_load_local=[]
)
dm = DataManager(config=github_config)
vs = dm.setup_vectorstore(github_repos=["byoung/ai-me"])
agent = AIMeAgent(
bot_full_name=config.bot_full_name,
model=config.model,
vectorstore=vs,
github_token=config.github_token,
session_id="test-session"
)
await agent.create_ai_me_agent()
response = await agent.run("Do you have python experience?")
assert "yes" in response.lower(), (
f"yes' in response but got: {response}"
)
@pytest.mark.asyncio
async def test_rear_knowledge_contains_it245(ai_me_agent):
"""Tests REQ-001: Knowledge base retrieval of personal documentation."""
response = await ai_me_agent.run("What is IT-245?")
assert "IT-245" in response or "It-245" in response or "it-245" in response
logger.info("✓ IT-245 found in response")
@pytest.mark.asyncio
async def test_github_commits_contains_shas(ai_me_agent):
"""Tests REQ-002: MCP GitHub integration - retrieve commit history."""
response = await ai_me_agent.run("What are some recent commits I've made?")
assert response, "Response is empty"
assert len(response) > 10, "Response is too short"
logger.info("✓ Response contains commit information")
@pytest.mark.asyncio
async def test_unknown_person_contains_negative_response(ai_me_agent):
"""Tests REQ-003: Graceful handling of out-of-scope requests."""
response = await ai_me_agent.run(
"Do you know Slartibartfast?" # Presumed unknown person
)
assert response, "Response is empty"
assert (
"don't know" in response.lower()
or "not familiar" in response.lower()
or "no information" in response.lower()
or "don't have any information" in response.lower()
), f"Response doesn't indicate lack of knowledge: {response}"
logger.info(f"✓ Test passed - correctly handled out-of-scope query")
@pytest.mark.asyncio
async def test_carol_knowledge_contains_product(ai_me_agent):
"""Tests FR-002, FR-003: Verify asking about Carol returns 'product'."""
response_raw = await ai_me_agent.run("Do you know Carol?")
response = response_raw.lower() # Convert to lowercase for matching
# Assert that 'product' appears in the response (Carol is Product Owner)
assert "product" in response, (
f"Expected 'product' in response but got: {response}"
)
logger.info("✓ Test passed: Response contains 'product'")
@pytest.mark.asyncio
async def test_mcp_time_server_returns_current_date(ai_me_agent):
"""Tests FR-009, NFR-001: Verify that the MCP time server returns the current date."""
response = await ai_me_agent.run("What is today's date?")
# Check for current date in various formats (ISO or natural language)
now = datetime.now()
    expected_date = now.strftime("%Y-%m-%d")
    current_year = str(now.year)
    current_month = now.strftime("%B")
    current_day = str(now.day)
# Accept either ISO format or natural language date
has_date = (
expected_date in response
or (
current_year in response
and current_month in response
and current_day in response
)
)
assert has_date, (
f"Expected response to contain current date "
f"({expected_date} or {current_month} {current_day}, {current_year}) "
f"but got: {response}"
)
logger.info(f"✓ Test passed: Response contains current date")
@pytest.mark.asyncio
async def test_mcp_memory_server_remembers_favorite_color(ai_me_agent):
"""Tests FR-013, NFR-002:
Verify that the MCP memory server persists information across interactions.
"""
await ai_me_agent.run("My favorite color is chartreuse.")
response2 = await ai_me_agent.run("What's my favorite color?")
# Check that the agent remembers the color
assert "chartreuse" in response2.lower(), (
f"Expected agent to remember favorite color 'chartreuse' "
f"but got: {response2}"
)
msg = (
"✓ Test passed: Agent remembered favorite color 'chartreuse' "
"across interactions"
)
logger.info(msg)
@pytest.mark.asyncio
async def test_github_relative_links_converted_to_absolute_urls():
"""Tests FR-004: Document processing converts relative GitHub links to absolute URLs.
Validates that when documents are loaded from GitHub with relative links
(e.g., /resume.md), they are rewritten to full GitHub URLs
(e.g., https://github.com/owner/repo/blob/main/resume.md).
This is a unit-level test of the DataManager.process_documents() method.
"""
from langchain_core.documents import Document
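    # Minimal Document mimicking one loaded from GitHub; the github_repo metadata
    # key is presumably what process_documents() uses to build absolute URLs.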
sample_doc = Document(
page_content=(
"Check out [my resume](/resume.md) and "
"[projects](/projects.md) for more info."
),
metadata={
"source": "github://byoung/ai-me/docs/about.md",
"github_repo": "byoung/ai-me"
}
)
# Verify metadata is set correctly before processing
assert sample_doc.metadata["github_repo"] == "byoung/ai-me", (
"Sample doc metadata should have github_repo"
)
data_config = DataManagerConfig()
data_manager = DataManager(config=data_config)
processed_docs = data_manager.process_documents([sample_doc])
assert len(processed_docs) == 1, "Expected 1 processed document"
processed_content = processed_docs[0].page_content
# Check that relative links have been converted to absolute GitHub URLs
assert "https://github.com/byoung/ai-me/blob/main/resume.md" in processed_content, (
f"Expected absolute GitHub URL for /resume.md in processed content, "
f"but got: {processed_content}"
)
assert "https://github.com/byoung/ai-me/blob/main/projects.md" in processed_content, (
f"Expected absolute GitHub URL for /projects.md in processed content, "
f"but got: {processed_content}"
)
logger.info("✓ Test passed: Relative GitHub links converted to absolute URLs")
logger.info(f" Original: [my resume](/resume.md)")
logger.info(f" Converted: [my resume](https://github.com/byoung/ai-me/blob/main/resume.md)")
@pytest.mark.asyncio
async def test_agent_responses_cite_sources(ai_me_agent):
"""Tests FR-004, FR-011: Agent responses include source citations.
Validates that agent responses include proper source attribution,
which could be GitHub URLs, local paths, or explicit source references.
"""
questions = [
"What do you know about ReaR?",
"Tell me about your experience in technology",
]
for question in questions:
logger.info(f"\n{'='*60}\nSource citation test: {question}\n{'='*60}")
response = await ai_me_agent.run(question)
# Check that response includes some form of source attribution
# Could be: GitHub URL, local path, "Sources" section, etc.
has_source = (
"https://github.com/" in response or
".md" in response or # Local markdown file reference
"source" in response.lower() or
"documentation" in response.lower()
)
assert has_source, (
f"Expected source attribution in response to '{question}' "
f"but found none. Response: {response}"
)
# Verify response is substantive (not just metadata)
min_length = 50
assert len(response) > min_length, (
f"Response to '{question}' was too short: {response}"
)
logger.info(f"✓ Source citation found for: {question[:40]}...")
logger.info("\n✓ Test passed: Agent responses cite sources (FR-004, FR-011)")
@pytest.mark.asyncio
async def test_user_story_2_multi_topic_consistency(ai_me_agent):
"""
Tests FR-001, FR-003, FR-005, NFR-002: User Story 2 - Multi-Topic Consistency
Verify that the agent maintains consistent first-person perspective
across multiple conversation topics.
This tests that the agent:
- Uses first-person perspective (I, my, me) consistently
- Maintains professional tone across different topic switches
- Shows context awareness of different topics
- Remains in-character as the personified individual
"""
    # Ask questions about two different topics
topics = [
("What is your background in technology?", "background|experience|technology"),
("What programming languages are you skilled in?", "programming|language|skilled"),
]
    first_person_patterns = [
        r"\bi\b", r"\bme\b", r"\bmy\b", r"\bmyself\b",
        # Contractions spelled out: the original ['m]-style character classes
        # matched "I'" or "Im" rather than "I'm".
        r"\bi'm\b", r"\bi've\b", r"\bi'll\b",
    ]
    for question, _keywords in topics:
        logger.info(f"\n{'='*60}\nMulti-topic test question: {question}\n{'='*60}")
        response = await ai_me_agent.run(question)
# Check for first-person usage
first_person_found = any(
re.search(pattern, response, re.IGNORECASE)
for pattern in first_person_patterns
)
assert first_person_found, (
f"Expected first-person perspective in response to '{question}' "
f"but got: {response}"
)
# Verify response is substantive (not just "I don't know")
min_length = 50 # Substantive responses should be > 50 chars
assert len(response) > min_length, (
f"Response to '{question}' was too short (likely not substantive): {response}"
)
logger.info(f"✓ First-person perspective maintained for: {question[:40]}...")
logger.info(f" Response preview: {response[:100]}...")
logger.info("\n✓ Test passed: Consistent first-person perspective across 3+ topics")
@pytest.mark.asyncio
async def test_tool_failure_error_messages_are_friendly(caplog, ai_me_agent):
"""
    Tests FR-012, NFR-003: error message quality.
    Verify that tool failures return user-friendly messages without Python tracebacks.
    This tests that the agent:
    - Returns human-readable error messages
    - Logs an error that can be reviewed in our dashboard/logs
    Uses mocking to simulate tool failures without adding test-specific code to agent.py.
"""
logger.info(f"\n{'='*60}\nError Handling Test\n{'='*60}")
# Mock the Runner.run method to simulate a tool failure
# This tests the catch-all exception handler without adding test code to production
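    # Note: we patch 'agent.Runner.run' (the name as looked up inside agent.py),
    # since unittest.mock patches where an object is used, not where it is defined.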
test_scenarios = [
RuntimeError("Simulated tool timeout"),
ValueError("Invalid tool parameters"),
]
for error in test_scenarios:
logger.info(f"\nTesting error scenario: {error.__class__.__name__}: {error}")
# Clear previous log records for this iteration
caplog.clear()
# Mock Runner.run to raise an exception
with patch('agent.Runner.run', new_callable=AsyncMock) as mock_run:
mock_run.side_effect = error
response = await ai_me_agent.run("Any user question")
logger.info(f"Response: {response[:100]}...")
# PRIMARY CHECK: Verify "I encountered an unexpected error" is in response
assert "I encountered an unexpected error" in response, (
f"Response must contain 'I encountered an unexpected error'. Got: {response}"
)
# SECONDARY CHECK: Verify error was logged by agent.py
error_logs = [record for record in caplog.records if record.levelname == "ERROR"]
assert len(error_logs) > 0, "Expected at least one ERROR log record from agent.py"
# Find the agent.py error log (contains "Unexpected error:")
agent_error_logged = any(
"Unexpected error:" in record.message for record in error_logs
)
assert agent_error_logged, (
f"Expected ERROR log with 'Unexpected error:' from agent.py. "
f"Got: {[r.message for r in error_logs]}"
)
error_messages = [
r.message for r in error_logs
if "Unexpected error:" in r.message
]
logger.info(
f"✓ Error properly logged to logger: {error_messages}"
)
logger.info("\n✓ Test passed: Error messages are friendly (FR-012) + properly logged")
@pytest.mark.asyncio
async def test_logger_setup_format(caplog):
"""Tests NFR-003 (Structured Logging): Verify setup_logger creates structured logging.
Tests that setup_logger() configures syslog-style format with JSON support for
structured logging of user/agent interactions.
This validates the logger configuration that our production app relies on
for analytics and debugging.
"""
# Force logger setup to run by clearing handlers so setup_logger reconfigures
root_logger = logging.getLogger()
original_handlers = root_logger.handlers[:]
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
try:
# Now call setup_logger with no handlers - should trigger full setup
test_logger = setup_logger("test.structured_logging")
# Verify logger was created
assert test_logger.name == "test.structured_logging"
# Verify root logger now has handlers (setup_logger should have added them)
assert len(root_logger.handlers) > 0, (
"Root logger should have handlers after setup_logger"
)
# Verify we have a StreamHandler (console output)
has_stream_handler = any(
isinstance(handler, logging.StreamHandler)
for handler in root_logger.handlers
)
assert has_stream_handler, "Should have StreamHandler for console output"
# Test that logging works with structured JSON format
# The formatters should support JSON logging for analytics
test_logger.info(
'{"session_id": "test-session", "user_input": "test message"}'
)
logger.info(
"✓ Test passed: Logger setup configures structured logging (NFR-003)"
)
finally:
# Restore original handlers
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
for handler in original_handlers:
root_logger.addHandler(handler)
if __name__ == "__main__":
    # Allow running this test file directly: python spec-001.py
pytest.main([__file__, "-v", "-s"])