hermes / rl_cli.py
lenson78's picture
initial upload: v2026.3.23 with HF Spaces deployment
9aa5185 verified
#!/usr/bin/env python3
"""
RL Training CLI Runner
Dedicated CLI runner for RL training workflows with:
- Extended timeouts for long-running training
- RL-focused system prompts
- Full toolset including RL training tools
- Special handling for 30-minute check intervals
Usage:
python rl_cli.py "Train a model on GSM8k for math reasoning"
python rl_cli.py --interactive
python rl_cli.py --list-environments
Environment Variables:
TINKER_API_KEY: API key for Tinker service (required)
WANDB_API_KEY: API key for WandB metrics (required)
OPENROUTER_API_KEY: API key for OpenRouter (required for agent)
"""
import asyncio
import os
import sys
from pathlib import Path
import fire
import yaml
# Load .env from ~/.hermes/.env first, then project root as dev fallback.
# User-managed env files should override stale shell exports on restart.
_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
_project_env = Path(__file__).parent / '.env'
from hermes_cli.env_loader import load_hermes_dotenv
_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
for _env_path in _loaded_env_paths:
print(f"βœ… Loaded environment variables from {_env_path}")
# Set terminal working directory to tinker-atropos submodule
# This ensures terminal commands run in the right context for RL work
tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
if tinker_atropos_dir.exists():
os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
os.environ['HERMES_QUIET'] = '1' # Disable temp subdirectory creation
print(f"πŸ“‚ Terminal working directory: {tinker_atropos_dir}")
else:
# Fall back to hermes-agent directory if submodule not found
os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
os.environ['HERMES_QUIET'] = '1'
print(f"⚠️ tinker-atropos submodule not found, using: {Path(__file__).parent}")
# Import agent and tools
from run_agent import AIAgent
from model_tools import get_tool_definitions, check_toolset_requirements
from tools.rl_training_tool import check_rl_api_keys, get_missing_keys
# ============================================================================
# Config Loading
# ============================================================================
from hermes_constants import OPENROUTER_BASE_URL
DEFAULT_MODEL = "anthropic/claude-opus-4.5"
DEFAULT_BASE_URL = OPENROUTER_BASE_URL
def load_hermes_config() -> dict:
"""
Load configuration from ~/.hermes/config.yaml.
Returns:
dict: Configuration with model, base_url, etc.
"""
config_path = _hermes_home / 'config.yaml'
config = {
"model": DEFAULT_MODEL,
"base_url": DEFAULT_BASE_URL,
}
if config_path.exists():
try:
with open(config_path, "r") as f:
file_config = yaml.safe_load(f) or {}
# Get model from config
if "model" in file_config:
if isinstance(file_config["model"], str):
config["model"] = file_config["model"]
elif isinstance(file_config["model"], dict):
config["model"] = file_config["model"].get("default", DEFAULT_MODEL)
# Get base_url if specified
if "base_url" in file_config:
config["base_url"] = file_config["base_url"]
except Exception as e:
print(f"⚠️ Warning: Failed to load config.yaml: {e}")
return config
# ============================================================================
# RL-Specific Configuration
# ============================================================================
# Extended timeouts for long-running RL operations
RL_MAX_ITERATIONS = 200 # Allow many more iterations for long workflows
# RL-focused system prompt
RL_SYSTEM_PROMPT = """You are an automated post-training engineer specializing in reinforcement learning for language models.
## Your Capabilities
You have access to RL training tools for running reinforcement learning on models through Tinker-Atropos:
1. **DISCOVER**: Use `rl_list_environments` to see available RL environments
2. **INSPECT**: Read environment files to understand how they work (verifiers, data loading, rewards)
3. **INSPECT DATA**: Use terminal to explore HuggingFace datasets and understand their format
4. **CREATE**: Copy existing environments as templates, modify for your needs
5. **CONFIGURE**: Use `rl_select_environment` and `rl_edit_config` to set up training
6. **TEST**: Always use `rl_test_inference` before full training to validate your setup
7. **TRAIN**: Use `rl_start_training` to begin, `rl_check_status` to monitor
8. **EVALUATE**: Use `rl_get_results` and analyze WandB metrics to assess performance
## Environment Files
Environment files are located in: `tinker-atropos/tinker_atropos/environments/`
Study existing environments to learn patterns. Look for:
- `load_dataset()` calls - how data is loaded
- `score_answer()` / `score()` - verification logic
- `get_next_item()` - prompt formatting
- `system_prompt` - instruction format
- `config_init()` - default configuration
## Creating New Environments
To create a new environment:
1. Read an existing environment file (e.g., gsm8k_tinker.py)
2. Use terminal to explore the target dataset format
3. Copy the environment file as a template
4. Modify the dataset loading, prompt formatting, and verifier logic
5. Test with `rl_test_inference` before training
## Important Guidelines
- **Always test before training**: Training runs take hours - verify everything works first
- **Monitor metrics**: Check WandB for reward/mean and percent_correct
- **Status check intervals**: Wait at least 30 minutes between status checks
- **Early stopping**: Stop training early if metrics look bad or stagnant
- **Iterate quickly**: Start with small total_steps to validate, then scale up
## Available Toolsets
You have access to:
- **RL tools**: Environment discovery, config management, training, testing
- **Terminal**: Run commands, inspect files, explore datasets
- **Web**: Search for information, documentation, papers
- **File tools**: Read and modify code files
When asked to train a model, follow this workflow:
1. List available environments
2. Select and configure the appropriate environment
3. Test with sample prompts
4. Start training with conservative settings
5. Monitor progress and adjust as needed
"""
# Toolsets to enable for RL workflows
RL_TOOLSETS = ["terminal", "web", "rl"]
# ============================================================================
# Helper Functions
# ============================================================================
def check_requirements():
"""Check that all required environment variables and services are available."""
errors = []
# Check API keys
if not os.getenv("OPENROUTER_API_KEY"):
errors.append("OPENROUTER_API_KEY not set - required for agent")
missing_rl_keys = get_missing_keys()
if missing_rl_keys:
errors.append(f"Missing RL API keys: {', '.join(missing_rl_keys)}")
if errors:
print("❌ Missing requirements:")
for error in errors:
print(f" - {error}")
print("\nPlease set these environment variables in your .env file or shell.")
return False
return True
def check_tinker_atropos():
"""Check if tinker-atropos submodule is properly set up."""
tinker_path = Path(__file__).parent / "tinker-atropos"
if not tinker_path.exists():
return False, "tinker-atropos submodule not found. Run: git submodule update --init"
envs_path = tinker_path / "tinker_atropos" / "environments"
if not envs_path.exists():
return False, f"environments directory not found at {envs_path}"
env_files = list(envs_path.glob("*.py"))
env_files = [f for f in env_files if not f.name.startswith("_")]
return True, {"path": str(tinker_path), "environments_count": len(env_files)}
def list_environments_sync():
"""List available environments (synchronous wrapper)."""
from tools.rl_training_tool import rl_list_environments
import json
async def _list():
result = await rl_list_environments()
return json.loads(result)
return asyncio.run(_list())
# ============================================================================
# Main CLI
# ============================================================================
def main(
task: str = None,
model: str = None,
api_key: str = None,
base_url: str = None,
max_iterations: int = RL_MAX_ITERATIONS,
interactive: bool = False,
list_environments: bool = False,
check_server: bool = False,
verbose: bool = False,
save_trajectories: bool = True,
):
"""
RL Training CLI - Dedicated runner for RL training workflows.
Args:
task: The training task/goal (e.g., "Train a model on GSM8k for math")
model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
base_url: API base URL (reads from config or defaults to OpenRouter)
max_iterations: Maximum agent iterations (default: 200 for long workflows)
interactive: Run in interactive mode (multiple conversations)
list_environments: Just list available RL environments and exit
check_server: Check if RL API server is running and exit
verbose: Enable verbose logging
save_trajectories: Save conversation trajectories (default: True for RL)
Examples:
# Train on a specific environment
python rl_cli.py "Train a model on GSM8k math problems"
# Interactive mode
python rl_cli.py --interactive
# List available environments
python rl_cli.py --list-environments
# Check server status
python rl_cli.py --check-server
"""
# Load config from ~/.hermes/config.yaml
config = load_hermes_config()
# Use config values if not explicitly provided
if model is None:
model = config["model"]
if base_url is None:
base_url = config["base_url"]
print("🎯 RL Training Agent")
print("=" * 60)
# Handle setup check
if check_server:
print("\nπŸ” Checking tinker-atropos setup...")
ok, result = check_tinker_atropos()
if ok:
print("βœ… tinker-atropos submodule found")
print(f" Path: {result.get('path')}")
print(f" Environments found: {result.get('environments_count', 0)}")
# Also check API keys
missing = get_missing_keys()
if missing:
print(f"\n⚠️ Missing API keys: {', '.join(missing)}")
print(" Add them to ~/.hermes/.env")
else:
print("βœ… API keys configured")
else:
print(f"❌ tinker-atropos not set up: {result}")
print("\nTo set up:")
print(" git submodule update --init")
print(" pip install -e ./tinker-atropos")
return
# Handle environment listing
if list_environments:
print("\nπŸ“‹ Available RL Environments:")
print("-" * 40)
try:
data = list_environments_sync()
if "error" in data:
print(f"❌ Error: {data['error']}")
return
envs = data.get("environments", [])
if not envs:
print("No environments found.")
print("\nMake sure tinker-atropos is set up:")
print(" git submodule update --init")
return
for env in envs:
print(f"\n πŸ“¦ {env['name']}")
print(f" Class: {env['class_name']}")
print(f" Path: {env['file_path']}")
if env.get('description'):
desc = env['description'][:100] + "..." if len(env.get('description', '')) > 100 else env.get('description', '')
print(f" Description: {desc}")
print(f"\nπŸ“Š Total: {len(envs)} environments")
print("\nUse `rl_select_environment(name)` to select an environment for training.")
except Exception as e:
print(f"❌ Error listing environments: {e}")
print("\nMake sure tinker-atropos is set up:")
print(" git submodule update --init")
print(" pip install -e ./tinker-atropos")
return
# Check requirements
if not check_requirements():
sys.exit(1)
# Set default task if none provided
if not task and not interactive:
print("\n⚠️ No task provided. Use --interactive for interactive mode or provide a task.")
print("\nExamples:")
print(' python rl_cli.py "Train a model on GSM8k math problems"')
print(' python rl_cli.py "Create an RL environment for code generation"')
print(' python rl_cli.py --interactive')
return
# Get API key
api_key = api_key or os.getenv("OPENROUTER_API_KEY")
if not api_key:
print("❌ No API key provided. Set OPENROUTER_API_KEY or pass --api-key")
sys.exit(1)
print(f"\nπŸ€– Model: {model}")
print(f"πŸ”§ Max iterations: {max_iterations}")
print(f"πŸ“ Toolsets: {', '.join(RL_TOOLSETS)}")
print("=" * 60)
# Create agent with RL configuration
agent = AIAgent(
base_url=base_url,
api_key=api_key,
model=model,
max_iterations=max_iterations,
enabled_toolsets=RL_TOOLSETS,
save_trajectories=save_trajectories,
verbose_logging=verbose,
quiet_mode=False,
ephemeral_system_prompt=RL_SYSTEM_PROMPT,
)
if interactive:
# Interactive mode - multiple conversations
print("\nπŸ”„ Interactive RL Training Mode")
print("Type 'quit' or 'exit' to end the session.")
print("Type 'status' to check active training runs.")
print("-" * 40)
while True:
try:
user_input = input("\n🎯 RL Task> ").strip()
if not user_input:
continue
if user_input.lower() in ('quit', 'exit', 'q'):
print("\nπŸ‘‹ Goodbye!")
break
if user_input.lower() == 'status':
# Quick status check
from tools.rl_training_tool import rl_list_runs
import json
result = asyncio.run(rl_list_runs())
runs = json.loads(result)
if isinstance(runs, list) and runs:
print("\nπŸ“Š Active Runs:")
for run in runs:
print(f" - {run['run_id']}: {run['environment']} ({run['status']})")
else:
print("\nNo active runs.")
continue
# Run the agent
print("\n" + "=" * 60)
response = agent.run_conversation(user_input)
print("\n" + "=" * 60)
except KeyboardInterrupt:
print("\n\nπŸ‘‹ Interrupted. Goodbye!")
break
except Exception as e:
print(f"\n❌ Error: {e}")
if verbose:
import traceback
traceback.print_exc()
else:
# Single task mode
print(f"\nπŸ“ Task: {task}")
print("-" * 40)
try:
response = agent.run_conversation(task)
print("\n" + "=" * 60)
print("βœ… Task completed")
except KeyboardInterrupt:
print("\n\n⚠️ Interrupted by user")
except Exception as e:
print(f"\n❌ Error: {e}")
if verbose:
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
fire.Fire(main)