#!/usr/bin/env python3
"""
Script to generate data.json from training log files.

This script parses log files from the runs directory and extracts:
- Training metrics (loss, learning rate, inf/nan count)
- Evaluation results (paloma metrics)
- Model configuration parameters

The output is saved to plots/data.json for the dashboard.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional


def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training metrics from log content."""
    metrics = []

    # Pattern to match training metrics entries with timestamp and log level
    pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- šŸ”„ Training Metrics\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - ā”œā”€ā”€ Loss: ([\d.]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - ā”œā”€ā”€ Learning Rate: ([\d.e+-]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── Inf/NaN count: (\d+)"

    matches = re.findall(pattern, log_content)
    for step, loss, lr, inf_nan in matches:
        metrics.append(
            {
                "step": int(step),
                "loss": float(loss),
                "learning_rate": float(lr),
                "inf_nan_count": int(inf_nan),
            }
        )

    return sorted(metrics, key=lambda x: x["step"])


def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation results from log content."""
    results = []

    # Pattern to match evaluation results with timestamp and log level
    pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- šŸ“Š Evaluation Results\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── paloma: ([\d.e+-]+)"

    matches = re.findall(pattern, log_content)
    for step, paloma in matches:
        try:
            paloma_value = float(paloma)
            results.append({"step": int(step), "paloma": paloma_value})
        except ValueError:
            # Skip entries whose captured value cannot be parsed as a float
            continue

    return sorted(results, key=lambda x: x["step"])


def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration from log content."""
    config = {}

    # Extract key model parameters
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, log_content)
        if match:
            try:
                if key in [
                    "d_model",
                    "n_layers",
                    "max_seq_len",
                    "vocab_size",
                    "max_steps",
                    "batch_size",
                ]:
                    config[key] = int(match.group(1))
                else:
                    config[key] = float(match.group(1))
            except ValueError:
                continue

    return config


def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Process a single run directory and extract all data."""
    run_name = run_path.name

    # Find log files
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None

    # Use the most recent log file for configuration
    latest_log = max(log_files, key=lambda x: x.stat().st_mtime)

    # Read log content
    log_content = latest_log.read_text(encoding="utf-8")

    # Extract data
    training_metrics = parse_training_metrics(log_content)
    evaluation_results = parse_evaluation_results(log_content)
    config = extract_config_from_log(log_content)

    # If no training metrics found, skip this run
    if not training_metrics:
        return None

    return {
        "run_name": run_name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": evaluation_results,
        "config": config,
    }


def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Generate data.json from all run directories."""
    runs_path = Path(runs_dir)

    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []

    # Process each run directory
    for run_dir in runs_path.iterdir():
        if run_dir.is_dir():
            print(f"Processing run: {run_dir.name}")
            run_data = process_run_directory(run_dir)
            if run_data:
                runs_data.append(run_data)
                print(f" āœ“ Found {len(run_data['training_metrics'])} training metrics")
                print(
                    f" āœ“ Found {len(run_data['evaluation_results'])} evaluation results"
                )
            else:
                print(" āœ— No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return

    # Create output data structure
    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\nāœ“ Generated {output_file} with {len(runs_data)} runs")
    print(
        f"āœ“ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"āœ“ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate data.json from training logs"
    )
    parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    parser.add_argument("--output", default="plots/data.json", help="Output file path")

    args = parser.parse_args()
    generate_data_json(args.runs_dir, args.output)