#!/usr/bin/env python3
"""
Script to generate data.json from training log files.
This script parses log files from the runs directory and extracts:
- Training metrics (loss, learning rate, inf/nan count)
- Evaluation results (paloma metrics)
- Model configuration parameters
The output is saved to plots/data.json for the dashboard.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
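
# Expected on-disk layout (run names and log filenames below are illustrative):
# each run directory under runs/ must contain a logs/ folder with at least one
# *.log file holding training metrics, otherwise the run is skipped.
#
#   runs/
#     my-run-1/
#       logs/
#         train.log
#     my-run-2/
#       logs/
#         train.log
#
# The parsed results are written to plots/data.json by default.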


def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training metrics from log content."""
    metrics = []

    # Pattern to match a four-line training metrics block (Step header, Loss,
    # Learning Rate, Inf/NaN count). The decorative emoji/tree characters that
    # prefix these log lines are matched loosely because their encoding varies.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- .*Training Metrics\n"
        + prefix + r".*Loss: ([\d.]+)\n"
        + prefix + r".*Learning Rate: ([\d.e+-]+)\n"
        + prefix + r".*Inf/NaN count: (\d+)"
    )
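    # For reference, the block this pattern targets looks roughly like this
    # (timestamps and values are illustrative, marker characters omitted):
    #   2024-01-01 12:00:00 - pico-train - INFO - Step 100 -- Training Metrics
    #   2024-01-01 12:00:00 - pico-train - INFO - Loss: 4.1234
    #   2024-01-01 12:00:00 - pico-train - INFO - Learning Rate: 3.0e-04
    #   2024-01-01 12:00:00 - pico-train - INFO - Inf/NaN count: 0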
    matches = re.findall(pattern, log_content)
    for step, loss, lr, inf_nan in matches:
        metrics.append(
            {
                "step": int(step),
                "loss": float(loss),
                "learning_rate": float(lr),
                "inf_nan_count": int(inf_nan),
            }
        )

    return sorted(metrics, key=lambda x: x["step"])


def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation results from log content."""
    results = []

    # Pattern to match a two-line evaluation block (Step header, paloma score),
    # with the decorative emoji/tree characters again matched loosely.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- .*Evaluation Results\n"
        + prefix + r".*paloma: ([\d.e+-]+)"
    )
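    # For reference, the block this pattern targets looks roughly like this
    # (timestamps and values are illustrative, marker characters omitted):
    #   2024-01-01 12:30:00 - pico-train - INFO - Step 100 -- Evaluation Results
    #   2024-01-01 12:30:00 - pico-train - INFO - paloma: 25.3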
    matches = re.findall(pattern, log_content)
    for step, paloma in matches:
        try:
            paloma_value = float(paloma)
            results.append({"step": int(step), "paloma": paloma_value})
        except ValueError:
            # Skip captures that do not parse as a float (e.g. a stray "e+" or
            # ".." picked up by the permissive character class above)
            continue

    return sorted(results, key=lambda x: x["step"])


def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration from log content."""
    config = {}

    # Extract key model parameters
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }
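    # These patterns assume the run configuration is dumped into the log as
    # plain "key: value" lines, for example (values are illustrative):
    #   d_model: 768
    #   n_layers: 12
    #   lr: 3.0e-4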
    for key, pattern in patterns.items():
        match = re.search(pattern, log_content)
        if match:
            try:
                if key in [
                    "d_model",
                    "n_layers",
                    "max_seq_len",
                    "vocab_size",
                    "max_steps",
                    "batch_size",
                ]:
                    config[key] = int(match.group(1))
                else:
                    config[key] = float(match.group(1))
            except ValueError:
                continue

    return config


def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Process a single run directory and extract all data."""
    run_name = run_path.name

    # Find log files
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None

    # Parse everything from the most recent log file
    latest_log = max(log_files, key=lambda x: x.stat().st_mtime)

    # Read log content
    log_content = latest_log.read_text(encoding="utf-8")

    # Extract data
    training_metrics = parse_training_metrics(log_content)
    evaluation_results = parse_evaluation_results(log_content)
    config = extract_config_from_log(log_content)

    # If no training metrics found, skip this run
    if not training_metrics:
        return None

    return {
        "run_name": run_name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": evaluation_results,
        "config": config,
    }
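

# For reference, process_run_directory returns a dictionary shaped like this
# (values are illustrative, lists abridged):
#   {
#       "run_name": "my-run-1",
#       "log_file": "train.log",
#       "training_metrics": [{"step": 100, "loss": 4.12, ...}],
#       "evaluation_results": [{"step": 100, "paloma": 25.3}],
#       "config": {"d_model": 768, "n_layers": 12, ...},
#   }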


def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Generate data.json from all run directories."""
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []

    # Process each run directory
    for run_dir in runs_path.iterdir():
        if run_dir.is_dir():
            print(f"Processing run: {run_dir.name}")
            run_data = process_run_directory(run_dir)
            if run_data:
                runs_data.append(run_data)
                print(f"  Found {len(run_data['training_metrics'])} training metrics")
                print(
                    f"  Found {len(run_data['evaluation_results'])} evaluation results"
                )
            else:
                print("  No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return
    # Create output data structure
    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }
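    # The generated JSON file therefore has the shape (abridged, values
    # illustrative):
    #   {
    #     "runs": [{"run_name": "my-run-1", "training_metrics": [...], ...}],
    #     "summary": {"total_runs": 1, "run_names": ["my-run-1"]}
    #   }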
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\nGenerated {output_file} with {len(runs_data)} runs")
    print(
        f"Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate data.json from training logs"
    )
    parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    parser.add_argument("--output", default="plots/data.json", help="Output file path")
    args = parser.parse_args()

    generate_data_json(args.runs_dir, args.output)
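
    # Example invocation (this script's filename is assumed to be
    # generate_data.py; adjust to the actual filename):
    #   python generate_data.py --runs-dir runs --output plots/data.json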