"""
Script to generate data.json from training log files.

This script parses log files from the runs directory and extracts:
- Training metrics (loss, learning rate, inf/nan count)
- Evaluation results (paloma metrics)
- Model configuration parameters

The output is saved to plots/data.json for the dashboard.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional


def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training metrics from log content."""
    metrics = []
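    # A training-metrics entry is assumed to span four consecutive log lines
    # (step header, loss, learning rate, inf/nan count), each prefixed with a
    # timestamp and "pico-train - INFO"; the decorative emoji / tree-drawing
    # characters before each field name are matched loosely with ".*".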
    pattern = (
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r"Step (\d+) -- .*Training Metrics\n"
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r".*Loss: ([\d.]+)\n"
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r".*Learning Rate: ([\d.e+-]+)\n"
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r".*Inf/NaN count: (\d+)"
    )

    matches = re.findall(pattern, log_content)

    for step, loss, lr, inf_nan in matches:
        metrics.append(
            {
                "step": int(step),
                "loss": float(loss),
                "learning_rate": float(lr),
                "inf_nan_count": int(inf_nan),
            }
        )

    return sorted(metrics, key=lambda x: x["step"])


def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation results from log content."""
    results = []
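    # An evaluation entry is assumed to span two consecutive log lines: a step
    # header followed by a "paloma: <value>" line; the decorative prefix
    # characters are again matched loosely.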
    pattern = (
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r"Step (\d+) -- .*Evaluation Results\n"
        r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
        r".*paloma: ([\d.e+-]+)"
    )

    matches = re.findall(pattern, log_content)

    for step, paloma in matches:
        try:
            paloma_value = float(paloma)
            results.append({"step": int(step), "paloma": paloma_value})
        except ValueError:
            # Skip entries whose paloma value cannot be parsed as a float.
            continue

    return sorted(results, key=lambda x: x["step"])


def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration from log content."""
    config = {}
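    # Config values are read from "key: value" lines in the log; only the
    # first occurrence of each key is used.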
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, log_content)
        if match:
            try:
                if key in [
                    "d_model",
                    "n_layers",
                    "max_seq_len",
                    "vocab_size",
                    "max_steps",
                    "batch_size",
                ]:
                    config[key] = int(match.group(1))
                else:
                    config[key] = float(match.group(1))
            except ValueError:
                continue

    return config


def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Process a single run directory and extract all data."""
    run_name = run_path.name

    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None
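    # When a run has several log files, use the most recently modified one.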
    latest_log = max(log_files, key=lambda x: x.stat().st_mtime)

    log_content = latest_log.read_text(encoding="utf-8")

    training_metrics = parse_training_metrics(log_content)
    evaluation_results = parse_evaluation_results(log_content)
    config = extract_config_from_log(log_content)
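    # A run without any parsed training metrics is skipped as having no valid data.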
    if not training_metrics:
        return None

    return {
        "run_name": run_name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": evaluation_results,
        "config": config,
    }


def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Generate data.json from all run directories."""
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []
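    # Each immediate subdirectory of the runs directory is treated as one run.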
    for run_dir in runs_path.iterdir():
        if run_dir.is_dir():
            print(f"Processing run: {run_dir.name}")
            run_data = process_run_directory(run_dir)
            if run_data:
                runs_data.append(run_data)
                print(f" ✓ Found {len(run_data['training_metrics'])} training metrics")
                print(
                    f" ✓ Found {len(run_data['evaluation_results'])} evaluation results"
                )
            else:
                print(" ✗ No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return
    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
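    # Write pretty-printed JSON; ensure_ascii=False keeps any non-ASCII run
    # names readable in the output file.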
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Generated {output_file} with {len(runs_data)} runs")
    print(
        f"✓ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"✓ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )
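# Example invocation (script filename illustrative):
#   python generate_data.py --runs-dir runs --output plots/data.json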
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate data.json from training logs"
    )
    parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    parser.add_argument("--output", default="plots/data.json", help="Output file path")

    args = parser.parse_args()

    generate_data_json(args.runs_dir, args.output)