#!/usr/bin/env python3
"""
Script to generate data.json from training log files.
This script parses log files from the runs directory and extracts:
- Training metrics (loss, learning rate, inf/nan count)
- Evaluation results (paloma metrics)
- Model configuration parameters
The output is saved to plots/data.json for the dashboard.
"""
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional


def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
"""Parse training metrics from log content."""
metrics = []
# Pattern to match training metrics entries with timestamp and log level
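    # For reference, the pattern expects a four-line block shaped like this
    # (timestamps and values illustrative, reconstructed from the regex itself):
    #   2025-01-01 12:00:00 - pico-train - INFO - Step 100 -- 🔄 Training Metrics
    #   2025-01-01 12:00:00 - pico-train - INFO - ├── Loss: 4.1234
    #   2025-01-01 12:00:00 - pico-train - INFO - ├── Learning Rate: 3.0e-04
    #   2025-01-01 12:00:00 - pico-train - INFO - └── Inf/NaN count: 0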
pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- πŸ”„ Training Metrics\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - β”œβ”€β”€ Loss: ([\d.]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - β”œβ”€β”€ Learning Rate: ([\d.e+-]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── Inf/NaN count: (\d+)"
matches = re.findall(pattern, log_content)
for step, loss, lr, inf_nan in matches:
metrics.append(
{
"step": int(step),
"loss": float(loss),
"learning_rate": float(lr),
"inf_nan_count": int(inf_nan),
}
)
    return sorted(metrics, key=lambda x: x["step"])


def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
"""Parse evaluation results from log content."""
results = []
# Pattern to match evaluation results with timestamp and log level
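    # For reference, the expected two-line block (values illustrative,
    # reconstructed from the regex itself):
    #   2025-01-01 12:00:00 - pico-train - INFO - Step 100 -- 📊 Evaluation Results
    #   2025-01-01 12:00:00 - pico-train - INFO - └── paloma: 21.5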
pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- πŸ“Š Evaluation Results\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── paloma: ([\d.e+-]+)"
matches = re.findall(pattern, log_content)
for step, paloma in matches:
try:
paloma_value = float(paloma)
results.append({"step": int(step), "paloma": paloma_value})
except ValueError:
            # Skip entries whose captured value is not a valid float
            # (e.g. a malformed capture like "1.2.3")
continue
    return sorted(results, key=lambda x: x["step"])


def extract_config_from_log(log_content: str) -> Dict[str, Any]:
"""Extract model configuration from log content."""
config = {}
# Extract key model parameters
patterns = {
"d_model": r"d_model: (\d+)",
"n_layers": r"n_layers: (\d+)",
"max_seq_len": r"max_seq_len: (\d+)",
"vocab_size": r"vocab_size: (\d+)",
"lr": r"lr: ([\d.e+-]+)",
"max_steps": r"max_steps: (\d+)",
"batch_size": r"batch_size: (\d+)",
}
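    # Each entry matches a bare "key: value" line in the logged config dump,
    # e.g. "d_model: 512" or "lr: 3.0e-4" (values illustrative);
    # re.search picks up the first occurrence in the file.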
for key, pattern in patterns.items():
match = re.search(pattern, log_content)
if match:
try:
if key in [
"d_model",
"n_layers",
"max_seq_len",
"vocab_size",
"max_steps",
"batch_size",
]:
config[key] = int(match.group(1))
else:
config[key] = float(match.group(1))
except ValueError:
continue
    return config


def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
"""Process a single run directory and extract all data."""
run_name = run_path.name
# Find log files
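    # Expected layout, as implied by the lookups below: <runs_dir>/<run_name>/logs/*.log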
logs_dir = run_path / "logs"
if not logs_dir.exists():
return None
log_files = list(logs_dir.glob("*.log"))
if not log_files:
return None
    # Parse the most recent log file; metrics, eval results, and config all come from it
latest_log = max(log_files, key=lambda x: x.stat().st_mtime)
# Read log content
log_content = latest_log.read_text(encoding="utf-8")
# Extract data
training_metrics = parse_training_metrics(log_content)
evaluation_results = parse_evaluation_results(log_content)
config = extract_config_from_log(log_content)
# If no training metrics found, skip this run
if not training_metrics:
return None
return {
"run_name": run_name,
"log_file": latest_log.name,
"training_metrics": training_metrics,
"evaluation_results": evaluation_results,
"config": config,
    }


def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
"""Generate data.json from all run directories."""
runs_path = Path(runs_dir)
if not runs_path.exists():
print(f"Runs directory {runs_dir} not found!")
return
runs_data = []
# Process each run directory
for run_dir in runs_path.iterdir():
if run_dir.is_dir():
print(f"Processing run: {run_dir.name}")
run_data = process_run_directory(run_dir)
if run_data:
runs_data.append(run_data)
print(f" βœ“ Found {len(run_data['training_metrics'])} training metrics")
print(
f" βœ“ Found {len(run_data['evaluation_results'])} evaluation results"
)
else:
print(" βœ— No valid data found")
if not runs_data:
print("No valid runs found!")
return
# Create output data structure
output_data = {
"runs": runs_data,
"summary": {
"total_runs": len(runs_data),
"run_names": [run["run_name"] for run in runs_data],
},
}
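    # Resulting JSON shape (values illustrative):
    #   {
    #     "runs": [{"run_name": "...", "log_file": "...", "training_metrics": [...],
    #               "evaluation_results": [...], "config": {...}}],
    #     "summary": {"total_runs": 1, "run_names": ["..."]}
    #   }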
# Ensure output directory exists
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write to file
with open(output_path, "w", encoding="utf-8") as f:
json.dump(output_data, f, indent=2, ensure_ascii=False)
print(f"\nβœ“ Generated {output_file} with {len(runs_data)} runs")
print(
f"βœ“ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
)
print(
f"βœ“ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Generate data.json from training logs"
)
parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
parser.add_argument("--output", default="plots/data.json", help="Output file path")
args = parser.parse_args()
generate_data_json(args.runs_dir, args.output)
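
# Example invocation (the filename is hypothetical; use whatever this script
# is saved as, and note the arguments shown are the argparse defaults above):
#   python generate_data.py --runs-dir runs --output plots/data.json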