#!/usr/bin/env python3
"""
Script to generate data.json from training log files.

This script parses log files from the runs directory and extracts:
- Training metrics (loss, learning rate, inf/nan count)
- Evaluation results (paloma metrics)
- Model configuration parameters

The output is saved to plots/data.json for the dashboard.
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional


def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training metrics from log content."""
    metrics = []

    # Pattern to match training metrics entries with timestamp and log level
    pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- πŸ”„ Training Metrics\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - β”œβ”€β”€ Loss: ([\d.]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - β”œβ”€β”€ Learning Rate: ([\d.e+-]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── Inf/NaN count: (\d+)"

    matches = re.findall(pattern, log_content)

    for step, loss, lr, inf_nan in matches:
        metrics.append(
            {
                "step": int(step),
                "loss": float(loss),
                "learning_rate": float(lr),
                "inf_nan_count": int(inf_nan),
            }
        )

    return sorted(metrics, key=lambda x: x["step"])


def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation results from log content."""
    results = []

    # Pattern to match evaluation results with timestamp and log level
    pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- πŸ“Š Evaluation Results\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── paloma: ([\d.e+-]+)"

    matches = re.findall(pattern, log_content)

    for step, paloma in matches:
        try:
            paloma_value = float(paloma)
            results.append({"step": int(step), "paloma": paloma_value})
        except ValueError:
            # Skip if paloma value is not a valid number (e.g., "inf")
            continue

    return sorted(results, key=lambda x: x["step"])


def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration from log content."""
    config = {}

    # Extract key model parameters
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }
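
    # Illustrative lines these patterns would pick up from a run's log; the values
    # below are made up, and any "key: value" occurrence in the log text matches:
    #   d_model: 768
    #   n_layers: 12
    #   max_seq_len: 2048
    #   lr: 3.0e-4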

    for key, pattern in patterns.items():
        match = re.search(pattern, log_content)
        if match:
            try:
                if key in [
                    "d_model",
                    "n_layers",
                    "max_seq_len",
                    "vocab_size",
                    "max_steps",
                    "batch_size",
                ]:
                    config[key] = int(match.group(1))
                else:
                    config[key] = float(match.group(1))
            except ValueError:
                continue

    return config


def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Process a single run directory and extract all data."""
    run_name = run_path.name

    # Find log files
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None

    # Use the most recent log file for configuration
    latest_log = max(log_files, key=lambda x: x.stat().st_mtime)

    # Read log content
    log_content = latest_log.read_text(encoding="utf-8")

    # Extract data
    training_metrics = parse_training_metrics(log_content)
    evaluation_results = parse_evaluation_results(log_content)
    config = extract_config_from_log(log_content)

    # If no training metrics found, skip this run
    if not training_metrics:
        return None

    return {
        "run_name": run_name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": evaluation_results,
        "config": config,
    }


def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Generate data.json from all run directories."""
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []

    # Process each run directory
    for run_dir in runs_path.iterdir():
        if run_dir.is_dir():
            print(f"Processing run: {run_dir.name}")
            run_data = process_run_directory(run_dir)
            if run_data:
                runs_data.append(run_data)
                print(f"  βœ“ Found {len(run_data['training_metrics'])} training metrics")
                print(
                    f"  βœ“ Found {len(run_data['evaluation_results'])} evaluation results"
                )
            else:
                print("  βœ— No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return

    # Create output data structure
    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }
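
    # Rough sketch (abridged) of the JSON structure written out below:
    #   {
    #     "runs": [
    #       {"run_name": ..., "log_file": ..., "training_metrics": [...],
    #        "evaluation_results": [...], "config": {...}},
    #       ...
    #     ],
    #     "summary": {"total_runs": ..., "run_names": [...]}
    #   }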

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\nβœ“ Generated {output_file} with {len(runs_data)} runs")
    print(
        f"βœ“ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"βœ“ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate data.json from training logs"
    )
    parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    parser.add_argument("--output", default="plots/data.json", help="Output file path")

    args = parser.parse_args()

    generate_data_json(args.runs_dir, args.output)
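
# Example invocation (the filename below is a placeholder; this script's actual
# name is not shown here):
#   python generate_data.py --runs-dir runs --output plots/data.json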