File size: 10,856 Bytes
39db0ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbb337d
39db0ca
 
 
 
 
 
 
dbb337d
39db0ca
 
 
 
 
 
 
 
 
dbb337d
 
 
 
39db0ca
cb276d8
 
 
 
 
 
39db0ca
 
 
5f8b28d
 
 
 
 
39db0ca
 
 
 
 
 
 
 
 
 
 
 
 
 
5f8b28d
 
39db0ca
c560f4f
 
 
39db0ca
 
c560f4f
39db0ca
 
 
08ed534
 
39db0ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c560f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
39db0ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbc0479
 
 
 
 
 
 
 
 
 
 
 
 
5f8b28d
764a584
5fe0328
764a584
 
 
 
5fe0328
 
764a584
5fe0328
 
 
 
 
 
 
 
 
 
 
764a584
 
 
 
 
1919b3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbc0479
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
"""
Trackio Module Interface for TRL Library
Provides the interface expected by TRL library while integrating with our custom monitoring system
"""

import os
import logging
from typing import Dict, Any, Optional
from datetime import datetime

# Import our custom monitoring
from monitoring import SmolLM3Monitor

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared monitor instance used by the functions in this module.
# It stays None until init() or set_monitor() assigns a monitor to it.
_monitor = None

def init(
    project_name: Optional[str] = None,
    experiment_name: Optional[str] = None,
    **kwargs
) -> str:
    """
    Create the module-wide monitor and start an experiment (TRL interface)

    Each setting below is taken from ``kwargs`` first and from the
    environment second:

    * ``trackio_url``: ``TRACKIO_URL``, then ``TRACKIO_SPACE_ID``
    * ``trackio_token``: ``TRACKIO_TOKEN``
    * ``hf_token``: ``HF_TOKEN``
    * ``dataset_repo``: ``TRACKIO_DATASET_REPO``
      (default ``'tonic/trackio-experiments'``)
    * ``monitoring_mode``: ``MONITORING_MODE`` (default ``'both'``)

    Args:
        project_name: Project name. When ``None``, the ``EXPERIMENT_NAME``
            environment variable is used, falling back to
            ``'smollm3_experiment'``.
        experiment_name: Experiment name. Used instead of ``project_name``
            whenever it is truthy.
        **kwargs: Optional overrides for the settings listed above.

    Returns:
        The monitor's ``experiment_id`` when it is set and truthy, otherwise
        a timestamped ``exp_fallback_<YYYYmmdd_HHMMSS>`` ID. The same kind of
        fallback ID is returned when initialization raises an ``Exception``
        (the error is logged, not propagated).
    """
    global _monitor

    try:
        env = os.environ

        # No explicit project name: consult the environment, then a fixed default.
        if project_name is None:
            project_name = env.get('EXPERIMENT_NAME', 'smollm3_experiment')

        # An explicit experiment name wins over the project name.
        exp_name = experiment_name if experiment_name else project_name

        # Explicit kwargs take precedence over environment variables.
        # trackio_url accepts either TRACKIO_URL (full URL or org/space)
        # or TRACKIO_SPACE_ID.
        resolved = {
            'trackio_url': (
                kwargs.get('trackio_url')
                or env.get('TRACKIO_URL')
                or env.get('TRACKIO_SPACE_ID')
            ),
            'trackio_token': kwargs.get('trackio_token') or env.get('TRACKIO_TOKEN'),
            'hf_token': kwargs.get('hf_token') or env.get('HF_TOKEN'),
            'dataset_repo': (
                kwargs.get('dataset_repo')
                or env.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
            ),
            'monitoring_mode': (
                kwargs.get('monitoring_mode')
                or env.get('MONITORING_MODE')
                or 'both'
            ),
        }

        _monitor = SmolLM3Monitor(
            experiment_name=exp_name,
            enable_tracking=True,
            log_artifacts=True,
            log_metrics=True,
            log_config=True,
            **resolved,
        )

        # Reuse the ID the monitor exposes rather than generating a local one,
        # so this module and the monitor agree on the experiment ID.
        experiment_id = getattr(_monitor, "experiment_id", None)
        logger.info(f"Trackio initialized for experiment: {exp_name}")
        logger.info(f"Experiment ID: {experiment_id}")

        if experiment_id:
            return experiment_id
        return f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    except Exception as exc:
        logger.error(f"Failed to initialize trackio: {exc}")
        # Callers still get a usable ID in the fallback format.
        return f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

def log(
    metrics: Dict[str, Any],
    step: Optional[int] = None,
    **kwargs
):
    """
    Forward metrics to the shared monitor (TRL interface)

    When a monitor is set, ``metrics`` and ``step`` are passed to its
    ``log_metrics`` method, followed by a ``log_system_metrics(step)`` call.
    When no monitor is set, a warning is logged and nothing else happens.
    Exceptions raised along the way are logged as errors, not propagated.

    Args:
        metrics: Dictionary of metrics to log
        step: Current training step
        **kwargs: Accepted for interface compatibility; not used here
    """
    global _monitor

    try:
        if _monitor is not None:
            # User-supplied metrics first, then the monitor's system metrics.
            _monitor.log_metrics(metrics, step)
            _monitor.log_system_metrics(step)
        else:
            logger.warning("Trackio not initialized, skipping log")
    except Exception as exc:
        logger.error(f"Failed to log metrics: {exc}")

def finish():
    """
    Close the active monitoring session (TRL interface)

    Logs a warning and does nothing else when no monitor has been set.
    Exceptions raised while closing are logged as errors, not propagated.
    """
    global _monitor

    try:
        if _monitor is not None:
            _monitor.close()
            logger.info("Trackio experiment finished")
        else:
            logger.warning("Trackio not initialized, skipping finish")
    except Exception as exc:
        logger.error(f"Failed to finish trackio experiment: {exc}")

def set_monitor(monitor: SmolLM3Monitor) -> None:
    """Install ``monitor`` as the instance shared by this module.

    External code (e.g., our trainer) can build a single
    `SmolLM3Monitor` and register it here so that `trackio.log/finish`
    act on that very object, which avoids mismatched experiment IDs.
    """
    global _monitor
    _monitor = monitor
    try:
        # Best-effort diagnostic only; a failure here must not undo the assignment.
        experiment_id = getattr(monitor, "experiment_id", None)
        logger.info("trackio monitor set: experiment_id=%s", experiment_id)
    except Exception:
        pass

def log_config(config: Dict[str, Any]):
    """
    Forward a configuration dictionary to the shared monitor (TRL interface)

    Logs a warning and does nothing else when no monitor has been set.
    Exceptions raised by the monitor are logged as errors, not propagated.

    Args:
        config: Configuration dictionary passed to the monitor's
            ``log_configuration`` method
    """
    global _monitor

    try:
        if _monitor is not None:
            _monitor.log_configuration(config)
        else:
            logger.warning("Trackio not initialized, skipping config log")
    except Exception as exc:
        logger.error(f"Failed to log config: {exc}")

def log_checkpoint(checkpoint_path: str, step: Optional[int] = None):
    """
    Forward a checkpoint path to the shared monitor (TRL interface)

    Logs a warning and does nothing else when no monitor has been set.
    Exceptions raised by the monitor are logged as errors, not propagated.

    Args:
        checkpoint_path: Path to the checkpoint file
        step: Current training step
    """
    global _monitor

    try:
        if _monitor is not None:
            _monitor.log_model_checkpoint(checkpoint_path, step)
        else:
            logger.warning("Trackio not initialized, skipping checkpoint log")
    except Exception as exc:
        logger.error(f"Failed to log checkpoint: {exc}")

def log_evaluation_results(results: Dict[str, Any], step: Optional[int] = None):
    """
    Forward evaluation results to the shared monitor (TRL interface)

    Logs a warning and does nothing else when no monitor has been set.
    Exceptions raised by the monitor are logged as errors, not propagated.

    Args:
        results: Evaluation results dictionary
        step: Current training step
    """
    global _monitor

    try:
        if _monitor is not None:
            _monitor.log_evaluation_results(results, step)
        else:
            logger.warning("Trackio not initialized, skipping evaluation log")
    except Exception as exc:
        logger.error(f"Failed to log evaluation results: {exc}")

# Additional utility functions for TRL compatibility
def get_experiment_url() -> Optional[str]:
    """Return the monitor's experiment URL, or ``None`` when no monitor is set"""
    global _monitor

    if _monitor is None:
        return None
    return _monitor.get_experiment_url()

def is_available() -> bool:
    """Report whether a monitor is set and has ``enable_tracking`` turned on"""
    if _monitor is None:
        return False
    # With a monitor present, the answer is whatever its enable_tracking holds.
    return _monitor.enable_tracking

def get_monitor():
    """Return the module's shared monitor object, or ``None`` if none has been set"""
    return _monitor

# Add config attribute for TRL compatibility
class TrackioConfig:
    """Dict-like configuration holder for trackio (TRL compatibility)

    Values are stored as plain instance attributes, so both attribute access
    (``config.project_name``) and mapping-style access
    (``config["project_name"]``) work. Mapping-style operations
    (``[]``, ``in``, ``get``, ``keys``, ``items``, iteration) consult only
    the stored values in the instance ``__dict__``, so methods and dunder
    attributes are never reported as configuration keys.
    """

    def __init__(self):
        """Populate the default values from environment variables."""
        env = os.environ
        self.project_name = env.get('EXPERIMENT_NAME', 'smollm3_experiment')
        self.experiment_name = env.get('EXPERIMENT_NAME', 'smollm3_experiment')
        self.trackio_url = env.get('TRACKIO_URL')
        self.trackio_token = env.get('TRACKIO_TOKEN')
        self.hf_token = env.get('HF_TOKEN')
        self.dataset_repo = env.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
        self.monitoring_mode = env.get('MONITORING_MODE', 'both')

    def update(self, config_dict: Optional[Dict[str, Any]] = None, **kwargs):
        """
        Update configuration with new values (TRL compatibility)

        Existing keys are overwritten and unknown keys are added. Values from
        ``config_dict`` are applied first, then ``kwargs``, so a keyword
        argument wins when the same key appears in both.

        Args:
            config_dict: Dictionary of configuration values to update (optional)
            **kwargs: Additional configuration values to update
        """
        if config_dict is not None:
            for key, value in config_dict.items():
                setattr(self, key, value)

        for key, value in kwargs.items():
            setattr(self, key, value)

    def __getitem__(self, key: str) -> Any:
        """
        Dictionary-style access to configuration values

        Args:
            key: Configuration key to access

        Returns:
            Configuration value

        Raises:
            KeyError: If ``key`` is not a stored configuration key
        """
        if key in self:
            return self.__dict__[key]
        raise KeyError(f"Configuration key '{key}' not found")

    def __setitem__(self, key: str, value: Any):
        """
        Dictionary-style assignment to configuration values

        Args:
            key: Configuration key to set
            value: Value to assign
        """
        setattr(self, key, value)

    def __contains__(self, key: str) -> bool:
        """
        Check if configuration key exists

        Args:
            key: Configuration key to check

        Returns:
            True if key is a stored configuration key, False otherwise
            (including for non-string keys)
        """
        # The isinstance guard keeps non-string (even unhashable) keys from
        # raising; they are simply reported as absent.
        return isinstance(key, str) and key in self.__dict__

    def __iter__(self):
        """Iterate over the configuration keys, like a dict."""
        # Without this, iteration would fall back to the legacy
        # __getitem__(0), __getitem__(1), ... protocol and fail.
        return iter(self.keys())

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get configuration value with default

        Args:
            key: Configuration key to access
            default: Default value if key doesn't exist

        Returns:
            Configuration value or default
        """
        if key in self:
            return self.__dict__[key]
        return default

    def keys(self):
        """
        Get all configuration keys

        Returns:
            List of configuration keys
        """
        return list(self.__dict__)

    def items(self):
        """
        Get all configuration key-value pairs

        Returns:
            List of (key, value) tuples
        """
        return list(self.__dict__.items())

    def __repr__(self):
        """String representation of configuration"""
        # NOTE: every stored value is rendered verbatim, including
        # trackio_token and hf_token; avoid logging this repr where
        # secrets must not appear.
        rendered = ', '.join(
            f"{key}={value!r}" for key, value in self.__dict__.items()
        )
        return f"TrackioConfig({rendered})"

# Module-level configuration object: a TrackioConfig instance created at
# import time.
config = TrackioConfig()