Spaces:
Running
Running
File size: 10,856 Bytes
39db0ca dbb337d 39db0ca dbb337d 39db0ca dbb337d 39db0ca cb276d8 39db0ca 5f8b28d 39db0ca 5f8b28d 39db0ca c560f4f 39db0ca c560f4f 39db0ca 08ed534 39db0ca c560f4f 39db0ca fbc0479 5f8b28d 764a584 5fe0328 764a584 5fe0328 764a584 5fe0328 764a584 1919b3b fbc0479 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 |
"""
Trackio Module Interface for TRL Library
Provides the interface expected by TRL library while integrating with our custom monitoring system
"""
import os
import logging
from typing import Dict, Any, Optional
from datetime import datetime
# Import our custom monitoring
from monitoring import SmolLM3Monitor
logger = logging.getLogger(__name__)
# Global monitor instance
_monitor = None
def init(
project_name: Optional[str] = None,
experiment_name: Optional[str] = None,
**kwargs
) -> str:
"""
Initialize trackio experiment (TRL interface)
Args:
project_name: Name of the project (optional, defaults to 'smollm3_experiment')
experiment_name: Name of the experiment (optional)
**kwargs: Additional configuration parameters
Returns:
Experiment ID
"""
global _monitor
try:
# Provide default project name if not provided
if project_name is None:
project_name = os.environ.get('EXPERIMENT_NAME', 'smollm3_experiment')
# Extract configuration from kwargs
# Accept both TRACKIO_URL (full URL or org/space) and TRACKIO_SPACE_ID
trackio_url = (
kwargs.get('trackio_url')
or os.environ.get('TRACKIO_URL')
or os.environ.get('TRACKIO_SPACE_ID')
)
trackio_token = kwargs.get('trackio_token') or os.environ.get('TRACKIO_TOKEN')
hf_token = kwargs.get('hf_token') or os.environ.get('HF_TOKEN')
dataset_repo = kwargs.get('dataset_repo') or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
monitoring_mode = (
kwargs.get('monitoring_mode')
or os.environ.get('MONITORING_MODE')
or 'both'
)
# Use experiment_name if provided, otherwise use project_name
exp_name = experiment_name or project_name
# Create monitor instance
_monitor = SmolLM3Monitor(
experiment_name=exp_name,
trackio_url=trackio_url,
trackio_token=trackio_token,
enable_tracking=True,
log_artifacts=True,
log_metrics=True,
log_config=True,
hf_token=hf_token,
dataset_repo=dataset_repo,
monitoring_mode=monitoring_mode,
)
# The monitor constructor creates the experiment remotely and sets
# `experiment_id`. Do NOT overwrite it with a locally generated ID.
experiment_id = getattr(_monitor, "experiment_id", None)
logger.info(f"Trackio initialized for experiment: {exp_name}")
logger.info(f"Experiment ID: {experiment_id}")
return experiment_id or f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
except Exception as e:
logger.error(f"Failed to initialize trackio: {e}")
# Return a fallback experiment ID - use the same format as our monitoring system
return f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
def log(
metrics: Dict[str, Any],
step: Optional[int] = None,
**kwargs
):
"""
Log metrics to trackio (TRL interface)
Args:
metrics: Dictionary of metrics to log
step: Current training step
**kwargs: Additional parameters
"""
global _monitor
try:
if _monitor is None:
logger.warning("Trackio not initialized, skipping log")
return
# Log metrics using our custom monitor
_monitor.log_metrics(metrics, step)
# Also log system metrics if available
_monitor.log_system_metrics(step)
except Exception as e:
logger.error(f"Failed to log metrics: {e}")
def finish():
"""
Finish trackio experiment (TRL interface)
"""
global _monitor
try:
if _monitor is None:
logger.warning("Trackio not initialized, skipping finish")
return
# Close the monitoring session
_monitor.close()
logger.info("Trackio experiment finished")
except Exception as e:
logger.error(f"Failed to finish trackio experiment: {e}")
def set_monitor(monitor: SmolLM3Monitor) -> None:
"""Set the shared monitor instance used by this module.
This allows external code (e.g., our trainer) to create a
`SmolLM3Monitor` once and have `trackio.log/finish` operate on
the exact same object, preventing mismatched experiment IDs.
"""
global _monitor
_monitor = monitor
try:
logger.info("trackio monitor set: experiment_id=%s", getattr(monitor, "experiment_id", None))
except Exception:
pass
def log_config(config: Dict[str, Any]):
"""
Log configuration to trackio (TRL interface)
Args:
config: Configuration dictionary to log
"""
global _monitor
try:
if _monitor is None:
logger.warning("Trackio not initialized, skipping config log")
return
# Log configuration using our custom monitor
_monitor.log_configuration(config)
except Exception as e:
logger.error(f"Failed to log config: {e}")
def log_checkpoint(checkpoint_path: str, step: Optional[int] = None):
"""
Log checkpoint to trackio (TRL interface)
Args:
checkpoint_path: Path to the checkpoint file
step: Current training step
"""
global _monitor
try:
if _monitor is None:
logger.warning("Trackio not initialized, skipping checkpoint log")
return
# Log checkpoint using our custom monitor
_monitor.log_model_checkpoint(checkpoint_path, step)
except Exception as e:
logger.error(f"Failed to log checkpoint: {e}")
def log_evaluation_results(results: Dict[str, Any], step: Optional[int] = None):
"""
Log evaluation results to trackio (TRL interface)
Args:
results: Evaluation results dictionary
step: Current training step
"""
global _monitor
try:
if _monitor is None:
logger.warning("Trackio not initialized, skipping evaluation log")
return
# Log evaluation results using our custom monitor
_monitor.log_evaluation_results(results, step)
except Exception as e:
logger.error(f"Failed to log evaluation results: {e}")
# Additional utility functions for TRL compatibility
def get_experiment_url() -> Optional[str]:
"""Get the URL to view the experiment"""
global _monitor
if _monitor is not None:
return _monitor.get_experiment_url()
return None
def is_available() -> bool:
"""Check if trackio is available and initialized"""
return _monitor is not None and _monitor.enable_tracking
def get_monitor():
"""Get the current monitor instance (for advanced usage)"""
return _monitor
# Add config attribute for TRL compatibility
class TrackioConfig:
"""Configuration class for trackio (TRL compatibility)"""
def __init__(self):
self.project_name = os.environ.get('EXPERIMENT_NAME', 'smollm3_experiment')
self.experiment_name = os.environ.get('EXPERIMENT_NAME', 'smollm3_experiment')
self.trackio_url = os.environ.get('TRACKIO_URL')
self.trackio_token = os.environ.get('TRACKIO_TOKEN')
self.hf_token = os.environ.get('HF_TOKEN')
self.dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
self.monitoring_mode = os.environ.get('MONITORING_MODE', 'both')
def update(self, config_dict: Dict[str, Any] = None, **kwargs):
"""
Update configuration with new values (TRL compatibility)
Args:
config_dict: Dictionary of configuration values to update (optional)
**kwargs: Additional configuration values to update
"""
# Handle both dictionary and keyword arguments
if config_dict is not None:
for key, value in config_dict.items():
if hasattr(self, key):
setattr(self, key, value)
else:
# Add new attributes dynamically
setattr(self, key, value)
# Handle keyword arguments
for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
else:
# Add new attributes dynamically
setattr(self, key, value)
def __getitem__(self, key: str) -> Any:
"""
Dictionary-style access to configuration values
Args:
key: Configuration key to access
Returns:
Configuration value
"""
if hasattr(self, key):
return getattr(self, key)
else:
raise KeyError(f"Configuration key '{key}' not found")
def __setitem__(self, key: str, value: Any):
"""
Dictionary-style assignment to configuration values
Args:
key: Configuration key to set
value: Value to assign
"""
setattr(self, key, value)
def __contains__(self, key: str) -> bool:
"""
Check if configuration key exists
Args:
key: Configuration key to check
Returns:
True if key exists, False otherwise
"""
return hasattr(self, key)
def get(self, key: str, default: Any = None) -> Any:
"""
Get configuration value with default
Args:
key: Configuration key to access
default: Default value if key doesn't exist
Returns:
Configuration value or default
"""
if hasattr(self, key):
return getattr(self, key)
else:
return default
def keys(self):
"""
Get all configuration keys
Returns:
List of configuration keys
"""
# Use __dict__ to avoid recursion with dir()
return list(self.__dict__.keys())
def items(self):
"""
Get all configuration key-value pairs
Returns:
List of (key, value) tuples
"""
# Use __dict__ to avoid recursion
return list(self.__dict__.items())
def __repr__(self):
"""String representation of configuration"""
# Use __dict__ to avoid recursion
attrs = []
for key, value in self.__dict__.items():
attrs.append(f"{key}={repr(value)}")
return f"TrackioConfig({', '.join(attrs)})"
# Create config instance
config = TrackioConfig() |