"""
Device Manager for BackgroundFX Pro
Handles device detection, optimization, and hardware compatibility
"""

# CRITICAL: Set OMP_NUM_THREADS before ANY other imports to prevent libgomp error
import os
if os.environ.get('OMP_NUM_THREADS') is None:
    # No OpenMP thread count was supplied by the caller: pin both the OpenMP
    # and MKL pools to 4 threads. MKL_NUM_THREADS is overwritten in this case
    # even if it was already set.
    os.environ.update({'OMP_NUM_THREADS': '4', 'MKL_NUM_THREADS': '4'})

import sys
import platform
import subprocess
import logging
from typing import Dict, Any, Optional, Tuple
from dataclasses import dataclass
from enum import Enum

import torch
import psutil
import cpuinfo

logger = logging.getLogger(__name__)


class DeviceType(Enum):
    """Enumeration of supported device types.

    The member values are the lowercase backend names that appear in log
    messages and in the ``'type'`` field of the diagnostics report.
    """
    # GPU reported by torch.cuda
    CUDA = "cuda"
    # Apple Silicon GPU exposed through torch.backends.mps
    MPS = "mps"
    # Host processor; always present in the detected device list
    CPU = "cpu"


@dataclass
class DeviceInfo:
    """Information about a compute device.

    One record is created per device found by
    ``DeviceManager._detect_devices``. Memory figures are raw byte counts;
    ``DeviceManager.get_system_diagnostics`` divides them by ``1024**3`` for
    its ``*_gb`` fields.
    """
    # Backend the device belongs to (CUDA, MPS or CPU).
    type: DeviceType
    # Device ordinal within its backend; always 0 for MPS and CPU entries.
    index: int
    # Human-readable device name (GPU name, CPU brand string, ...).
    name: str
    # Total memory. For MPS and CPU entries this is total system RAM.
    memory_total: int
    # Memory considered free when the device was detected. For CUDA this is
    # total memory minus torch.cuda.memory_allocated(); for MPS and CPU it is
    # the available system RAM reported by psutil.
    memory_available: int
    # (major, minor) CUDA compute capability; left as None for MPS and CPU.
    compute_capability: Optional[Tuple[int, int]] = None


class DeviceManager:
    """Manages compute devices and system optimization"""
    
    _instance = None
    
    def __init__(self):
        """Create the manager and immediately probe the host.

        After construction ``devices``, ``cpu_info``, ``system_info`` and
        ``optimal_device`` hold whatever the three probing steps stored in
        them.
        """
        # Start from an empty state; the probing steps below fill these in.
        self.system_info = {}
        self.cpu_info = None
        self.optimal_device = None
        self.devices = []

        # Order matters: the optimal device is chosen from the detected list.
        self._detect_devices()
        self._gather_system_info()
        self._determine_optimal_device()
    
    def _detect_devices(self):
        """Detect available compute devices and rebuild ``self.devices``.

        Entries are appended in this order: every CUDA device reported by
        PyTorch, then the MPS backend (if available), then the CPU, which is
        always added. Memory figures are stored as raw byte counts.
        """
        self.devices = []

        # Check for CUDA devices
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                self.devices.append(DeviceInfo(
                    type=DeviceType.CUDA,
                    index=i,
                    name=props.name,
                    memory_total=props.total_memory,
                    # NOTE(review): memory_allocated() appears to count only
                    # memory held by PyTorch tensors in this process, so
                    # usage by other processes is not reflected -- confirm
                    # whether that is acceptable for device selection.
                    memory_available=props.total_memory - torch.cuda.memory_allocated(i),
                    compute_capability=(props.major, props.minor)
                ))

        # Take a single RAM snapshot so the total/available figures reported
        # for the MPS and CPU entries are consistent with each other.
        ram = psutil.virtual_memory()

        # Check for MPS (Apple Silicon)
        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            # MPS doesn't provide detailed device info like CUDA, so system
            # RAM figures are used for the memory fields.
            self.devices.append(DeviceInfo(
                type=DeviceType.MPS,
                index=0,
                name="Apple Silicon GPU",
                memory_total=ram.total,
                memory_available=ram.available
            ))

        # CPU is always available
        try:
            cpu_name = cpuinfo.get_cpu_info().get('brand_raw', 'Unknown CPU')
        except Exception as e:
            # Best effort: fall back to the platform module, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            logger.debug(f"cpuinfo lookup failed, using platform.processor(): {e}")
            cpu_name = platform.processor() or "Unknown CPU"

        self.devices.append(DeviceInfo(
            type=DeviceType.CPU,
            index=0,
            name=cpu_name,
            memory_total=ram.total,
            memory_available=ram.available
        ))
    
    def _gather_system_info(self):
        """Collect CPU details and a summary of the host system.

        Sets ``self.cpu_info`` to the dict returned by
        ``cpuinfo.get_cpu_info()`` (or ``{}`` when that lookup fails) and
        ``self.system_info`` to a dict describing the platform, CPU counts,
        RAM (in bytes) and the Python/PyTorch versions.
        """
        try:
            self.cpu_info = cpuinfo.get_cpu_info()
        except Exception as e:
            # Best effort only; a bare except here would also have swallowed
            # KeyboardInterrupt and SystemExit.
            logger.debug(f"cpuinfo lookup failed: {e}")
            self.cpu_info = {}

        # One snapshot keeps ram_total and ram_available mutually consistent.
        ram = psutil.virtual_memory()

        self.system_info = {
            'platform': platform.system(),
            'platform_release': platform.release(),
            'platform_version': platform.version(),
            'architecture': platform.machine(),
            'processor': platform.processor(),
            'cpu_count': psutil.cpu_count(logical=False),
            'cpu_count_logical': psutil.cpu_count(logical=True),
            'ram_total': ram.total,
            'ram_available': ram.available,
            'python_version': sys.version,
            'torch_version': torch.__version__,
        }
    
    def _determine_optimal_device(self):
        """Pick the preferred device and store it in ``self.optimal_device``.

        Preference order is CUDA, then MPS, then CPU. Among several CUDA
        devices the one with the largest ``memory_available`` wins (the first
        such device on a tie). The choice is logged at INFO level.
        """
        def of_kind(kind):
            # Devices of one backend, in detection order.
            return [dev for dev in self.devices if dev.type == kind]

        gpus = of_kind(DeviceType.CUDA)
        if gpus:
            chosen = max(gpus, key=lambda dev: dev.memory_available)
        else:
            apple = of_kind(DeviceType.MPS)
            chosen = apple[0] if apple else of_kind(DeviceType.CPU)[0]

        self.optimal_device = chosen
        logger.info(f"Optimal device: {self.optimal_device.name} ({self.optimal_device.type.value})")
    
    def get_optimal_device(self) -> str:
        """Return the PyTorch device string for ``self.optimal_device``.

        Returns:
            ``"cuda:<index>"`` for a CUDA device, ``"mps"`` for the MPS
            backend, and ``"cpu"`` for anything else.
        """
        chosen = self.optimal_device
        kind = chosen.type
        if kind == DeviceType.CUDA:
            return f"cuda:{chosen.index}"
        return "mps" if kind == DeviceType.MPS else "cpu"
    
    def fix_cuda_compatibility(self):
        """Apply CUDA compatibility settings; does nothing without CUDA.

        When CUDA is available this:
          * defaults ``CUDA_LAUNCH_BLOCKING`` to ``'1'`` unless already set,
          * enables TF32 for matmul and cuDNN,
          * defaults ``PYTORCH_CUDA_ALLOC_CONF`` to ``max_split_size_mb:512``
            unless already set.

        Errors are logged as warnings rather than raised.
        """
        if not torch.cuda.is_available():
            logger.info("CUDA not available, skipping compatibility fixes")
            return

        try:
            # Synchronous kernel launches make CUDA errors surface at the call
            # that caused them, at a throughput cost. Respect an explicit
            # user setting (e.g. '0') instead of overwriting it.
            # NOTE(review): DeviceManager.__init__ has already queried CUDA
            # by the time this runs; confirm that environment variables set
            # here are still picked up by the CUDA runtime / allocator.
            os.environ.setdefault('CUDA_LAUNCH_BLOCKING', '1')

            # TF32 trades a little matmul/conv precision for speed. It only
            # exists on Ampere-class and newer GPUs; on older hardware these
            # flags have no effect.
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

            # Cap the size of blocks the caching allocator may split, to
            # limit fragmentation. Left alone if the user configured it.
            os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'max_split_size_mb:512')

            logger.info("CUDA compatibility settings applied")
        except Exception as e:
            logger.warning(f"Error applying CUDA compatibility fixes: {e}")
    
    def setup_optimal_threading(self):
        """Configure OpenMP, MKL and PyTorch thread counts for this host.

        If ``OMP_NUM_THREADS`` already holds a positive integer it is treated
        as authoritative: PyTorch's thread count is synced to it and
        ``MKL_NUM_THREADS`` is filled in only when missing. Otherwise the
        thread count is derived from the number of physical cores (capped at
        8, defaulting to 4 when the core count is unknown).

        Failures are caught and logged as a warning; the two environment
        variables then fall back to ``'4'`` if they are still unset.
        """
        try:
            # Skip if already configured (to avoid overwriting the early setting)
            current_omp = os.environ.get('OMP_NUM_THREADS')
            if current_omp and current_omp.isdigit() and int(current_omp) > 0:
                logger.info(f"Threading already configured: OMP_NUM_THREADS={current_omp}")
                # Just ensure PyTorch uses the same settings
                torch.set_num_threads(int(current_omp))

                # Ensure MKL matches OMP if it's not set
                os.environ.setdefault('MKL_NUM_THREADS', current_omp)
                return

            # Physical core count; psutil returns None when it cannot tell.
            physical_cores = psutil.cpu_count(logical=False)
            if physical_cores is None or physical_cores < 1:
                physical_cores = 4  # Default fallback

            num_threads = min(physical_cores, 8)  # Cap at 8 threads
            os.environ['OMP_NUM_THREADS'] = str(num_threads)

            # Set MKL threads for Intel processors
            if 'intel' in self.system_info.get('processor', '').lower():
                os.environ['MKL_NUM_THREADS'] = str(num_threads)

            # Set PyTorch threads
            torch.set_num_threads(num_threads)

            # For CUDA, limit inter-op parallelism of CPU-side work.
            if torch.cuda.is_available():
                try:
                    torch.set_num_interop_threads(2)
                except RuntimeError as e:
                    # PyTorch refuses to change this once parallel work has
                    # started or the value was already set; that is not a
                    # failure of the overall threading setup.
                    logger.debug(f"Could not set inter-op threads: {e}")

            logger.info(f"Threading configured: OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')}")

        except Exception as e:
            logger.warning(f"Error setting up threading: {e}")
            # Set safe defaults
            os.environ.setdefault('OMP_NUM_THREADS', '4')
            os.environ.setdefault('MKL_NUM_THREADS', '4')
    
    def get_system_diagnostics(self) -> Dict[str, Any]:
        """Build a diagnostics report for the host and its devices.

        Returns:
            A dict with the keys ``'system'`` (copy of ``self.system_info``),
            ``'devices'`` (one entry per detected device, memory in GiB),
            ``'optimal_device'`` (summary dict or ``None``), ``'threading'``
            (current OMP/MKL/PyTorch thread settings), ``'cuda'`` and
            ``'mps'`` (backend availability details).
        """
        gib = 1024**3

        # Per-device summaries, in detection order.
        device_entries = []
        for dev in self.devices:
            entry = {
                'type': dev.type.value,
                'index': dev.index,
                'name': dev.name,
                'memory_total_gb': dev.memory_total / gib,
                'memory_available_gb': dev.memory_available / gib,
            }
            capability = dev.compute_capability
            if capability:
                entry['compute_capability'] = f"{capability[0]}.{capability[1]}"
            device_entries.append(entry)

        # Summary of the preferred device, if one has been chosen.
        best = self.optimal_device
        best_summary = None
        if best:
            best_summary = {
                'type': best.type.value,
                'name': best.name,
                'pytorch_device': self.get_optimal_device()
            }

        # CUDA details; current_device is only queried once CUDA is initialized.
        if torch.cuda.is_available():
            cuda_summary = {
                'available': True,
                'version': torch.version.cuda,
                'device_count': torch.cuda.device_count(),
                'current_device': torch.cuda.current_device() if torch.cuda.is_initialized() else None,
            }
        else:
            cuda_summary = {'available': False}

        # MPS details; PyTorch builds without the mps backend module report
        # it as unavailable.
        if hasattr(torch.backends, 'mps'):
            mps_backend = torch.backends.mps
            mps_summary = {
                'available': mps_backend.is_available(),
                'built': mps_backend.is_built()
            }
        else:
            mps_summary = {'available': False}

        return {
            'system': self.system_info.copy(),
            'devices': device_entries,
            'optimal_device': best_summary,
            'threading': {
                'omp_num_threads': os.environ.get('OMP_NUM_THREADS', 'not set'),
                'mkl_num_threads': os.environ.get('MKL_NUM_THREADS', 'not set'),
                'torch_num_threads': torch.get_num_threads(),
            },
            'cuda': cuda_summary,
            'mps': mps_summary,
        }
    
    def get_device_for_model(self, model_size_gb: float = 2.0, *, overhead_factor: float = 1.5) -> str:
        """Get an appropriate device for a model of the given size.

        The requirement is compared against the ``memory_available`` value
        stored on each detected device record.

        Args:
            model_size_gb: Model size in GiB.
            overhead_factor: Multiplier applied to the model size to leave
                headroom for memory beyond the weights themselves. Defaults
                to 1.5.

        Returns:
            ``"cuda:<index>"`` for the first CUDA device with more than the
            required memory available, else ``"mps"`` if the MPS entry has
            enough, else ``"cpu"``.
        """
        required_memory = model_size_gb * 1024**3 * overhead_factor

        # Check CUDA devices first, in detection order.
        for device in self.devices:
            if device.type == DeviceType.CUDA and device.memory_available > required_memory:
                return f"cuda:{device.index}"

        # Check MPS (only the first MPS entry is considered).
        first_mps = next((d for d in self.devices if d.type == DeviceType.MPS), None)
        if first_mps is not None and first_mps.memory_available > required_memory:
            return "mps"

        # Fallback to CPU
        return "cpu"


# Singleton instance holder
_device_manager_instance = None


def get_device_manager() -> DeviceManager:
    """Return the shared DeviceManager, constructing it on first use."""
    global _device_manager_instance
    manager = _device_manager_instance
    if manager is None:
        # First call: build the manager and remember it for later callers.
        manager = _device_manager_instance = DeviceManager()
    return manager


def get_optimal_device() -> str:
    """
    Return the PyTorch device string chosen by the shared DeviceManager.

    Returns:
        str: Device string like 'cuda:0', 'mps', or 'cpu'
    """
    return get_device_manager().get_optimal_device()


def fix_cuda_compatibility():
    """
    Apply the shared DeviceManager's CUDA compatibility settings.
    Delegates to DeviceManager.fix_cuda_compatibility, which adjusts
    environment variables and PyTorch backend flags when CUDA is available.
    """
    get_device_manager().fix_cuda_compatibility()


def setup_optimal_threading():
    """
    Configure threading for the current system via the shared DeviceManager.
    Delegates to DeviceManager.setup_optimal_threading, which manages
    OMP_NUM_THREADS, MKL_NUM_THREADS, and the PyTorch thread count.
    """
    get_device_manager().setup_optimal_threading()


def get_system_diagnostics() -> Dict[str, Any]:
    """
    Return the shared DeviceManager's diagnostics report.

    Returns:
        Dict containing system info, device info, and configuration details
    """
    return get_device_manager().get_system_diagnostics()


# Initialize and configure on module import.
# This runs only when the file is imported (not when executed as a script),
# so importing the module has side effects: devices are probed and thread
# settings are applied.
if __name__ != "__main__":
    # When imported, automatically set up the device manager
    try:
        # Get the manager instance (threading is already configured at top of file).
        # The first call constructs the singleton, which runs device detection.
        manager = get_device_manager()
        # Only run setup_optimal_threading if needed (it will check internally):
        # with a valid OMP_NUM_THREADS it just syncs PyTorch/MKL to that value.
        manager.setup_optimal_threading()
    except Exception as e:
        # Import must not fail because of device probing; log and continue.
        logger.warning(f"Error during device manager initialization: {e}")