print("|| RAM ||")
import yaml
import argparse
import torch
import random
import os
import numpy as np
from tqdm import tqdm
from models.vae import VAE
from torch.utils.data.dataloader import DataLoader
from torch.optim import Adam
from models.discriminator import Discriminator
import imageio.v3 as iio
import lpips
import gc
#import time
import OpenEXR
import Imath
import torchvision.utils as vutils

def strip_prefix_if_present(state_dict, prefix="_orig_mod."):
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith(prefix):
            new_state_dict[k[len(prefix):]] = v
        else:
            new_state_dict[k] = v
    return new_state_dict


# Add this at the top of your script, before importing imageio
from dataloader_image_hyperism import HDRGrayscaleEXRDataset_new,ImageDataset,ImageDataset_d
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
#print("imported all the libraries")
import torch

#device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"Using device: {torch.cuda.get_device_name(device)} (CUDA:{torch.cuda.current_device()})")
else:
    print("Using device: CPU")

loss_fn_alex = lpips.LPIPS(net='vgg').to(device)
# Set the device to GPU if available

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau

import wandb
wandb.init(project="ldr_sh_vae_training_trial")

from torch.utils.data import Dataset

def check_nan(tensor, name="tensor"):
    """Check if tensor contains NaN values and print debugging info"""
    if torch.isnan(tensor).any():
        print(f"NaN detected in {name}")
        non_nan_mask = ~torch.isnan(tensor)
        if non_nan_mask.any():
            print(f"Non-NaN values stats - Min: {tensor[non_nan_mask].min().item()}, Max: {tensor[non_nan_mask].max().item()}")
        return True
    return False

def to_rgb(image):
    if image.shape[1] == 1:
        return image.repeat(1, 3, 1, 1)
    return image


def save_training_samples(output, gt_image, scene_infos, train_config, step_count, img_save_count):
    
    # Try to import OpenEXR if available
    try:
        has_openexr = True
    except ImportError:
        has_openexr = False
        print("Warning: OpenEXR not available, falling back to imageio")

    sample_size = min(8, output.shape[0])
    gt_image    = gt_image[:sample_size].detach().cpu()
    save_output = output[:sample_size].detach().cpu()#.numpy()

    # Apply normalization
    # epsilon = 1e-3  # Small value to prevent division by zero
    # save_output = np.where(save_output == -1, save_output + epsilon, save_output)
    # save_output = (1 - save_output) / (1 + save_output)  # Normalize to [0, 1]
    # save_output = torch.clip((save_output + 1) / 2.0, 0.0, 1.0)
    # Base save path
    base_save_path = os.path.join('/home/project/dataset/Hyperism', train_config['task_name'], 'vae_autoencoder_samples_1')

    def save_exr(filepath, data):
        """Helper function to save EXR files using either OpenEXR or imageio
        
        Args:
            filepath (str): Path to save the EXR file
            data (ndarray): Image data in shape (H, W), (H, W, C) or (C, H, W) format
            
        Returns:
            bool: True if save successful, False otherwise
        """
        # Handle (C, H, W) format by converting to (H, W, C)
        if len(data.shape) == 3 and (data.shape[0] == 3 or data.shape[0] == 1):
            if data.shape[0] == 3 and data.shape[1] > 3 and data.shape[2] > 3:  # Likely (C, H, W) format
                data = np.transpose(data, (1, 2, 0))
        
        if has_openexr and len(data.shape) == 2:
            # For grayscale images
            data_flat = data.astype(np.float32).tobytes()
            header = OpenEXR.Header(data.shape[1], data.shape[0])
            header['channels'] = {'Y': Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT))}
            
            exr = OpenEXR.OutputFile(filepath, header)
            exr.writePixels({'Y': data_flat})
            exr.close()
            return True
        elif has_openexr and len(data.shape) == 3 and data.shape[2] == 3:
            # For RGB images in (H, W, C) format
            R = data[:,:,0].astype(np.float32).tobytes()
            G = data[:,:,1].astype(np.float32).tobytes()
            B = data[:,:,2].astype(np.float32).tobytes()
            
            header = OpenEXR.Header(data.shape[1], data.shape[0])
            header['channels'] = {
                'R': Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)),
                'G': Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)),
                'B': Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT))
            }
            
            exr = OpenEXR.OutputFile(filepath, header)
            exr.writePixels({'R': R, 'G': G, 'B': B})
            exr.close()
            return True
        else:
            # Fall back to imageio with tifffile plugin (which can handle EXR)
            try:
                # Ensure we're using a format imageio can handle
                if len(data.shape) == 3 and data.shape[0] == 3 and data.shape[1] > 3 and data.shape[2] > 3:
                    # Convert from (C, H, W) to (H, W, C) for imageio
                    data = np.transpose(data, (1, 2, 0))
                    
                iio.imwrite(filepath, data, plugin='tifffile', photometric='rgb')
                return True
            except Exception as e:
                print(f"Error saving with tifffile plugin: {e}")
                try:
                    # Last resort - try PNG instead of EXR
                    png_path = filepath.replace('.exr', '.png')
                    
                    # Ensure data is in correct shape for PNG
                    if len(data.shape) == 3 and data.shape[0] == 3 and data.shape[1] > 3 and data.shape[2] > 3:
                        data = np.transpose(data, (1, 2, 0))
                        
                    iio.imwrite(png_path, data)
                    print(f"Saved as PNG instead: {png_path}")
                    return True
                except Exception as e2:
                    print(f"Failed to save image: {e2}")
                    return False
                
 
    collage = torch.cat([save_output, gt_image], dim=0)
    os.makedirs("/home/project/dataset/Hyperism/ldr_to_sh_1/vae_autoencoder_samples/", exist_ok=True)
    output_path = f"/home/project/dataset/Hyperism/ldr_to_sh_1/vae_autoencoder_samples/{step_count}.png"
    vutils.save_image(collage, output_path, nrow=4, normalize=True)
    # Also save a simple numbered output for easy viewing
    simple_save_path = os.path.join(base_save_path, 'numbered_samples')
    os.makedirs(simple_save_path, exist_ok=True)

    return img_save_count + 1

# Create a combined dataset class
class CombinedDataset(Dataset):
    def __init__(self, sh_dataset):
        """
        A dataset that matches corresponding images across the three datasets based on scene metadata.
        
        Args:
            sh_dataset: The HDRGrayscaleEXRDataset for spherical harmonics shading
            albedo_dataset: The ImageDataset for albedo (diffuse_reflectance.exr)
            ldr_dataset: The ImageDataset for LDR input (dequantize.exr)
        """
        self.sh_dataset = sh_dataset
        #self.albedo_dataset = albedo_dataset
        # self.ldr_dataset = ldr_dataset
        
        # Create a mapping from scene info to indices for each dataset
        self.matching_indices = self._find_matching_indices()
       
    def _find_matching_indices(self):
        """Find matching indices across all three datasets based on scene info"""
        # Create dictionaries to map scene info to indices for each dataset
        sh_indices = {}
        #sh_indices = {}
       
        for idx in range(len(self.sh_dataset)):
            info = self.sh_dataset.get_scene_info(idx)
            key = (info['ai_folder'], info['scene_folder'], info['frame_num'])
            sh_indices[key] = idx
        
        sh_keys = set(sh_indices.keys())
        # ldr_keys = set(ldr_indices.keys())

        common_keys = sh_keys

        # Create a list of matching indices
        matching_indices = [
            (sh_indices[key]) 
            for key in common_keys
        ]
        
        return matching_indices
    
    def __len__(self):
        return len(self.matching_indices)
    
    def __getitem__(self, idx):
        # Get the matching indices for all three datasets
        sh_idx= self.matching_indices[idx]
        
        # Get the items from each dataset
        sh_image = self.sh_dataset[sh_idx]
        #albedo_image = self.sh_dataset[sh_idx]
        # ldr_image = self.ldr_dataset[ldr_idx]
        
        # Also store the scene info for saving output images
        info = self.sh_dataset.get_scene_info(sh_idx)

        return sh_image, info


def kl_divergence(mu, logvar):
    """
    Compute the KL divergence between the encoded distribution and a standard normal distribution.
    This version includes proper batch averaging and clipping to prevent numerical issues.
    """
    # Clamp logvar to prevent extreme values
    logvar = torch.clamp(logvar, min=-10, max=10)
    
    # Calculate KL divergence term by term
   
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    kl_loss = kl_loss / (logvar.size(0) *  logvar.size(1)* logvar.size(2) * logvar.size(3))
    # Average over batch dimension
    return kl_loss.mean()

# Function to evaluate the model on validation set
def validate(model, val_loader, discriminator, recon_criterion, disc_criterion, train_config, kl_weight,step_count, disc_step_start):
    model.eval()
    discriminator.eval()
    val_recon_losses_sh = []
    val_kl_losses = []
    val_perceptual_losses = []
    val_disc_losses = []
    val_gen_losses = []
    val_total_losses = []
    # For discriminator predictions
    val_real_preds = []
    val_fake_preds = []
    
    
    with torch.no_grad():
        for batch in val_loader:
            sh_im,_ = batch
            
            # Convert to float and move to device
            sh_im = sh_im.float().to(device)
            #sh_im = sh_im.float().to(device)
            #ldr_im = ldr_im.float().to(device)
                       
            # Get model output
            model_output = model(sh_im)
            output, z, _ = model_output
            mean, logvar = torch.chunk(z, 2, dim=1)
        
            # Calculate reconstruction loss for shading
            recon_loss = recon_criterion(output, sh_im)
            recon_loss = recon_loss / train_config['autoencoder_acc_steps']
            val_recon_losses_sh.append(train_config['sh_weight']*recon_loss.item())

            # Calculate KL loss
            kl_loss = kl_divergence(mean, logvar)
            kl_loss = kl_loss / train_config['autoencoder_acc_steps']
            val_kl_losses.append(kl_weight * kl_loss.item())
            
            output_rgb = to_rgb(output)
            sh_im_rgb = to_rgb(sh_im)
            # Calculate perceptual loss
            lpips_loss = (loss_fn_alex(output_rgb.to(device), sh_im_rgb.to(device)).mean()) # Ensure lpips_loss is a scalar
            lpips_loss = lpips_loss / train_config['autoencoder_acc_steps']
            val_perceptual_losses.append(train_config['perceptual_weight'] * lpips_loss.item())

            gen_loss = 0
            if step_count > disc_step_start :
                # Discriminator predictions
                disc_fake_pred = discriminator(output)
                disc_real_pred = discriminator(sh_im)

                # Store predictions
                real_probs = torch.sigmoid(disc_real_pred).mean().item()
                fake_probs = torch.sigmoid(disc_fake_pred).mean().item()
                val_real_preds.append(real_probs)
                val_fake_preds.append(fake_probs)

                # Generator adversarial loss
                gen_loss = disc_criterion(disc_fake_pred, torch.ones_like(disc_fake_pred))
                gen_loss = gen_loss / train_config['autoencoder_acc_steps']
                val_gen_losses.append(train_config['disc_weight'] * gen_loss.item())

                # Discriminator adversarial loss
                disc_fake_loss = disc_criterion(disc_fake_pred, torch.zeros_like(disc_fake_pred))
                disc_real_loss = disc_criterion(disc_real_pred, torch.ones_like(disc_real_pred))
                disc_loss = (disc_fake_loss + disc_real_loss) / 2
                disc_loss = disc_loss / train_config['autoencoder_acc_steps']
                val_disc_losses.append(train_config['disc_weight'] * disc_loss.item())

            # Calculate total loss
            total_loss = (train_config['sh_weight'] * recon_loss + #(train_config['gradient_weight'] * grad_loss) +
                         (kl_weight * kl_loss) +
                         (train_config['perceptual_weight'] * lpips_loss) +
                         (train_config['disc_weight'] * gen_loss))
            val_total_losses.append(total_loss.item())
    
    model.train()
    discriminator.train()

    
    # Return average losses
    return {
        'recon_loss_sh': np.mean(val_recon_losses_sh),
        'kl_loss': np.mean(val_kl_losses),
        #'gradient_loss_albedo': np.mean(val_gradient_losses_albedo),
        'perceptual_loss': np.mean(val_perceptual_losses),
        'gen_loss': np.mean(val_gen_losses) if val_gen_losses else 0,
        'disc_loss': np.mean(val_disc_losses) if val_disc_losses else 0,
        'total_loss': np.mean(val_total_losses),
        'real_prediction': np.mean(val_real_preds) if val_real_preds else 0,
        'fake_prediction': np.mean(val_fake_preds) if val_fake_preds else 0,
    }

def train(args):
    # Read the config file #
    with open(args.config_path, 'r') as file:
        try:
            config = yaml.safe_load(file)
        except yaml.YAMLError as exc:
            print(exc)
    #print(config)
    
    dataset_config = config['dataset_params_shading']
    autoencoder_config = config['autoencoder_params']
    train_config = config['train_params']
    #albedo_config = config['albedo_params']
    #ldr_input_config = config['dataset_params_input']

    # Set the desired seed value #
    seed = train_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)
    #############################
    
    # Create the model and dataset #
    model = VAE(latent_dim=8).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters in VAE: {total_params:,}")

    #model.apply(weights_init)

    discriminator = Discriminator(im_channels=dataset_config['im_channels']).to(device)
    #discriminator.apply(weights_init)

    sh_dataset = HDRGrayscaleEXRDataset_new(im_path=dataset_config['im_path'],
                               im_size=dataset_config['im_size'])
    
    
    # Create the combined dataset
    combined_dataset = CombinedDataset(sh_dataset)

    # Split dataset into train and validation (90:10 ratio)
    dataset_size = len(combined_dataset)
    train_size = int(0.95 * dataset_size)
    val_size = dataset_size - train_size
    

    indices = np.arange(len(combined_dataset))
    train_indices = indices[:train_size]
    val_indices = indices[train_size:]

    train_dataset = torch.utils.data.Subset(combined_dataset, train_indices)
    val_dataset = torch.utils.data.Subset(combined_dataset, val_indices)

    print(f"Total dataset size: {dataset_size}")
    print(f"Training set size: {train_size}")
    print(f"Validation set size: {val_size}")
    
    wandb.config.update({
        "learning_rate_autoencoder": train_config['autoencoder_lr'],
        "learning_rate_discriminator": train_config['discriminator_lr'],
        "batch_size": train_config['autoencoder_batch_size'],
        "gradient_weight": train_config['gradient_weight'],
        "sh_weight": train_config['sh_weight'],
        "kl_weight": train_config['kl_weight'],
        "perceptual_weight": train_config['perceptual_weight'],
        "disc_weight": train_config['disc_weight'],
        "disc_start": train_config['disc_start'],
        "autoencoder_acc_steps": train_config['autoencoder_acc_steps']
    })

    train_loader = DataLoader(train_dataset,
                             batch_size=wandb.config['batch_size'],
                             shuffle=True , num_workers=16 , pin_memory=False)
    
    val_loader = DataLoader(val_dataset,
                           batch_size=wandb.config['batch_size'],
                           shuffle=False , num_workers=16 , pin_memory=False)
    
    # Create output directories
    if not os.path.exists(train_config['task_name']):
        os.makedirs(train_config['task_name'], exist_ok=True)
        
    num_epochs = train_config['autoencoder_epochs']

    # L1/L2 loss for Reconstruction
    recon_criterion = torch.nn.MSELoss()
    # Disc Loss can even be BCEWithLogits
    disc_criterion = torch.nn.BCEWithLogitsLoss()


    optimizer_d = Adam(discriminator.parameters(), lr=wandb.config['learning_rate_discriminator'], betas=(0.5, 0.999))
    optimizer_g = Adam(model.parameters(), lr=wandb.config['learning_rate_autoencoder'], betas=(0.5, 0.999) )

    scaler = GradScaler('cuda') if device == 'cuda' else None
    # Setup schedulers
    scheduler_g = ReduceLROnPlateau(optimizer_g, mode='min', factor=0.9, patience=5, min_lr=0.00001)
    scheduler_d = ReduceLROnPlateau(optimizer_d, mode='min', factor=0.9, patience=5, min_lr=0.000001)

    disc_step_start = wandb.config['disc_start']
    step_count = 0
    start_epoch = 0
    best_val_loss = float('inf')
    
    # This is for accumulating gradients incase the images are huge
    # And one cant afford higher batch sizes
    acc_steps = wandb.config['autoencoder_acc_steps']
    image_save_steps = train_config['autoencoder_img_save_steps']
    img_save_count = 0
    
    # Lists to store epoch metrics
    train_losses_history = []
    val_losses_history = []
    
    # Check if checkpoint exists and load it for resuming training
    # checkpoint_path = os.path.join(train_config['task_name'], 'epoch_10_best_autoencoder_model_checkpoint.pth')
    # if os.path.exists(checkpoint_path):
    #     logging.info(f"Loading checkpoint from {checkpoint_path}")
    #     checkpoint = torch.load(checkpoint_path, weights_only=False)
    #     model.load_state_dict(checkpoint['model_state_dict'])
    #     discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
    #     optimizer_g.load_state_dict(checkpoint['optimizer_g_state_dict'])
    #     optimizer_d.load_state_dict(checkpoint['optimizer_d_state_dict'])
    #     start_epoch = checkpoint['epoch']
    #     step_count = checkpoint.get('step_count', 0)
    #     img_save_count = checkpoint.get('img_save_count', 0)
    #     best_val_loss = checkpoint['best_val_loss']
    #     train_losses_history = checkpoint.get('train_losses_history', [])
    #     val_losses_history = checkpoint.get('val_losses_history', [])
    #     logging.info(f"Resuming from epoch {start_epoch} with best validation loss: {best_val_loss}")
    #checkpoint_path="/home/project/ldr_image_to_ldr_shading/LDR_image_to_LDR_shading_hyperism/train_vae_mithlesh/ldr_to_sh_st_15/epoch_5_best_autoencoder_model_checkpoint.pth"    
    # Load checkpoint
    checkpoint_path = os.path.join(train_config['task_name'], 'epoch_95_best_autoencoder_model_checkpoint.pth')
    if os.path.exists(checkpoint_path):
        logging.info(f"Loading checkpoint from {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, weights_only=False)

        # Remove _orig_mod. prefix from model and discriminator state_dicts
        model_state_dict = strip_prefix_if_present(checkpoint['model_state_dict'], '_orig_mod.')
        discriminator_state_dict = strip_prefix_if_present(checkpoint['discriminator_state_dict'], '_orig_mod.')

        model.load_state_dict(model_state_dict)
        discriminator.load_state_dict(discriminator_state_dict)

        optimizer_g.load_state_dict(checkpoint['optimizer_g_state_dict'])
        optimizer_d.load_state_dict(checkpoint['optimizer_d_state_dict'])

        start_epoch = checkpoint['epoch']
        step_count = checkpoint.get('step_count', 0)
        img_save_count = checkpoint.get('img_save_count', 0)
        best_val_loss = checkpoint['best_val_loss']
        train_losses_history = checkpoint.get('train_losses_history', [])
        val_losses_history = checkpoint.get('val_losses_history', [])
        logging.info(f"Resuming from epoch {start_epoch} with best validation loss: {best_val_loss}")
    
    # Check if the model is already compiled
    if hasattr(torch, 'compile'):
       model = torch.compile(model)
       discriminator = torch.compile(discriminator)
    

    #logging.info(f"Learning rates updated: Generator -> {new_lr_g}, Discriminator -> {new_lr_d}")
    for epoch_idx in range(start_epoch, num_epochs):
        # Training metrics
        #recon_losses_shading = []
        recon_losses_sh = []
        kl_losses = []
        perceptual_losses = []
        disc_losses = []
        gen_losses = []
        train_real_preds = []
        train_fake_preds = []
        #grad_losses_shading = []
        # grad_losses_albedo = []
        losses = []
        
        optimizer_g.zero_grad()
        optimizer_d.zero_grad()

        # Training loop
        for batch in tqdm(train_loader):

            #start_time = time.perf_counter()
            step_count += 1
                        
            # Unpack the batch - each element is a batch of images from each dataset
            sh_im, scene_infos = batch
            
            #sh_im = sh_im.float().to(device)
            sh_im = sh_im.float().to(device)

            with autocast(device_type='cuda'):
                # Fetch autoencoders output(reconstructions)
                model_output = model(sh_im)
                output, h,_ = model_output
                mean, logvar = torch.chunk(h, 2, dim=1)
                if check_nan(output, "raw_model_output"):
                    print("NaN values detected in model output! Skipping this batch.")
                    continue
                            
                # Image Saving Logic
                if step_count % image_save_steps == 0 or step_count == 1:
                    img_save_count = save_training_samples(
                        output, sh_im, scene_infos, train_config, step_count, img_save_count
                    )

                ######### Optimize Generator ##########
                # L2 Loss for shading
                recon_loss = recon_criterion(output, sh_im)
                recon_loss = recon_loss / acc_steps 
                recon_losses_sh.append(wandb.config['sh_weight'] * recon_loss.item()) # add average loss for 1 image

                kl_weight = wandb.config['kl_weight']
                kl_loss = kl_divergence(mean, logvar)
                kl_loss = kl_loss / acc_steps
                kl_losses.append(kl_weight * kl_loss.item())

                # total_loss_generator
                g_loss = (wandb.config['sh_weight'] * recon_loss + 
                        #   (wandb.config['gradient_weight'] * grad_loss) + 
                          (kl_weight * kl_loss ))
                # Adversarial loss only if disc_step_start steps passed
                if step_count > disc_step_start:
                    disc_fake_pred = discriminator(model_output[0])
                    disc_fake_loss = disc_criterion(disc_fake_pred,
                                                    torch.ones(disc_fake_pred.shape,
                                                            device=disc_fake_pred.device))
                    disc_fake_loss = disc_fake_loss / acc_steps
                    gen_losses.append(wandb.config['disc_weight'] * disc_fake_loss.item())
                    g_loss += wandb.config['disc_weight'] * disc_fake_loss
                # LPIPS Loss
                
                output_rgb = to_rgb(output)
                sh_im_rgb = to_rgb(sh_im)
                # Calculate perceptual loss
                lpips_loss = (loss_fn_alex(output_rgb, sh_im_rgb).mean())  # Ensure lpips_loss is a scalar
                lpips_loss = lpips_loss / acc_steps
                
                perceptual_losses.append(wandb.config['perceptual_weight'] * lpips_loss.item())
                g_loss += wandb.config['perceptual_weight'] * lpips_loss
                losses.append(g_loss.item())
                #g_loss.backward()
                #####################################
            
            if scaler is not None:
                scaler.scale(g_loss).backward()
            else:
                g_loss.backward()

            ######### Optimize Discriminator #######
            if step_count > disc_step_start and step_count % 2 == 0:
                with autocast(device_type='cuda'):
                    fake = output
                    disc_fake_pred = discriminator(fake.detach())
                    disc_real_pred = discriminator(sh_im)
                    disc_fake_loss = disc_criterion(disc_fake_pred,
                                                    torch.zeros(disc_fake_pred.shape,
                                                                device=disc_fake_pred.device))
                    disc_real_loss = disc_criterion(disc_real_pred,
                                                    torch.ones(disc_real_pred.shape,
                                                            device=disc_real_pred.device))
                    disc_loss = wandb.config['disc_weight'] * (disc_fake_loss + disc_real_loss) / 2
                    disc_loss = disc_loss / acc_steps
                    disc_losses.append(disc_loss.item())
                    with torch.no_grad():
                        # Convert logits to probabilities using sigmoid
                        real_probs = torch.sigmoid(disc_real_pred).mean().item()
                        fake_probs = torch.sigmoid(disc_fake_pred).mean().item()
                        train_real_preds.append(real_probs)
                        train_fake_preds.append(fake_probs)
                    
                # Scale the discriminator loss and backward
                if scaler is not None:
                    scaler.scale(disc_loss).backward()
                else:
                    disc_loss.backward()


                if step_count % acc_steps == 0:
                    # Apply gradient clipping (see below)
                    if scaler is not None:
                        # Unscale before clipping
                        scaler.unscale_(optimizer_d)
                        # Here we'll add gradient clipping (code below)
                        torch.nn.utils.clip_grad_norm_(discriminator.parameters(), max_norm=1.0)
                        # Step with scaler
                        scaler.step(optimizer_d)
                        scaler.update()
                    else:
                        # Here we'll add gradient clipping (code below)
                        torch.nn.utils.clip_grad_norm_(discriminator.parameters(), max_norm=1.0)
                        optimizer_d.step()
                    optimizer_d.zero_grad()    
                    #####################################
            
            
            if step_count % acc_steps == 0:
                # Apply gradient clipping (see below)
                if scaler is not None:
                    # Unscale before clipping
                    scaler.unscale_(optimizer_g)
                    # Here we'll add gradient clipping (code below)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    # Step with scaler
                    scaler.step(optimizer_g)
                    scaler.update()
                else:
                    # Here we'll add gradient clipping (code below)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer_g.step()
                optimizer_g.zero_grad()
        
        if step_count > disc_step_start and step_count % 2 == 0:
             # Final optimizer steps at end of epoch
            optimizer_d.step()
            optimizer_d.zero_grad()
        
        optimizer_g.step()
        
        
        # Calculate validation metrics  
        val_metrics = validate(model, val_loader, discriminator, recon_criterion, disc_criterion, train_config,kl_weight,step_count,disc_step_start)
        

        # Store epoch metrics for plotting
        train_loss = np.mean(losses)
        val_loss = val_metrics['total_loss']
        train_losses_history.append(train_loss)
        val_losses_history.append(val_loss)
        

        epochs_since_disc_start = max(0, epoch_idx - (disc_step_start // len(train_loader)))

        if step_count <= disc_step_start:
            # Before discriminator starts - continue normal generator training
            scheduler_g.step(val_loss)
        elif epochs_since_disc_start <= 20:
            # Stabilization period - don't adjust any learning rates
            scheduler_d.step(val_metrics["disc_loss"])
        else:
            # After stabilization - resume normal scheduling
            scheduler_g.step(val_loss)
            scheduler_d.step(val_metrics["disc_loss"])

        # After validation and calculating epoch metrics
        wandb.log({
            "epoch": epoch_idx + 1,
            "train/recon_loss_sh": np.mean(recon_losses_sh),
            #"train/gradient_loss_albedo": np.mean(grad_losses_albedo),
            "train/kl_loss": np.mean(kl_losses),
            "train/perceptual_loss": np.mean(perceptual_losses),
            "train/gen_loss": np.mean(gen_losses) if len(gen_losses) > 0 else 0,
            "train/disc_loss": np.mean(disc_losses) if len(disc_losses) > 0 else 0,
            "train/total_loss": train_loss,
            "train/real_prediction": np.mean(train_real_preds) if len(train_real_preds) > 0 else 0,
            "train/fake_prediction": np.mean(train_fake_preds) if len(train_fake_preds) > 0 else 0,
            "val/recon_loss_sh": val_metrics["recon_loss_sh"],
            #"val/gradient_loss_albedo": val_metrics["gradient_loss_albedo"],
            "val/kl_loss": val_metrics["kl_loss"],
            "val/perceptual_loss": val_metrics["perceptual_loss"],
            "val/gen_loss": val_metrics["gen_loss"],
            "val/disc_loss": val_metrics["disc_loss"],
            "val/total_loss": val_metrics["total_loss"],
            "val/real_prediction": val_metrics["real_prediction"],
            "val/fake_prediction": val_metrics["fake_prediction"],
            "learning_rate/generator": optimizer_g.param_groups[0]['lr'],
            "learning_rate/discriminator": optimizer_d.param_groups[0]['lr']
        })

        # Print epoch results
        print('\n' + '=' * 80)
        print(f'Epoch {epoch_idx + 1}/{num_epochs}')
        print('-' * 80)
        print('TRAINING:')
        print(f'Recon Loss_sh: {np.mean(recon_losses_sh):.4f} | '
              #f'Gradient Loss_albedo: {np.mean(grad_losses_albedo):.4f} | '
              f'KL Loss: {np.mean(kl_losses):.4f} | '
              f'Perceptual Loss: {np.mean(perceptual_losses):.4f}')
        
        if len(disc_losses) > 0 and len(gen_losses) > 0:
            print(f'Generator Loss: {np.mean(gen_losses):.4f} | '
                  f'Discriminator Loss: {np.mean(disc_losses):.4f}')
            
        print(f'Total Training Loss: {train_loss:.4f}')
        
        print('\nVALIDATION:')
        print(f'Recon Loss_shading: {val_metrics["recon_loss_sh"]:.4f} | '
              #f'Gradient Loss_shading: {val_metrics["gradient_loss_albedo"]:.4f} | '
              f'KL Loss: {val_metrics["kl_loss"]:.4f} | '
              f'Perceptual Loss: {val_metrics["perceptual_loss"]:.4f}')
        
        print(f'Generator Loss: {val_metrics["gen_loss"]:.4f} | '
              f'Discriminator Loss: {val_metrics["disc_loss"]:.4f}')
            
        print(f'Total Validation Loss: {val_metrics["total_loss"]:.4f}')
        
        # Check if validation loss improved
        if val_loss < best_val_loss:
            print(f"\nValidation loss improved from {best_val_loss:.4f} to {val_loss:.4f}. Saving model...")
            best_val_loss = val_loss
            # # Save only the best model checkpoint
            # checkpoint = {
            #     'epoch': epoch_idx + 1,
            #     'model_state_dict': model.state_dict(),
            #     'discriminator_state_dict': discriminator.state_dict(),
            #     'optimizer_g_state_dict': optimizer_g.state_dict(),
            #     'optimizer_d_state_dict': optimizer_d.state_dict(),
            #     'best_val_loss': best_val_loss,
            #     'step_count': step_count,
            #     'img_save_count': img_save_count,
            #     'train_losses_history': train_losses_history,
            #     'val_losses_history': val_losses_history
            # }
            
            # torch.save(checkpoint, os.path.join(train_config['task_name'], 'best_autoencoder_model_checkpoint.pth'))
            
            # # Save individual model files for compatibility with original code
            # torch.save(model.state_dict(), os.path.join(train_config['task_name'],
            #                                            train_config['vae_autoencoder_ckpt_name']))
            # torch.save(discriminator.state_dict(), os.path.join(train_config['task_name'],
            #
            
        else:
            print(f"Validation loss did not improve from {best_val_loss:.4f}")

        if (epoch_idx + 1) % 5 == 0:
            checkpoint = {
                'epoch': epoch_idx + 1,
                'model_state_dict': model.state_dict(),
                'discriminator_state_dict': discriminator.state_dict(),
                'optimizer_g_state_dict': optimizer_g.state_dict(),
                'optimizer_d_state_dict': optimizer_d.state_dict(),
                'best_val_loss': best_val_loss,
                'step_count': step_count,
                'img_save_count': img_save_count,
                'train_losses_history': train_losses_history,
                'val_losses_history': val_losses_history
            }

            torch.save(checkpoint, os.path.join(train_config['task_name'], f'epoch_{epoch_idx + 1}_best_autoencoder_model_checkpoint.pth'))

            # Save individual model files for compatibility with original code
            torch.save(model.state_dict(), os.path.join(train_config['task_name'],
                                                       train_config['vae_autoencoder_ckpt_name']))
            torch.save(discriminator.state_dict(), os.path.join(train_config['task_name'],
                                                               train_config['vae_discriminator_ckpt_name']))  # Every 10 epochs
        if epoch_idx % 1 == 0:  # Every 5 epochs
            gc.collect()
            torch.cuda.empty_cache()    
        print('=' * 80 + '\n')
    
    print('Done Training...')
    
    # Save final training history
    np.savez(os.path.join(train_config['task_name'], 'training_history.npz'),
             train_losses=np.array(train_losses_history),
             val_losses=np.array(val_losses_history))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Arguments for vae training')
    parser.add_argument('--config', dest='config_path',
                        default='config/autoen_alb_1.yaml', type=str)

    # Handle Jupyter/IPython arguments
    import sys
    args, unknown = parser.parse_known_args(sys.argv[1:])
    
    train(args)