"""
Activation Functions Comparison Experiment - Extended Training Dynamics Analysis

Compares Linear, Sigmoid, ReLU, Leaky ReLU, and GELU activation functions
on a deep neural network (10 hidden layers) for 1D non-linear regression.

NEW FEATURES:
- Gradient measurements at epochs 1, 100, and 200
- Training dynamics visualizations showing how activations evolve
- Gradient flow evolution over training
"""
| |
|
| | import numpy as np |
| | import torch |
| | import torch.nn as nn |
| | import torch.optim as optim |
| | import matplotlib.pyplot as plt |
| | import json |
| | import os |
| | from datetime import datetime |
| |
|
| | |
# Seed both RNGs so the noise draw and the weight initialisation repeat
# exactly from run to run.
np.random.seed(42)
torch.manual_seed(42)

# Every JSON file and figure written by this script goes into this folder.
os.makedirs('activation_functions', exist_ok=True)

print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting Activation Functions - Training Dynamics Experiment")
print("=" * 70)

# ---------------------------------------------------------------------------
# Synthetic dataset: noisy samples of sin(x) on [-pi, pi]
# ---------------------------------------------------------------------------
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating synthetic dataset...")

x = np.linspace(-np.pi, np.pi, 200)
noise = np.random.normal(0, 0.1, 200)
y = np.sin(x) + noise

# Training tensors as float32 column vectors of shape (200, 1).
X_train = torch.from_numpy(x).to(torch.float32).unsqueeze(1)
Y_train = torch.from_numpy(y).to(torch.float32).unsqueeze(1)

# Denser, noise-free grid used for evaluation and for plotting the target.
x_eval = np.linspace(-np.pi, np.pi, 500)
X_eval = torch.from_numpy(x_eval).to(torch.float32).unsqueeze(1)
y_true = np.sin(x_eval)

print(f" Training samples: {X_train.shape[0]}")
print(f" Evaluation samples: {X_eval.shape[0]}")
| |
|
| | |
| | |
| | |
class DeepMLP(nn.Module):
    """
    Deep fully connected network used to compare activation functions.

    Layout: Linear(1 -> hidden_size), then ``num_hidden_layers`` layers of
    Linear(hidden_size -> hidden_size), then Linear(hidden_size -> 1). The
    same activation is applied after the input layer and after every hidden
    layer; the output layer has no activation.

    Args:
        activation_fn: Callable applied to each layer's output, or None to
            build a purely linear network.
        activation_name: Label stored on the instance as ``activation_name``.
        hidden_size: Width of the input projection and of every hidden layer.
            Defaults to 64.
        num_hidden_layers: Number of hidden Linear layers. Defaults to 10.
    """

    def __init__(self, activation_fn=None, activation_name="linear",
                 hidden_size=64, num_hidden_layers=10):
        super().__init__()
        self.activation_name = activation_name

        self.input_layer = nn.Linear(1, hidden_size)
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)
        ])
        self.output_layer = nn.Linear(hidden_size, 1)

        self.activation_fn = activation_fn

        # 'layer_<k>' (1-based) -> detached copy of the k-th hidden layer's
        # post-activation output from the latest storing forward pass.
        self.activations = {}

    def forward(self, x, store_activations=False):
        """
        Run the network on *x*.

        Args:
            x: Input tensor whose last dimension is 1.
            store_activations: When True, a detached clone of every hidden
                layer's post-activation output is written to
                ``self.activations`` under the key 'layer_<k>'.

        Returns:
            Output tensor whose last dimension is 1.
        """
        x = self.input_layer(x)
        if self.activation_fn is not None:
            x = self.activation_fn(x)

        for i, layer in enumerate(self.hidden_layers):
            x = layer(x)
            if self.activation_fn is not None:
                x = self.activation_fn(x)

            if store_activations:
                # Detach + clone so the snapshot neither keeps the autograd
                # graph alive nor aliases a tensor that is modified later.
                self.activations[f'layer_{i+1}'] = x.detach().clone()

        return self.output_layer(x)

    def get_gradient_magnitudes(self):
        """
        Return the mean absolute weight gradient of each hidden layer.

        Layers whose weight has no gradient yet (no backward pass has run)
        contribute 0.0, so the list always has one entry per hidden layer.
        """
        return [
            layer.weight.grad.abs().mean().item()
            if layer.weight.grad is not None else 0.0
            for layer in self.hidden_layers
        ]

    def get_weight_stats(self):
        """
        Return one dict per hidden layer with the 'mean', 'std', 'min' and
        'max' of that layer's weight matrix, as Python floats.
        """
        stats = []
        for layer in self.hidden_layers:
            w = layer.weight.detach()
            stats.append({
                'mean': w.mean().item(),
                'std': w.std().item(),
                'min': w.min().item(),
                'max': w.max().item(),
            })
        return stats
| |
|
| |
|
def create_model(activation_type):
    """
    Build a DeepMLP whose activation is selected by *activation_type*.

    Accepted values: "linear" (no activation), "sigmoid", "relu",
    "leaky_relu" (negative slope 0.01) and "gelu".

    Raises:
        ValueError: If *activation_type* is none of the names above.
    """
    # Each entry pairs a name with a zero-argument factory, so activation
    # modules are only instantiated for the variant actually requested.
    builders = (
        ("linear", lambda: None),
        ("sigmoid", lambda: torch.sigmoid),
        ("relu", lambda: torch.relu),
        ("leaky_relu", lambda: nn.LeakyReLU(0.01)),
        ("gelu", lambda: nn.GELU()),
    )
    for name, make_activation in builders:
        if activation_type == name:
            return DeepMLP(activation_fn=make_activation(), activation_name=name)
    raise ValueError(f"Unknown activation type: {activation_type}")
| |
|
| |
|
| | |
| | |
| | |
def train_model(model, X_train, Y_train, X_eval, epochs=500, lr=0.001):
    """
    Train *model* with full-batch Adam on an MSE loss and record diagnostics.

    Epoch numbers are the 0-based loop index, so "epoch 0" is the first
    update and ``epochs - 1`` is the last one.

    Args:
        model: Module exposing ``forward(x, store_activations=...)``, an
            ``activations`` dict, ``get_gradient_magnitudes()`` and
            ``get_weight_stats()``.
        X_train, Y_train: Training inputs and targets.
        X_eval: Inputs on which prediction snapshots are taken.
        epochs: Number of full-batch updates.
        lr: Adam learning rate.

    Returns:
        Tuple ``(loss_history, gradient_history, activation_history,
        weight_history, prediction_history)``:
        - loss_history: training loss of every epoch, measured before that
          epoch's optimizer step.
        - gradient_history: epoch -> per-layer gradient magnitudes, captured
          at epochs 1, 100 and 200.
        - activation_history: epoch -> {layer key: numpy array} at a fixed
          schedule of epochs plus the final epoch.
        - weight_history: epoch -> weight statistics, every 50 epochs.
        - prediction_history: epoch -> flattened numpy predictions on X_eval
          at a fixed schedule of epochs plus the final epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    loss_history = []
    gradient_history = {}
    activation_history = {}
    weight_history = {}
    prediction_history = {}

    # The last epoch follows the `epochs` argument instead of assuming 500.
    # 499 stays in the schedules so longer runs still produce that key for
    # code that looks it up explicitly.
    final_epoch = epochs - 1
    gradient_epochs = {1, 100, 200}
    activation_epochs = {0, 50, 100, 150, 200, 300, 400, 499, final_epoch}
    prediction_epochs = {0, 50, 100, 200, 300, 499, final_epoch}

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        store_acts = epoch in activation_epochs
        predictions = model(X_train, store_activations=store_acts)
        loss = criterion(predictions, Y_train)
        loss.backward()

        # Gradients must be read after backward() and before step().
        if epoch in gradient_epochs:
            grads = model.get_gradient_magnitudes()
            gradient_history[epoch] = grads
            if grads:
                # Report first and last layer without assuming 10 layers.
                print(f" [Gradient Capture] Epoch {epoch}: Layer 1={grads[0]:.2e}, Layer {len(grads)}={grads[-1]:.2e}")

        optimizer.step()

        loss_value = loss.item()
        loss_history.append(loss_value)

        if store_acts:
            activation_history[epoch] = {
                k: v.numpy().copy() for k, v in model.activations.items()
            }

        if epoch % 50 == 0:
            weight_history[epoch] = model.get_weight_stats()

        if epoch in prediction_epochs:
            # Snapshot taken after this epoch's update, in eval mode and
            # without building an autograd graph.
            model.eval()
            with torch.no_grad():
                pred = model(X_eval)
            prediction_history[epoch] = pred.numpy().flatten()
            model.train()

        if epoch % 100 == 0 or epoch == final_epoch:
            print(f" Epoch {epoch:4d}/{epochs}: Loss = {loss_value:.6f}")

    return loss_history, gradient_history, activation_history, weight_history, prediction_history
| |
|
| |
|
| | |
| | |
| | |
# Display label for every activation variant. The insertion order of this
# dict is also the order in which models are trained and plotted.
activation_labels = {
    "linear": "Linear (None)",
    "sigmoid": "Sigmoid",
    "relu": "ReLU",
    "leaky_relu": "Leaky ReLU",
    "gelu": "GELU"
}
activation_types = list(activation_labels)

# act_type -> everything recorded for that variant (model, histories, finals).
results = {}

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training models with extended metrics...")
print("=" * 70)

for act_type in activation_types:
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training {activation_labels[act_type]} model...")

    model = create_model(act_type)
    histories = train_model(model, X_train, Y_train, X_eval, epochs=500, lr=0.001)
    loss_history, grad_history, act_history, weight_history, pred_history = histories

    # One extra storing forward pass on the evaluation grid gives the final
    # predictions and the final hidden activations in a single call.
    model.eval()
    with torch.no_grad():
        final_predictions = model(X_eval, store_activations=True)

    final_activations = {}
    for layer_key, tensor in model.activations.items():
        final_activations[layer_key] = tensor.numpy().copy()

    results[act_type] = dict(
        model=model,
        loss_history=loss_history,
        gradient_history=grad_history,
        activation_history=act_history,
        weight_history=weight_history,
        prediction_history=pred_history,
        final_predictions=final_predictions.numpy().flatten(),
        final_activations=final_activations,
        final_loss=loss_history[-1],
    )

    print(f" Final MSE Loss: {loss_history[-1]:.6f}")

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All models trained!")
| |
|
| | |
| | |
| | |
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Saving extended data...")


def _write_json(payload, path):
    """Write *payload* to *path* as JSON indented by two spaces."""
    with open(path, 'w') as handle:
        json.dump(payload, handle, indent=2)


# Gradient magnitudes per activation; epoch keys become strings because JSON
# object keys must be strings.
gradient_data = {
    act_type: {
        str(epoch): grads
        for epoch, grads in results[act_type]["gradient_history"].items()
    }
    for act_type in activation_types
}
_write_json(gradient_data, 'activation_functions/gradient_magnitudes_epochs.json')

# Full per-epoch loss curve of every activation.
loss_data = {}
for act_type in activation_types:
    loss_data[act_type] = results[act_type]["loss_history"]
_write_json(loss_data, 'activation_functions/loss_histories.json')

# Last recorded training loss of every activation.
final_losses = {}
for act_type in activation_types:
    final_losses[act_type] = results[act_type]["final_loss"]
_write_json(final_losses, 'activation_functions/final_losses.json')

print(" Saved: gradient_magnitudes_epochs.json, loss_histories.json, final_losses.json")
| |
|
| | |
| | |
| | |
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating visualizations...")

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in Matplotlib 3.6;
# earlier releases only know the un-prefixed name. Use whichever one this
# installation provides instead of failing on the missing name.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Styling is cosmetic, so keep Matplotlib's defaults rather than abort.
    print(" Note: seaborn whitegrid style not available; using default style")

# One fixed colour per activation, shared by every figure below.
colors = {
    "linear": "#1f77b4",
    "sigmoid": "#ff7f0e",
    "relu": "#2ca02c",
    "leaky_relu": "#d62728",
    "gelu": "#9467bd"
}
| |
|
| | |
# Figure: final fit of every activation against the target and the data.
print(" Creating learned_functions.png...")
fig, ax = plt.subplots(figsize=(12, 8))

# Noise-free target drawn on top (highest zorder), noisy samples beneath it.
ax.plot(x_eval, y_true, 'k-', linewidth=2.5, label='Ground Truth (sin(x))', zorder=10)
ax.scatter(x, y, c='gray', alpha=0.5, s=30, label='Noisy Data', zorder=5)

# One curve per activation; the legend entry carries its final loss.
for act_type in activation_types:
    run = results[act_type]
    curve_label = f'{activation_labels[act_type]} (MSE: {run["final_loss"]:.4f})'
    ax.plot(x_eval, run["final_predictions"],
            color=colors[act_type], linewidth=2,
            label=curve_label,
            alpha=0.8)

ax.set_title('Learned Functions: Comparison of Activation Functions\n(10 Hidden Layers, 64 Neurons Each, 500 Epochs)', fontsize=14)
ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_xlim(-np.pi, np.pi)
ax.set_ylim(-1.5, 1.5)
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('activation_functions/learned_functions.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: training loss per epoch for every activation, on a log y-axis.
print(" Creating loss_curves.png...")
fig, ax = plt.subplots(figsize=(12, 8))

for act_type in activation_types:
    losses = results[act_type]["loss_history"]
    ax.plot(losses,
            color=colors[act_type], linewidth=2,
            label=f'{activation_labels[act_type]}')

ax.set_title('Training Loss Curves: Comparison of Activation Functions', fontsize=14)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('MSE Loss', fontsize=12)
ax.set_yscale('log')
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('activation_functions/loss_curves.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: grouped bars of per-layer gradient magnitude, one panel per
# captured epoch.
print(" Creating gradient_flow_epochs.png...")
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

gradient_epochs = [1, 100, 200]
layer_indices = list(range(1, 11))

bar_width = 0.15
x_positions = np.arange(len(layer_indices))
tick_names = [f'L{layer_no}' for layer_no in layer_indices]

for idx, (ax, epoch) in enumerate(zip(axes, gradient_epochs)):
    # Only the first panel carries legend labels; the others reuse them.
    labelled = idx == 0

    for i, act_type in enumerate(activation_types):
        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)
        # Five bars per layer, centred on the tick (offsets -2..+2 widths).
        ax.bar(x_positions + (i - 2) * bar_width, grad_mags, bar_width,
               label=activation_labels[act_type] if labelled else "",
               color=colors[act_type], alpha=0.8)

    ax.set_title(f'Epoch {epoch}', fontsize=13, fontweight='bold')
    ax.set_xlabel('Hidden Layer', fontsize=11)
    ax.set_ylabel('Avg Gradient Magnitude', fontsize=11)
    ax.set_xticks(x_positions)
    ax.set_xticklabels(tick_names, fontsize=9)
    ax.set_yscale('log')
    ax.grid(True, alpha=0.3, axis='y')
    # Same y-range on every panel so bar heights compare across epochs.
    ax.set_ylim(1e-12, 1e0)

axes[0].legend(loc='upper right', fontsize=9)

fig.suptitle('Gradient Flow Analysis Across Training\n(Gradient Magnitude per Layer at Epochs 1, 100, 200)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/gradient_flow_epochs.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: per-layer gradient magnitude at the epoch-1 capture only.
print(" Creating gradient_flow.png...")
fig, ax = plt.subplots(figsize=(12, 8))

bar_width = 0.15
x_positions = np.arange(len(layer_indices))

for i, act_type in enumerate(activation_types):
    captured = results[act_type]["gradient_history"]
    grad_mags = captured.get(1, [0]*10)
    # Five bars per layer, centred on the tick (offsets -2..+2 widths).
    ax.bar(x_positions + (i - 2) * bar_width, grad_mags, bar_width,
           label=activation_labels[act_type], color=colors[act_type], alpha=0.8)

ax.set_title('Gradient Flow Analysis: Average Gradient Magnitude per Layer\n(Measured at Epoch 1)', fontsize=14)
ax.set_xlabel('Hidden Layer', fontsize=12)
ax.set_ylabel('Average Gradient Magnitude', fontsize=12)
ax.set_xticks(x_positions)
ax.set_xticklabels([f'Layer {layer_no}' for layer_no in layer_indices])
ax.set_yscale('log')
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('activation_functions/gradient_flow.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: histogram of final hidden activations for three layers (rows) and
# every activation function (columns).
print(" Creating hidden_activations.png...")
fig, axes = plt.subplots(3, 5, figsize=(18, 12))

layers_to_plot = ['layer_1', 'layer_5', 'layer_10']
layer_titles = ['Layer 1 (First)', 'Layer 5 (Middle)', 'Layer 10 (Last)']

for row, (layer_key, layer_title) in enumerate(zip(layers_to_plot, layer_titles)):
    for col, act_type in enumerate(activation_types):
        ax = axes[row, col]
        ax.set_title(f'{activation_labels[act_type]}\n{layer_title}', fontsize=10)

        activations = results[act_type]["final_activations"].get(layer_key, None)

        # Keep only finite values: a histogram cannot be binned over NaN/inf,
        # which a diverged network would produce. This mirrors the filtering
        # done for the activation-evolution figure.
        finite_vals = None
        if activations is not None:
            flat = activations.flatten()
            finite_vals = flat[np.isfinite(flat)]

        if finite_vals is None or len(finite_vals) == 0:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', transform=ax.transAxes)
            continue

        ax.hist(finite_vals, bins=50, color=colors[act_type],
                alpha=0.7, edgecolor='black', linewidth=0.5)

        # Mean marked with a dashed red line; mean and std shown in a box.
        mean_val = finite_vals.mean()
        std_val = finite_vals.std()
        ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)

        ax.set_xlabel('Activation Value', fontsize=8)
        ax.set_ylabel('Frequency', fontsize=8)

        textstr = f'μ={mean_val:.3f}\nσ={std_val:.3f}'
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(0.95, 0.95, textstr, transform=ax.transAxes, fontsize=8,
                verticalalignment='top', horizontalalignment='right', bbox=props)

fig.suptitle('Hidden Layer Activation Distributions (After Training)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/hidden_activations.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: one panel per activation showing its prediction at several epochs.
print(" Creating training_dynamics_functions.png...")
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# Snapshot epochs, coloured along the viridis map from first to last.
prediction_epochs = [0, 50, 100, 200, 300, 499]
epoch_colors = plt.cm.viridis(np.linspace(0, 1, len(prediction_epochs)))

for ax, act_type in zip(axes, activation_types):
    snapshots = results[act_type]["prediction_history"]

    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)

    for shade, epoch in zip(epoch_colors, prediction_epochs):
        if epoch not in snapshots:
            continue
        ax.plot(x_eval, snapshots[epoch], color=shade, linewidth=1.5,
                label=f'Epoch {epoch}', alpha=0.8)

    ax.set_title(f'{activation_labels[act_type]}', fontsize=12, fontweight='bold')
    ax.set_xlabel('x', fontsize=10)
    ax.set_ylabel('y', fontsize=10)
    ax.set_xlim(-np.pi, np.pi)
    ax.set_ylim(-2, 2)
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right', fontsize=7)

# Five activations on a 2x3 grid: blank out the unused sixth panel.
axes[5].axis('off')

fig.suptitle('Training Dynamics: How Each Activation Learns the Function Over Time', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/training_dynamics_functions.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: (left) layer-10 / layer-1 gradient ratio per captured epoch,
# (right) layer-1 gradient magnitude over the captured epochs.
print(" Creating gradient_evolution.png...")
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax1, ax2 = axes[0], axes[1]

gradient_epochs = [1, 100, 200]
x_pos = np.arange(len(gradient_epochs))
bar_width = 0.15

# --- Left panel: grouped bars of the gradient ratio ---
for i, act_type in enumerate(activation_types):
    captured = results[act_type]["gradient_history"]
    ratios = []
    for epoch in gradient_epochs:
        grads = captured.get(epoch, [1e-10]*10)
        # A (near-)zero first-layer gradient would blow up the division, so
        # it is drawn as a large sentinel value instead.
        ratios.append(grads[9] / grads[0] if grads[0] > 1e-15 else 1e10)

    ax1.bar(x_pos + (i - 2) * bar_width, ratios, bar_width, label=activation_labels[act_type],
            color=colors[act_type], alpha=0.8)

ax1.set_title('Gradient Ratio Evolution\n(Higher = More Vanishing)', fontsize=13)
ax1.set_xlabel('Epoch', fontsize=12)
ax1.set_ylabel('Gradient Ratio (Layer 10 / Layer 1)', fontsize=12)
ax1.set_xticks(x_pos)
ax1.set_xticklabels([f'Epoch {e}' for e in gradient_epochs])
ax1.set_yscale('log')
ax1.axhline(y=1, color='black', linestyle='--', linewidth=1, label='Ideal (ratio=1)')
ax1.legend(loc='upper left', fontsize=9)
ax1.grid(True, alpha=0.3, axis='y')

# --- Right panel: first hidden layer's gradient magnitude ---
for act_type in activation_types:
    captured = results[act_type]["gradient_history"]
    layer1_grads = [captured.get(epoch, [0]*10)[0] for epoch in gradient_epochs]

    ax2.plot(gradient_epochs, layer1_grads, 'o-', color=colors[act_type],
             linewidth=2, markersize=8, label=activation_labels[act_type])

ax2.set_title('First Layer Gradient Over Training\n(Key Indicator of Learning)', fontsize=13)
ax2.set_xlabel('Epoch', fontsize=12)
ax2.set_ylabel('Layer 1 Gradient Magnitude', fontsize=12)
ax2.set_yscale('log')
ax2.legend(loc='upper right', fontsize=9)
ax2.grid(True, alpha=0.3)

fig.suptitle('Activation Effect on Gradient Dynamics During Training', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/gradient_evolution.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: layer-5 activation histograms, one row per activation function and
# one column per snapshot epoch.
print(" Creating activation_evolution.png...")
fig, axes = plt.subplots(5, 4, figsize=(16, 18))

epochs_to_show = [0, 100, 200, 499]

for row, act_type in enumerate(activation_types):
    snapshots = results[act_type]["activation_history"]

    for col, epoch in enumerate(epochs_to_show):
        ax = axes[row, col]

        layer5 = snapshots[epoch].get('layer_5', None) if epoch in snapshots else None

        if layer5 is not None:
            # Drop NaN/inf so a diverged run still yields a drawable histogram.
            flat = layer5.flatten()
            acts_clean = flat[np.isfinite(flat)]

            if len(acts_clean) > 0:
                ax.hist(acts_clean, bins=50, color=colors[act_type],
                        alpha=0.7, edgecolor='black', linewidth=0.5)

                mean_val = np.nanmean(acts_clean)
                std_val = np.nanstd(acts_clean)

                ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)

                stats_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
                ax.text(0.95, 0.95, f'μ={mean_val:.3f}\nσ={std_val:.3f}',
                        transform=ax.transAxes, fontsize=8,
                        verticalalignment='top', horizontalalignment='right', bbox=stats_box)

        # Column headers on the top row, row labels on the left column.
        if row == 0:
            ax.set_title(f'Epoch {epoch}', fontsize=11, fontweight='bold')
        if col == 0:
            ax.set_ylabel(f'{activation_labels[act_type]}', fontsize=10)

fig.suptitle('Activation Distribution Evolution (Layer 5 - Middle Layer)\nHow Activations Change During Training', fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig('activation_functions/activation_evolution.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
| | |
# Figure: 3x3 summary. Row 0 = loss / gradient ratio / final fits,
# row 1 = gradient flow at each captured epoch, row 2 = prediction snapshots.
print(" Creating training_dynamics_summary.png...")
fig = plt.figure(figsize=(20, 16))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# --- A: training loss curves ---
ax1 = fig.add_subplot(gs[0, 0])
for act_type in activation_types:
    ax1.plot(results[act_type]["loss_history"],
             color=colors[act_type], linewidth=2, label=activation_labels[act_type])
ax1.set_title('A. Training Loss Curves', fontsize=12, fontweight='bold')
ax1.set_xlabel('Epoch', fontsize=11)
ax1.set_ylabel('MSE Loss', fontsize=11)
ax1.set_yscale('log')
ax1.legend(loc='upper right', fontsize=8)
ax1.grid(True, alpha=0.3)

# --- B: layer-10 / layer-1 gradient ratio at the captured epochs ---
ax2 = fig.add_subplot(gs[0, 1])
ratio_epochs = [1, 100, 200]
for act_type in activation_types:
    captured = results[act_type]["gradient_history"]
    ratios = []
    for epoch in ratio_epochs:
        grads = captured.get(epoch, [1e-10]*10)
        # Near-zero first-layer gradient: plot a large sentinel instead of
        # dividing by it.
        ratios.append(grads[9] / grads[0] if grads[0] > 1e-15 else 1e10)
    ax2.plot(ratio_epochs, ratios, 'o-', color=colors[act_type],
             linewidth=2, markersize=8, label=activation_labels[act_type])
ax2.set_title('B. Gradient Ratio Over Training', fontsize=12, fontweight='bold')
ax2.set_xlabel('Epoch', fontsize=11)
ax2.set_ylabel('Gradient Ratio (L10/L1)', fontsize=11)
ax2.set_yscale('log')
ax2.axhline(y=1, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax2.legend(loc='upper left', fontsize=8)
ax2.grid(True, alpha=0.3)

# --- C: final learned functions ---
ax3 = fig.add_subplot(gs[0, 2])
ax3.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)
for act_type in activation_types:
    ax3.plot(x_eval, results[act_type]["final_predictions"],
             color=colors[act_type], linewidth=1.5, label=activation_labels[act_type], alpha=0.8)
ax3.set_title('C. Final Learned Functions', fontsize=12, fontweight='bold')
ax3.set_xlabel('x', fontsize=11)
ax3.set_ylabel('y', fontsize=11)
ax3.legend(loc='upper right', fontsize=8)
ax3.grid(True, alpha=0.3)

# --- D1-D3: per-layer gradient magnitude at each captured epoch ---
for idx, epoch in enumerate([1, 100, 200]):
    ax = fig.add_subplot(gs[1, idx])
    bar_width = 0.15
    x_positions = np.arange(10)

    for i, act_type in enumerate(activation_types):
        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)
        ax.bar(x_positions + (i - 2) * bar_width, grad_mags, bar_width,
               color=colors[act_type], alpha=0.8)

    ax.set_title(f'D{idx+1}. Gradient Flow - Epoch {epoch}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Layer', fontsize=10)
    ax.set_ylabel('Gradient Magnitude', fontsize=10)
    ax.set_xticks(x_positions)
    ax.set_xticklabels([str(layer_no) for layer_no in range(1, 11)], fontsize=8)
    ax.set_yscale('log')
    ax.set_ylim(1e-12, 1e0)
    ax.grid(True, alpha=0.3, axis='y')

# --- E1-E3: predictions of every activation at three snapshot epochs ---
for idx, epoch in enumerate([50, 200, 499]):
    ax = fig.add_subplot(gs[2, idx])
    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)

    for act_type in activation_types:
        snapshots = results[act_type]["prediction_history"]
        if epoch not in snapshots:
            continue
        ax.plot(x_eval, snapshots[epoch], color=colors[act_type], linewidth=1.5,
                label=activation_labels[act_type], alpha=0.8)

    ax.set_title(f'E{idx+1}. Predictions at Epoch {epoch}', fontsize=12, fontweight='bold')
    ax.set_xlabel('x', fontsize=10)
    ax.set_ylabel('y', fontsize=10)
    ax.set_xlim(-np.pi, np.pi)
    ax.set_ylim(-2, 2)
    ax.grid(True, alpha=0.3)
    # A single legend on the last panel serves the whole row.
    if idx == 2:
        ax.legend(loc='upper right', fontsize=7)

fig.suptitle('Comprehensive Training Dynamics Analysis: Activation Functions in Deep Networks\n(10 Layers × 64 Neurons, 500 Epochs, Adam Optimizer)', fontsize=16, y=1.01)
plt.savefig('activation_functions/training_dynamics_summary.png', dpi=150, bbox_inches='tight')
plt.close()
| |
|
# Console listing of every figure written above, in creation-report order.
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All visualizations saved!")
saved_figures = (
    "learned_functions.png",
    "loss_curves.png",
    "gradient_flow.png",
    "gradient_flow_epochs.png (NEW)",
    "hidden_activations.png",
    "training_dynamics_functions.png (NEW)",
    "gradient_evolution.png (NEW)",
    "activation_evolution.png (NEW)",
    "training_dynamics_summary.png (NEW)",
)
for figure_entry in saved_figures:
    print(f" - {figure_entry}")
| |
|
| |
|
| | |
| | |
| | |
# Console report: gradient magnitudes at the captured epochs, then the final
# losses ranked from best to worst.
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Summary Statistics")
print("=" * 70)

table_rule = "-" * 70
print("\n### Gradient Magnitudes at Key Epochs ###")
print(table_rule)
print(f"{'Activation':<15} {'Epoch':<8} {'Layer 1':<12} {'Layer 5':<12} {'Layer 10':<12} {'Ratio (L10/L1)':<15}")
print(table_rule)

for act_type in activation_types:
    captured = results[act_type]["gradient_history"]
    for epoch in [1, 100, 200]:
        grads = captured.get(epoch, [0]*10)
        # A (near-)zero first-layer gradient is reported as an infinite ratio.
        ratio = grads[9] / grads[0] if grads[0] > 1e-15 else float('inf')
        print(f"{activation_labels[act_type]:<15} {epoch:<8} {grads[0]:<12.2e} {grads[4]:<12.2e} {grads[9]:<12.2e} {ratio:<15.2e}")

print("\n### Final MSE Losses ###")
print("-" * 40)
sorted_losses = sorted(final_losses.items(), key=lambda item: item[1])
for act_type, final_loss in sorted_losses:
    print(f"{activation_labels[act_type]:<20}: {final_loss:.6f}")

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Experiment complete!")
print("=" * 70)
| |
|