Spaces:
Running
Running
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Complete Deep Learning & Computer Vision Curriculum</title> | |
| <!-- Shared Design System --> | |
| <link rel="stylesheet" href="../shared/css/design-system.css"> | |
| <link rel="stylesheet" href="../shared/css/components.css"> | |
| <style> | |
| * { | |
| margin: 0; | |
| padding: 0; | |
| box-sizing: border-box; | |
| } | |
| :root { | |
| --bg: #0f1419; | |
| --surface: #1a1f2e; | |
| --text: #e4e6eb; | |
| --text-dim: #b0b7c3; | |
| --cyan: #00d4ff; | |
| --orange: #ff6b35; | |
| --green: #00ff88; | |
| --yellow: #ffa500; | |
| } | |
| body { | |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; | |
| background: var(--bg); | |
| color: var(--text); | |
| line-height: 1.6; | |
| overflow-x: hidden; | |
| } | |
| .container { | |
| max-width: 1400px; | |
| margin: 0 auto; | |
| padding: 20px; | |
| } | |
| header { | |
| text-align: center; | |
| margin-bottom: 40px; | |
| padding: 30px 0; | |
| border-bottom: 2px solid var(--cyan); | |
| } | |
| h1 { | |
| font-size: 2.5em; | |
| background: linear-gradient(135deg, var(--cyan), var(--orange)); | |
| background-clip: text; | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| margin-bottom: 10px; | |
| } | |
| .subtitle { | |
| color: var(--text-dim); | |
| font-size: 1.1em; | |
| } | |
| .dashboard { | |
| display: none; | |
| } | |
| .dashboard.active { | |
| display: block; | |
| } | |
| .grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); | |
| gap: 25px; | |
| margin: 40px 0; | |
| } | |
| .card { | |
| background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(255, 107, 53, 0.1)); | |
| border: 2px solid var(--cyan); | |
| border-radius: 12px; | |
| padding: 30px; | |
| cursor: pointer; | |
| transition: all 0.3s ease; | |
| text-align: center; | |
| } | |
| .card:hover { | |
| transform: translateY(-5px); | |
| box-shadow: 0 10px 30px rgba(0, 212, 255, 0.2); | |
| border-color: var(--orange); | |
| } | |
| .card-icon { | |
| font-size: 3em; | |
| margin-bottom: 15px; | |
| } | |
| .card h3 { | |
| color: var(--cyan); | |
| font-size: 1.5em; | |
| margin-bottom: 10px; | |
| } | |
| .card p { | |
| color: var(--text-dim); | |
| font-size: 0.95em; | |
| } | |
| .category-label { | |
| display: inline-block; | |
| margin-top: 10px; | |
| padding: 5px 12px; | |
| background: rgba(0, 212, 255, 0.2); | |
| border-radius: 20px; | |
| font-size: 0.85em; | |
| color: var(--green); | |
| } | |
| .module { | |
| display: none; | |
| } | |
| .module.active { | |
| display: block; | |
| animation: fadeIn 0.3s ease; | |
| } | |
| @keyframes fadeIn { | |
| from { | |
| opacity: 0; | |
| } | |
| to { | |
| opacity: 1; | |
| } | |
| } | |
| .btn-back { | |
| padding: 10px 20px; | |
| background: var(--orange); | |
| color: var(--bg); | |
| border: none; | |
| border-radius: 6px; | |
| cursor: pointer; | |
| font-weight: 600; | |
| margin-bottom: 25px; | |
| transition: all 0.3s ease; | |
| } | |
| .btn-back:hover { | |
| background: var(--cyan); | |
| } | |
| .tabs { | |
| display: flex; | |
| gap: 10px; | |
| margin-bottom: 30px; | |
| flex-wrap: wrap; | |
| justify-content: center; | |
| border-bottom: 1px solid rgba(0, 212, 255, 0.2); | |
| padding-bottom: 15px; | |
| overflow-x: auto; | |
| } | |
| .tab-btn { | |
| padding: 10px 20px; | |
| background: var(--surface); | |
| color: var(--text); | |
| border: 2px solid transparent; | |
| border-radius: 6px; | |
| cursor: pointer; | |
| font-size: 0.95em; | |
| transition: all 0.3s ease; | |
| font-weight: 500; | |
| white-space: nowrap; | |
| } | |
| .tab-btn:hover { | |
| background: rgba(0, 212, 255, 0.1); | |
| border-color: var(--cyan); | |
| } | |
| .tab-btn.active { | |
| background: var(--cyan); | |
| color: var(--bg); | |
| border-color: var(--cyan); | |
| } | |
| .tab { | |
| display: none; | |
| } | |
| .tab.active { | |
| display: block; | |
| animation: fadeIn 0.3s ease; | |
| } | |
| .section { | |
| background: var(--surface); | |
| border: 1px solid rgba(0, 212, 255, 0.2); | |
| border-radius: 10px; | |
| padding: 30px; | |
| margin-bottom: 25px; | |
| transition: all 0.3s ease; | |
| } | |
| .section:hover { | |
| border-color: var(--cyan); | |
| box-shadow: 0 0 20px rgba(0, 212, 255, 0.1); | |
| } | |
| h2 { | |
| color: var(--cyan); | |
| font-size: 1.8em; | |
| margin-bottom: 15px; | |
| } | |
| h3 { | |
| color: var(--orange); | |
| font-size: 1.3em; | |
| margin-top: 20px; | |
| margin-bottom: 12px; | |
| } | |
| h4 { | |
| color: var(--green); | |
| font-size: 1.1em; | |
| margin-top: 15px; | |
| margin-bottom: 10px; | |
| } | |
| p { | |
| margin-bottom: 15px; | |
| line-height: 1.8; | |
| } | |
| ul { | |
| margin-left: 20px; | |
| margin-bottom: 15px; | |
| } | |
| ul li { | |
| margin-bottom: 8px; | |
| } | |
| .info-box { | |
| background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(255, 107, 53, 0.1)); | |
| border: 1px solid var(--cyan); | |
| border-radius: 8px; | |
| padding: 20px; | |
| margin: 20px 0; | |
| } | |
| .box-title { | |
| color: var(--orange); | |
| font-weight: 700; | |
| margin-bottom: 10px; | |
| font-size: 1.1em; | |
| } | |
| .box-content { | |
| color: var(--text-dim); | |
| line-height: 1.7; | |
| } | |
| .formula { | |
| background: rgba(0, 212, 255, 0.1); | |
| border: 1px solid var(--cyan); | |
| border-radius: 8px; | |
| padding: 20px; | |
| margin: 20px 0; | |
| font-family: 'Courier New', monospace; | |
| overflow-x: auto; | |
| line-height: 1.8; | |
| color: var(--cyan); | |
| } | |
| .callout { | |
| border-left: 4px solid; | |
| padding: 15px; | |
| margin: 20px 0; | |
| border-radius: 6px; | |
| } | |
| .callout.tip { | |
| border-left-color: var(--green); | |
| background: rgba(0, 255, 136, 0.05); | |
| } | |
| .callout.warning { | |
| border-left-color: var(--yellow); | |
| background: rgba(255, 165, 0, 0.05); | |
| } | |
| .callout.insight { | |
| border-left-color: var(--cyan); | |
| background: rgba(0, 212, 255, 0.05); | |
| } | |
| .callout-title { | |
| font-weight: 700; | |
| margin-bottom: 8px; | |
| } | |
| .list-item { | |
| display: flex; | |
| gap: 12px; | |
| margin: 12px 0; | |
| padding: 12px; | |
| background: rgba(0, 212, 255, 0.05); | |
| border-left: 3px solid var(--cyan); | |
| border-radius: 4px; | |
| } | |
| .list-num { | |
| color: var(--orange); | |
| font-weight: 700; | |
| min-width: 30px; | |
| } | |
| table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 20px 0; | |
| } | |
| th, | |
| td { | |
| padding: 12px; | |
| text-align: left; | |
| border: 1px solid rgba(0, 212, 255, 0.2); | |
| } | |
| th { | |
| background: rgba(0, 212, 255, 0.1); | |
| color: var(--cyan); | |
| font-weight: 700; | |
| } | |
| .viz-container { | |
| background: rgba(0, 212, 255, 0.02); | |
| border: 1px solid rgba(0, 212, 255, 0.2); | |
| border-radius: 8px; | |
| padding: 20px; | |
| margin: 20px 0; | |
| display: flex; | |
| justify-content: center; | |
| overflow-x: auto; | |
| } | |
| .viz-controls { | |
| display: flex; | |
| gap: 10px; | |
| margin-top: 20px; | |
| justify-content: center; | |
| flex-wrap: wrap; | |
| } | |
| .btn-viz { | |
| padding: 10px 20px; | |
| background: var(--cyan); | |
| color: var(--bg); | |
| border: none; | |
| border-radius: 6px; | |
| font-weight: 600; | |
| cursor: pointer; | |
| font-size: 0.95em; | |
| transition: all 0.3s ease; | |
| } | |
| .btn-viz:hover { | |
| background: var(--orange); | |
| transform: scale(1.05); | |
| } | |
| canvas { | |
| max-width: 100%; | |
| height: auto; | |
| } | |
| @media (max-width: 768px) { | |
| h1 { | |
| font-size: 1.8em; | |
| } | |
| .tabs { | |
| flex-direction: column; | |
| } | |
| .tab-btn { | |
| width: 100%; | |
| } | |
| .grid { | |
| grid-template-columns: 1fr; | |
| } | |
| canvas { | |
| width: 100% !important; | |
| height: auto !important; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="container"> | |
| <!-- MAIN DASHBOARD --> | |
| <div id="dashboard" class="dashboard active"> | |
| <header> | |
| <h1>🧠 Complete Deep Learning & Computer Vision</h1> | |
| <p class="subtitle">Comprehensive Curriculum | Foundations to Advanced Applications</p> | |
| </header> | |
| <div style="text-align: center; margin-bottom: 40px;"> | |
| <p style="color: var(--text-dim); font-size: 1.1em;"> | |
| Master all aspects of deep learning and computer vision. 40 modules covering neural networks, CNNs, | |
| object detection, GANs, and more. | |
| </p> | |
| </div> | |
| <div class="grid" id="modulesGrid"></div> | |
| </div> | |
| <!-- MODULES CONTAINER --> | |
| <div id="modulesContainer"></div> | |
| </div> | |
| <script> | |
// Catalog of curriculum modules rendered as dashboard cards.
// Each descriptor carries: id (DOM/content key), title, icon, category
// (grouping label shown on the card), color (category accent), description.
// Entries are grouped by curriculum module below; order controls card order.
const modules = [
  // Module 1: Deep Learning Foundations
  { id: "nn-basics", title: "Introduction to Neural Networks", icon: "๐งฌ", category: "Foundations", color: "#0088ff", description: "Biological vs. Artificial neurons and network architecture" },
  { id: "perceptron", title: "The Perceptron", icon: "โ๏ธ", category: "Foundations", color: "#0088ff", description: "Single layer networks and their limitations" },
  { id: "mlp", title: "Multi-Layer Perceptron (MLP)", icon: "๐๏ธ", category: "Foundations", color: "#0088ff", description: "Hidden layers and deep architectures" },
  { id: "activation", title: "Activation Functions", icon: "โก", category: "Foundations", color: "#0088ff", description: "Sigmoid, ReLU, Tanh, Leaky ReLU, ELU, Softmax" },
  { id: "weight-init", title: "Weight Initialization", icon: "๐ฏ", category: "Foundations", color: "#0088ff", description: "Xavier, He, Random initialization strategies" },
  { id: "loss", title: "Loss Functions", icon: "๐", category: "Foundations", color: "#0088ff", description: "MSE, Binary Cross-Entropy, Categorical Cross-Entropy" },
  { id: "optimizers", title: "Optimizers", icon: "๐ฏ", category: "Training", color: "#00ff00", description: "SGD, Momentum, Adam, Adagrad, RMSprop" },
  { id: "backprop", title: "Forward & Backpropagation", icon: "โฌ ๏ธ", category: "Training", color: "#00ff00", description: "Chain rule and gradient computation" },
  { id: "regularization", title: "Regularization", icon: "๐ก๏ธ", category: "Training", color: "#00ff00", description: "L1/L2, Dropout, Early Stopping, Batch Norm" },
  { id: "batch-norm", title: "Batch Normalization", icon: "โ๏ธ", category: "Training", color: "#00ff00", description: "Stabilizing and speeding up training" },
  // Module 2: Computer Vision Fundamentals
  { id: "cv-intro", title: "CV Fundamentals", icon: "๐๏ธ", category: "Computer Vision", color: "#ff6b35", description: "Why ANNs fail with images, parameter explosion" },
  { id: "conv-layer", title: "Convolutional Layers", icon: "๐ผ๏ธ", category: "Computer Vision", color: "#ff6b35", description: "Kernels, filters, feature maps, stride, padding" },
  { id: "pooling", title: "Pooling Layers", icon: "๐ฆ", category: "Computer Vision", color: "#ff6b35", description: "Max pooling, average pooling, spatial reduction" },
  { id: "cnn-basics", title: "CNN Architecture", icon: "๐๏ธ", category: "Computer Vision", color: "#ff6b35", description: "Combining conv, pooling, and fully connected layers" },
  { id: "viz-filters", title: "Visualizing CNNs", icon: "๐", category: "Computer Vision", color: "#ff6b35", description: "What filters learn: edges โ shapes โ objects" },
  // Module 3: Advanced CNN Architectures
  { id: "lenet", title: "LeNet-5", icon: "๐ข", category: "CNN Architectures", color: "#ff00ff", description: "Classic digit recognizer (MNIST)" },
  { id: "alexnet", title: "AlexNet", icon: "๐", category: "CNN Architectures", color: "#ff00ff", description: "The breakthrough in deep computer vision (2012)" },
  { id: "vgg", title: "VGGNet", icon: "๐", category: "CNN Architectures", color: "#ff00ff", description: "VGG-16/19: Deep networks with small filters" },
  { id: "resnet", title: "ResNet", icon: "๐", category: "CNN Architectures", color: "#ff00ff", description: "Skip connections, solving vanishing gradients" },
  { id: "inception", title: "InceptionNet (GoogLeNet)", icon: "๐ฏ", category: "CNN Architectures", color: "#ff00ff", description: "1x1 convolutions, multi-scale feature extraction" },
  { id: "mobilenet", title: "MobileNet", icon: "๐ฑ", category: "CNN Architectures", color: "#ff00ff", description: "Depth-wise separable convolutions for efficiency" },
  { id: "transfer-learning", title: "Transfer Learning", icon: "๐", category: "CNN Architectures", color: "#ff00ff", description: "Fine-tuning and leveraging pre-trained models" },
  // Module 4: Object Detection & Segmentation
  { id: "localization", title: "Object Localization", icon: "๐", category: "Detection", color: "#00ff00", description: "Bounding boxes and classification together" },
  { id: "rcnn", title: "R-CNN Family", icon: "๐ฏ", category: "Detection", color: "#00ff00", description: "R-CNN, Fast R-CNN, Faster R-CNN" },
  { id: "yolo", title: "YOLO", icon: "โก", category: "Detection", color: "#00ff00", description: "Real-time object detection (v3, v5, v8)" },
  { id: "ssd", title: "SSD", icon: "๐", category: "Detection", color: "#00ff00", description: "Single Shot MultiBox Detector" },
  { id: "semantic-seg", title: "Semantic Segmentation", icon: "๐๏ธ", category: "Segmentation", color: "#00ff00", description: "Pixel-level classification (U-Net)" },
  { id: "instance-seg", title: "Instance Segmentation", icon: "๐ฅ", category: "Segmentation", color: "#00ff00", description: "Mask R-CNN and separate object instances" },
  { id: "face-recog", title: "Face Recognition", icon: "๐ค", category: "Segmentation", color: "#00ff00", description: "Siamese networks and triplet loss" },
  // Module 5: Generative Models
  { id: "autoencoders", title: "Autoencoders", icon: "๐", category: "Generative", color: "#ffaa00", description: "Encoder-decoder, latent space, denoising" },
  { id: "gans", title: "GANs (Generative Adversarial Networks)", icon: "๐ฎ", category: "Generative", color: "#ffaa00", description: "Generator vs. Discriminator, DCGAN" },
  { id: "diffusion", title: "Diffusion Models", icon: "๐", category: "Generative", color: "#ffaa00", description: "Foundation of Stable Diffusion and DALL-E" },
  // Additional Advanced Topics
  { id: "rnn", title: "RNNs & LSTMs", icon: "๐", category: "Sequence", color: "#ff6b35", description: "Recurrent networks for sequential data" },
  { id: "transformers", title: "Transformers", icon: "๐", category: "Sequence", color: "#ff6b35", description: "\"Attention Is All You Need\" - Complete paper breakdown with math" },
  { id: "bert", title: "BERT & NLP Transformers", icon: "๐", category: "NLP", color: "#ff6b35", description: "Bidirectional transformers for language" },
  { id: "gpt", title: "GPT & Language Models", icon: "๐ฌ", category: "NLP", color: "#ff6b35", description: "Autoregressive models and text generation" },
  { id: "vit", title: "Vision Transformers (ViT)", icon: "๐จ", category: "Vision", color: "#ff6b35", description: "Transformers applied to image data" },
  { id: "gnn", title: "Graph Neural Networks", icon: "๐ธ๏ธ", category: "Advanced", color: "#9900ff", description: "Deep learning on non-Euclidean graph data" },
  { id: "seq2seq", title: "Seq2Seq & Attention", icon: "โก๏ธ", category: "NLP", color: "#ff6b35", description: "Encoder-Decoder models and Attention mechanics" },
  { id: "research-papers", title: "Research Library", icon: "๐", category: "Advanced", color: "#9900ff", description: "Curated collection of seminal deep learning papers" },
];
| // Comprehensive content for all modules | |
| const MODULE_CONTENT = { | |
| "nn-basics": { | |
| overview: ` | |
| <h3>What are Neural Networks?</h3> | |
| <p>Neural Networks are computational models inspired by the human brain's structure. They consist of interconnected nodes (neurons) organized in layers that process information through weighted connections.</p> | |
| <h3>Why Use Neural Networks?</h3> | |
| <ul> | |
| <li><strong>Universal Approximation:</strong> Can theoretically approximate any continuous function</li> | |
| <li><strong>Feature Learning:</strong> Automatically discover representations from raw data</li> | |
| <li><strong>Adaptability:</strong> Learn from examples without explicit programming</li> | |
| <li><strong>Parallel Processing:</strong> Highly parallelizable for modern hardware</li> | |
| </ul> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Advantages</div> | |
| โข Non-linear problem solving<br> | |
| โข Robust to noisy data<br> | |
| โข Works with incomplete information<br> | |
| โข Continuous learning capability | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Disadvantages</div> | |
| โข Requires large amounts of training data<br> | |
| โข Computationally expensive<br> | |
| โข "Black box" - difficult to interpret<br> | |
| โข Prone to overfitting without regularization | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Core Components</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Neurons (Nodes):</strong> Basic computational units that receive inputs, apply weights, add bias, and apply activation function</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Layers:</strong> Input layer (receives data), Hidden layers (feature extraction), Output layer (predictions)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Weights:</strong> Parameters learned during training that determine connection strength</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Bias:</strong> Allows shifting the activation function for better fitting</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div><strong>Activation Function:</strong> Introduces non-linearity (ReLU, Sigmoid, Tanh)</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <h3>Real-World Applications</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Healthcare</div> | |
| <div class="box-content">Disease diagnosis, medical image analysis, drug discovery, patient risk prediction</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฐ Finance</div> | |
| <div class="box-content">Fraud detection, algorithmic trading, credit scoring, portfolio optimization</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ E-commerce</div> | |
| <div class="box-content">Recommendation systems, demand forecasting, customer segmentation, price optimization</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Fundamental Equations of a Neuron</h3> | |
| <p>A single neuron performs a weighted sum followed by an activation function. This is the atomic building block of all neural networks.</p> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 25px; border-radius: 8px;"> | |
| <strong>z = ฮฃ(wแตขxแตข) + b = wโxโ + wโxโ + ... + wโxโ + b</strong><br> | |
| <strong>a = ฯ(z)</strong> | |
| </div> | |
| <h4>Step-by-Step: Single Neuron Forward Pass</h4> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div> | |
| <strong>Weighted Sum (Linear):</strong><br> | |
| z = wแตx + b = ฮฃแตข wแตขxแตข + b<br> | |
| <span class="formula-caption">This is a dot product plus bias - pure linear algebra</span> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div> | |
| <strong>Activation (Non-Linear):</strong><br> | |
| a = ฯ(z) where ฯ can be ReLU, Sigmoid, Tanh, etc.<br> | |
| <span class="formula-caption">This introduces non-linearity, enabling complex functions</span> | |
| </div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Manual Forward Pass</div> | |
| <strong>Inputs:</strong> x = [2, 3], <strong>Weights:</strong> w = [0.5, -0.3], <strong>Bias:</strong> b = 0.1<br><br> | |
| <strong>Step 1 - Weighted Sum:</strong><br> | |
| z = (0.5 ร 2) + (-0.3 ร 3) + 0.1<br> | |
| z = 1.0 - 0.9 + 0.1 = <strong>0.2</strong><br><br> | |
| <strong>Step 2 - ReLU Activation:</strong><br> | |
| a = max(0, 0.2) = <strong>0.2</strong><br><br> | |
| <strong>Step 2 (alt) - Sigmoid Activation:</strong><br> | |
| a = 1 / (1 + eโปโฐยทยฒ) = 1 / 1.819 โ <strong>0.55</strong> | |
| </div> | |
| <h3>Network Layer in Matrix Form</h3> | |
| <p>For a layer with n inputs and m neurons, we use matrices for efficiency:</p> | |
| <div class="formula"> | |
| <strong>Z = WX + b</strong><br> | |
| <strong>A = ฯ(Z)</strong><br><br> | |
| Where:<br> | |
| โข W โ โแตหฃโฟ (weight matrix: m neurons, n inputs)<br> | |
| โข X โ โโฟหฃยน (input vector)<br> | |
| โข b โ โแตหฃยน (bias vector)<br> | |
| โข Z โ โแตหฃยน (pre-activation)<br> | |
| โข A โ โแตหฃยน (activation output) | |
| </div> | |
| <h3>Parameter Count Formula</h3> | |
| <div class="formula"> | |
| For a layer: n_in โ n_out<br> | |
| <strong>Parameters = n_in ร n_out + n_out</strong><br> | |
| (weights) + (biases)<br><br> | |
| Example: Layer 784 โ 128<br> | |
| Params = 784 ร 128 + 128 = 100,480 | |
| </div> | |
| ` | |
| }, | |
| "activation": { | |
| overview: ` | |
| <h3>What are Activation Functions?</h3> | |
| <p>Activation functions introduce non-linearity into neural networks, enabling them to learn complex patterns. Without activation functions, a neural network would be just a linear regression model regardless of depth.</p> | |
| <h3>Why Do We Need Them?</h3> | |
| <ul> | |
| <li><strong>Non-linearity:</strong> Real-world problems are rarely linear</li> | |
| <li><strong>Complex Pattern Learning:</strong> Enable learning of intricate decision boundaries</li> | |
| <li><strong>Gradient Flow:</strong> Control how gradients propagate during backpropagation</li> | |
| <li><strong>Range Normalization:</strong> Keep activations in manageable ranges</li> | |
| </ul> | |
| <h3>Common Activation Functions Comparison</h3> | |
| <table> | |
| <tr> | |
| <th>Function</th> | |
| <th>Range</th> | |
| <th>Best Use</th> | |
| <th>Issue</th> | |
| </tr> | |
| <tr> | |
| <td>ReLU</td> | |
| <td>[0, โ)</td> | |
| <td>Hidden layers (default)</td> | |
| <td>Dying ReLU problem</td> | |
| </tr> | |
| <tr> | |
| <td>Sigmoid</td> | |
| <td>(0, 1)</td> | |
| <td>Binary classification output</td> | |
| <td>Vanishing gradients</td> | |
| </tr> | |
| <tr> | |
| <td>Tanh</td> | |
| <td>(-1, 1)</td> | |
| <td>RNNs, zero-centered</td> | |
| <td>Vanishing gradients</td> | |
| </tr> | |
| <tr> | |
| <td>Leaky ReLU</td> | |
| <td>(-โ, โ)</td> | |
| <td>Fixes dying ReLU</td> | |
| <td>Extra hyperparameter</td> | |
| </tr> | |
| <tr> | |
| <td>Softmax</td> | |
| <td>(0, 1) sum=1</td> | |
| <td>Multi-class output</td> | |
| <td>Computationally expensive</td> | |
| </tr> | |
| <tr> | |
| <td>GELU</td> | |
| <td>(-0.17, โ)</td> | |
| <td>Transformers (BERT, GPT)</td> | |
| <td>Computationally expensive</td> | |
| </tr> | |
| <tr> | |
| <td>Swish</td> | |
| <td>(-0.28, โ)</td> | |
| <td>Deep networks (40+ layers)</td> | |
| <td>Slightly slower than ReLU</td> | |
| </tr> | |
| </table> | |
| `, | |
| concepts: ` | |
| <h3>Key Properties</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Differentiability:</strong> Must have derivatives for backpropagation to work</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Monotonicity:</strong> Preferably monotonic for easier optimization</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Zero-Centered:</strong> Helps with faster convergence (Tanh)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Computational Efficiency:</strong> Should be fast to compute (ReLU wins)</div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก Best Practices</div> | |
| โข Use <strong>ReLU</strong> for hidden layers by default<br> | |
| โข Use <strong>Sigmoid</strong> for binary classification output<br> | |
| โข Use <strong>Softmax</strong> for multi-class classification<br> | |
| โข Try <strong>Leaky ReLU</strong> or <strong>ELU</strong> if ReLU neurons are dying<br> | |
| โข Avoid Sigmoid/Tanh in deep networks (gradient vanishing) | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Dead Neurons (Dying ReLU Problem)</div> | |
| When a neuron's input is always negative, ReLU outputs 0 and its gradient is 0.<br> | |
| The neuron <strong>never updates</strong> โ it's permanently "dead".<br><br> | |
| <strong>Solutions:</strong><br> | |
| โข Use <strong>Leaky ReLU</strong> (small slope for negative values)<br> | |
| โข Use <strong>ELU</strong> (exponential for negative values)<br> | |
| โข Careful weight initialization (He Initialization) | |
| </div> | |
| <h3>GELU (Gaussian Error Linear Unit)</h3> | |
| <p>Used in <strong>Transformers, BERT, and GPT</strong>. GELU multiplies the input by the probability that it's positive under a Gaussian distribution.</p> | |
| <div class="formula"> | |
| GELU(x) = x ร ฮฆ(x) โ 0.5x(1 + tanh(โ(2/ฯ)(x + 0.044715xยณ))) | |
| </div> | |
| <h3>Swish (Self-Gated Activation)</h3> | |
| <p>Developed by Google researchers. Consistently matches or outperforms ReLU on deep networks.</p> | |
| <div class="formula"> | |
| Swish(x) = x ร ฯ(ฮฒx) where ฯ = Sigmoid, ฮฒ = learnable parameter | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก Why Swish is Better</div> | |
| โข <strong>Smooth:</strong> Doesn't abruptly change direction like ReLU at x=0<br> | |
| โข <strong>Non-monotonous:</strong> Small negative values preserved (not zeroed like ReLU)<br> | |
| โข <strong>Unbounded above, bounded below:</strong> Best of both worlds<br> | |
| โข Best for networks with depth > 40 layers | |
| </div> | |
| <h3>How to Choose Activation Functions</h3> | |
| <table> | |
| <tr><th>Layer / Task</th><th>Recommended</th></tr> | |
| <tr><td>Hidden layers (default)</td><td>ReLU</td></tr> | |
| <tr><td>Regression output</td><td>Linear (no activation)</td></tr> | |
| <tr><td>Binary classification output</td><td>Sigmoid</td></tr> | |
| <tr><td>Multi-class classification</td><td>Softmax</td></tr> | |
| <tr><td>Multi-label classification</td><td>Sigmoid</td></tr> | |
| <tr><td>CNN hidden layers</td><td>ReLU</td></tr> | |
| <tr><td>RNN hidden layers</td><td>Tanh / Sigmoid</td></tr> | |
| <tr><td>Transformers</td><td>GELU</td></tr> | |
| <tr><td>Deep networks (40+ layers)</td><td>Swish</td></tr> | |
| </table> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ง Neural Network Design</div> | |
| <div class="box-content"> | |
| Critical choice for every neural network - affects training speed, convergence, and final accuracy | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Task-Specific Selection</div> | |
| <div class="box-content"> | |
| Different tasks need different outputs: Sigmoid for binary, Softmax for multi-class, Linear for regression | |
| </div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ค Probable Interview Questions</div> | |
| 1. Why do we need activation functions?<br> | |
| 2. What is vanishing gradient?<br> | |
| 3. Why is ReLU preferred over sigmoid?<br> | |
| 4. What are dead neurons?<br> | |
| 5. Difference between ReLU and Leaky ReLU?<br> | |
| 6. Why softmax instead of sigmoid for multiclass?<br> | |
| 7. Why linear activation for regression output?<br> | |
| 8. Why GELU is used in transformers?<br> | |
| 9. Can activation function affect convergence speed?<br> | |
| 10. What happens if we remove activation functions? | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Derivatives: The Backprop Fuel</h3> | |
| <p>Activation functions must be differentiable for backpropagation to work. Let's look at the derivatives on paper:</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Sigmoid:</strong> ฯ(z) = 1 / (1 + eโปแถป)<br> | |
| <strong>Derivative:</strong> ฯ'(z) = ฯ(z)(1 - ฯ(z))<br> | |
| <span class="formula-caption">Max gradient is 0.25 (at z=0). This is why deep networks vanish!</span></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Tanh:</strong> tanh(z) = (eแถป - eโปแถป) / (eแถป + eโปแถป)<br> | |
| <strong>Derivative:</strong> tanh'(z) = 1 - tanhยฒ(z)<br> | |
| <span class="formula-caption">Max gradient is 1.0 (at z=0). Better than Sigmoid, but still vanishes.</span></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>ReLU:</strong> max(0, z)<br> | |
| <strong>Derivative:</strong> 1 if z > 0, else 0<br> | |
| <span class="formula-caption">Gradient is 1.0 for all positive z. No vanishing! But 0 for negative (Dying ReLU).</span></div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: The Chain Effect</div> | |
| Each layer multiplies the gradient by ฯ'(z). <br> | |
| For 10 Sigmoid layers: Total gradient โ (0.25)ยนโฐ โ <strong>0.00000095</strong><br> | |
| This is the mathematical proof of the Vanishing Gradient Problem! | |
| </div> | |
| ` | |
| }, | |
| "conv-layer": { | |
| overview: ` | |
| <h3>What are Convolutional Layers?</h3> | |
| <p>Convolutional layers are the fundamental building blocks of CNNs. They apply learnable filters (kernels) across input data to detect local patterns like edges, textures, and shapes.</p> | |
| <h3>Why Use Convolutions Instead of Fully Connected Layers?</h3> | |
| <ul> | |
| <li><strong>Parameter Efficiency:</strong> Share weights across spatial locations (fewer parameters)</li> | |
| <li><strong>Translation Invariance:</strong> Detect features regardless of position</li> | |
| <li><strong>Local Connectivity:</strong> Each neuron sees | |
| only a small region (receptive field)</li> | |
| <li><strong>Hierarchical Learning:</strong> Build complex features from simple ones</li> | |
| </ul> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Example: Parameter Comparison</div> | |
| For a 224×224 RGB image:<br> | |
| • <strong>Fully Connected:</strong> 224 × 224 × 3 × 1000 = 150M parameters (for 1000 neurons)<br> | |
| • <strong>Convolutional (3×3):</strong> 3 × 3 × 3 × 64 = 1,728 parameters (for 64 filters)<br> | |
| <strong>Result:</strong> 87,000× fewer parameters! 🎉 | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Advantages</div> | |
| • Drastically reduced parameters<br> | |
| • Spatial hierarchy (edges → textures → parts → objects)<br> | |
| • GPU-friendly (highly parallelizable)<br> | |
| • Built-in translation equivariance | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Disadvantages</div> | |
| • Not rotation invariant (require data augmentation)<br> | |
| • Fixed receptive field size<br> | |
| • Memory intensive during training<br> | |
| • Require careful hyperparameter tuning (kernel size, stride, padding) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Key Hyperparameters</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Kernel/Filter Size:</strong> Typically 3×3 or 5×5. Smaller = more layers needed, larger = more parameters</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Stride:</strong> Step size when sliding filter. Stride=1 (preserves size), Stride=2 (downsamples by 2×)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Padding:</strong> Add zeros around borders. 'SAME' keeps size, 'VALID' shrinks output</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Number of Filters:</strong> Each filter learns different features. More filters = more capacity but slower</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div><strong>Dilation:</strong> Spacing between kernel elements. Increases receptive field without adding parameters</div> | |
| </div> | |
| <div class="formula"> | |
| Output Size Formula:<br> | |
| W_out = floor((W_in + 2×padding - kernel_size) / stride) + 1<br> | |
| H_out = floor((H_in + 2×padding - kernel_size) / stride) + 1 | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Mathematical Operation: Cross-Correlation</h3> | |
| <p>In deep learning, what we call "convolution" is mathematically "cross-correlation". It is a local dot product of the kernel and image patch.</p> | |
| <div class="formula"> | |
| S(i, j) = (I * K)(i, j) = Σ_m Σ_n I(i+m, j+n) K(m, n) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Manual Convolution</div> | |
| <strong>Input (3x3):</strong><br> | |
| [1 2 0]<br> | |
| [0 1 1]<br> | |
| [1 0 2]<br> | |
| <br> | |
| <strong>Kernel (2x2):</strong><br> | |
| [1 0]<br> | |
| [0 1]<br> | |
| <br> | |
| <strong>Calculation:</strong><br> | |
| Step 1 (Top-Left): (1x1) + (2x0) + (0x0) + (1x1) = <strong>2</strong><br> | |
| Step 2 (Top-Right): (2x1) + (0x0) + (1x0) + (1x1) = <strong>3</strong><br> | |
| ... Output is a 2x2 matrix. | |
| </div> | |
| <h3>Backprop through Conv</h3> | |
| <p>Calculated using the same formula but with the kernel flipped vertically and horizontally (true convolution)!</p> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Feature Extraction</div> | |
| <div class="box-content"> | |
| Early layers learn edges (Gabor-like filters), middle layers learn textures, deep layers learn specific object parts (eyes, wheels). | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐จ Image Processing</div> | |
| <div class="box-content"> | |
| Blurring, sharpening, and edge detection in Photoshop/GIMP are all done with 2D convolutions using fixed kernels. | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "yolo": { | |
| overview: ` | |
| <h3>What is YOLO?</h3> | |
| <p>YOLO (You Only Look Once) treats object detection as a single regression problem, going directly from image pixels to bounding box coordinates and class probabilities in one forward pass.</p> | |
| <h3>Why YOLO Over R-CNN?</h3> | |
| <ul> | |
| <li><strong>Speed:</strong> 45+ FPS (real-time) vs R-CNN's ~0.05 FPS</li> | |
| <li><strong>Global Context:</strong> Sees entire image during training (fewer background errors)</li> | |
| <li><strong>One Network:</strong> Unlike R-CNN's multi-stage pipeline</li> | |
| <li><strong>End-to-End Training:</strong> Optimize detection directly</li> | |
| </ul> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Advantages</div> | |
| • <strong>Lightning Fast:</strong> Real-time inference (YOLOv8 at 100+ FPS)<br> | |
| • <strong>Simple Architecture:</strong> Single network, easy to train<br> | |
| • <strong>Generalizes Well:</strong> Works on natural images and artwork<br> | |
| • <strong>Small Model Size:</strong> Can run on edge devices (mobile, IoT) | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Disadvantages</div> | |
| • <strong>Struggles with Small Objects:</strong> Grid limitation affects tiny items<br> | |
| • <strong>Localization Errors:</strong> Less precise than two-stage detectors<br> | |
| • <strong>Limited Objects per Cell:</strong> Can't detect many close objects<br> | |
| • <strong>Aspect Ratio Issues:</strong> Struggles with unusual object shapes | |
| </div> | |
| <h3>YOLO Evolution</h3> | |
| <table> | |
| <tr> | |
| <th>Version</th> | |
| <th>Year</th> | |
| <th>Key Innovation</th> | |
| <th>mAP</th> | |
| </tr> | |
| <tr> | |
| <td>YOLOv1</td> | |
| <td>2015</td> | |
| <td>Original single-shot detector</td> | |
| <td>63.4%</td> | |
| </tr> | |
| <tr> | |
| <td>YOLOv3</td> | |
| <td>2018</td> | |
| <td>Multi-scale predictions</td> | |
| <td>57.9% (faster)</td> | |
| </tr> | |
| <tr> | |
| <td>YOLOv5</td> | |
| <td>2020</td> | |
| <td>PyTorch, Auto-augment</td> | |
| <td>~50% (optimized)</td> | |
| </tr> | |
| <tr> | |
| <td>YOLOv8</td> | |
| <td>2023</td> | |
| <td>Anchor-free, SOTA speed</td> | |
| <td>53.9% (real-time)</td> | |
| </tr> | |
| </table> | |
| `, | |
| concepts: ` | |
| <h3>How YOLO Works (3 Steps)</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Grid Division:</strong> Divide image into S×S grid (e.g., 7×7). Each cell predicts B bounding boxes</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Predictions Per Cell:</strong> Each box predicts (x, y, w, h, confidence) + class probabilities</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Non-Max Suppression:</strong> Remove duplicate detections, keep highest confidence boxes</div> | |
| </div> | |
| <div class="formula"> | |
| Output Tensor Shape (YOLOv1):<br> | |
| S × S × (B × 5 + C)<br> | |
| Example: 7 × 7 × (2 × 5 + 20) = 7 × 7 × 30<br> | |
| <br> | |
| Where:<br> | |
| • S = grid size (7)<br> | |
| • B = boxes per cell (2)<br> | |
| • 5 = (x, y, w, h, confidence)<br> | |
| • C = number of classes (20 for PASCAL VOC) | |
| </div> | |
| `, | |
| applications: ` | |
| <h3>Industry Applications</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Autonomous Vehicles</div> | |
| <div class="box-content"> | |
| Real-time detection of pedestrians, vehicles, traffic signs, and lane markings for self-driving cars | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ญ Manufacturing</div> | |
| <div class="box-content"> | |
| Quality control, defect detection on assembly lines, robot guidance, inventory management | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ก๏ธ Security & Surveillance</div> | |
| <div class="box-content"> | |
| Intrusion detection, crowd monitoring, suspicious behavior analysis, license plate recognition | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Medical Imaging</div> | |
| <div class="box-content"> | |
| Tumor localization, cell counting, anatomical structure detection in X-rays/CT scans | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Intersection over Union (IoU)</h3> | |
| <p>How do we measure if a predicted box is correct? We use the geometric ratio of intersection and union.</p> | |
| <div class="formula"> | |
| IoU = Area of Overlap / Area of Union | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Manual IoU</div> | |
| <strong>Box A (GT):</strong> [0,0,10,10] (Area=100)<br> | |
| <strong>Box B (Pred):</strong> [5,5,15,15] (Area=100)<br> | |
| 1. <strong>Intersection:</strong> Area between [5,5] and [10,10] = 5x5 = 25<br> | |
| 2. <strong>Union:</strong> Area A + Area B - Intersection = 100 + 100 - 25 = 175<br> | |
| 3. <strong>IoU:</strong> 25 / 175 ≈ <strong>0.142</strong> (Poor match!) | |
| </div> | |
| <h3>YOLO Multi-Part Loss</h3> | |
| <p>YOLO uses a composite loss function combining localization, confidence, and classification errors.</p> | |
| <div class="formula"> | |
| L = λ_coord Σ(Localization Loss) + Σ(Confidence Loss) + Σ(Classification Loss) | |
| </div> | |
| ` | |
| }, | |
| "transformers": { | |
| overview: ` | |
| <h3>๐ "Attention Is All You Need" (Vaswani et al., 2017)</h3> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper Reference</div> | |
| <strong>Authors:</strong> Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin<br> | |
| <strong>Published:</strong> NeurIPS 2017 | <strong>Citations:</strong> 100,000+<br> | |
| <strong>Link:</strong> <a href="https://arxiv.org/abs/1706.03762" target="_blank" style="color: var(--cyan);">arxiv.org/abs/1706.03762</a> | |
| </div> | |
| <h3>What are Transformers?</h3> | |
| <p>Transformers are neural architectures based <strong>entirely on attention mechanisms</strong>, completely eliminating recurrence and convolutions. This groundbreaking paper introduced the architecture that now powers GPT-4, Claude, BERT, and virtually all modern AI.</p> | |
| <h3>The Problem with RNNs/LSTMs</h3> | |
| <ul> | |
| <li><strong>Sequential Processing:</strong> RNNs must process tokens one-by-one, preventing parallelization</li> | |
| <li><strong>Memory Bottleneck:</strong> Hidden state must compress entire history into fixed-size vector</li> | |
| <li><strong>Long-Range Dependencies:</strong> Gradients vanish/explode over long sequences</li> | |
| <li><strong>Slow Training:</strong> Cannot leverage GPU parallelism effectively</li> | |
| </ul> | |
| <h3>The Transformer Solution</h3> | |
| <ul> | |
| <li><strong>Parallelization:</strong> Process entire sequence at once (O(1) sequential operations)</li> | |
| <li><strong>Direct Connections:</strong> Any position can attend to any other position directly</li> | |
| <li><strong>No Vanishing Gradients:</strong> Attention + residual connections maintain gradient flow</li> | |
| <li><strong>Massive Scalability:</strong> Performance scales with compute and data</li> | |
| </ul> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Paper's Key Results</div> | |
| • <strong>WMT 2014 English-to-German:</strong> 28.4 BLEU (new SOTA, +2 over previous best)<br> | |
| • <strong>WMT 2014 English-to-French:</strong> 41.8 BLEU (single model SOTA)<br> | |
| • <strong>Training Time:</strong> 3.5 days on 8 GPUs (fraction of previous models)<br> | |
| • <strong>English Constituency Parsing:</strong> Generalizes beyond translation | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Limitations Acknowledged in Paper</div> | |
| • <strong>Quadratic Complexity:</strong> O(n²) memory for sequence length n<br> | |
| • <strong>No Positional Bias:</strong> Must explicitly inject position information<br> | |
| • <strong>Data Hungry:</strong> Requires large datasets to train from scratch | |
| </div> | |
| <h3>Transformer Variants (Evolution)</h3> | |
| <table> | |
| <tr> | |
| <th>Model</th> | |
| <th>Type</th> | |
| <th>Architecture</th> | |
| <th>Best For</th> | |
| </tr> | |
| <tr> | |
| <td>Original Transformer</td> | |
| <td>Encoder-Decoder</td> | |
| <td>Full (6+6 layers)</td> | |
| <td>Machine Translation</td> | |
| </tr> | |
| <tr> | |
| <td>BERT (2018)</td> | |
| <td>Encoder-only</td> | |
| <td>Bidirectional</td> | |
| <td>Understanding (NLU)</td> | |
| </tr> | |
| <tr> | |
| <td>GPT (2018-2023)</td> | |
| <td>Decoder-only</td> | |
| <td>Autoregressive</td> | |
| <td>Generation (NLG)</td> | |
| </tr> | |
| <tr> | |
| <td>T5 (2019)</td> | |
| <td>Encoder-Decoder</td> | |
| <td>Text-to-Text</td> | |
| <td>All NLP tasks</td> | |
| </tr> | |
| <tr> | |
| <td>ViT (2020)</td> | |
| <td>Encoder-only</td> | |
| <td>Image patches</td> | |
| <td>Computer Vision</td> | |
| </tr> | |
| </table> | |
| `, | |
| concepts: ` | |
| <h3>๐๏ธ Complete Architecture Overview</h3> | |
| <p>The Transformer follows an <strong>Encoder-Decoder</strong> structure, but each component uses only attention mechanisms.</p> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฆ Model Hyperparameters (Base Model)</div> | |
| <div class="box-content"> | |
| • <strong>d_model = 512:</strong> Embedding dimension<br> | |
| • <strong>d_ff = 2048:</strong> Feed-forward hidden dimension<br> | |
| • <strong>h = 8:</strong> Number of attention heads<br> | |
| • <strong>d_k = d_v = 64:</strong> Key/Value dimensions (d_model / h)<br> | |
| • <strong>N = 6:</strong> Number of encoder AND decoder layers<br> | |
| • <strong>Total Parameters:</strong> ~65 million | |
| </div> | |
| </div> | |
| <h3>Core Components</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Input Embedding:</strong> Convert tokens to d_model dimensional vectors, scaled by √d_model</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Positional Encoding:</strong> Add sinusoidal position signals so attention knows token order</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Multi-Head Self-Attention:</strong> h parallel attention heads, each computing different relationships</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Add & Norm:</strong> Residual connection followed by Layer Normalization</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div><strong>Feed-Forward Network:</strong> Two linear layers with ReLU: FFN(x) = max(0, xW₁ + b₁)W₂ + b₂</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">06</div> | |
| <div><strong>Masked Attention (Decoder):</strong> Prevent attending to future tokens during training</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">07</div> | |
| <div><strong>Encoder-Decoder Attention:</strong> Decoder attends to encoder output (cross-attention)</div> | |
| </div> | |
| <h3>Encoder Stack (N=6 layers)</h3> | |
| <div class="formula"> | |
| Each encoder layer:<br> | |
| sublayer_1 = LayerNorm(x + MultiHeadAttention(x, x, x))<br> | |
| sublayer_2 = LayerNorm(sublayer_1 + FFN(sublayer_1))<br> | |
| <br> | |
| The encoder processes the input sequence bidirectionally.<br> | |
| All positions can attend to all positions. | |
| </div> | |
| <h3>Decoder Stack (N=6 layers)</h3> | |
| <div class="formula"> | |
| Each decoder layer has THREE sub-layers:<br> | |
| 1. Masked Self-Attention (prevent looking ahead)<br> | |
| 2. Encoder-Decoder Attention (attend to encoder output)<br> | |
| 3. Feed-Forward Network<br> | |
| <br> | |
| Output: LayerNorm(x + sublayer(x)) for each | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ก Key Insight: Residual Connections</div> | |
| Every sub-layer has a residual connection: <strong>output = x + Sublayer(x)</strong><br> | |
| This is crucial for:<br> | |
| 1. Training very deep models (gradient highway)<br> | |
| 2. Preserving information across layers<br> | |
| 3. Enabling optional "skipping" of transformations | |
| </div> | |
| `, | |
| applications: ` | |
| <h3>๐ Revolutionary Applications</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ Large Language Models</div> | |
| <div class="box-content"> | |
| <strong>GPT-4, Claude, Gemini:</strong> The most capable AI systems ever built<br> | |
| <strong>ChatGPT:</strong> 100M+ users in 2 months (fastest product adoption ever)<br> | |
| <strong>BERT:</strong> Powers Google Search for 70+ languages | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Machine Translation (Original Use Case)</div> | |
| <div class="box-content"> | |
| <strong>Google Translate:</strong> Switched to Transformers in 2017<br> | |
| <strong>DeepL:</strong> Transformer-based, often beats Google<br> | |
| <strong>NLLB-200:</strong> Meta's model translates 200 languages | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐จ Multi-Modal AI</div> | |
| <div class="box-content"> | |
| <strong>DALL-E 3, Midjourney, Stable Diffusion:</strong> Text-to-image generation<br> | |
| <strong>GPT-4V, Gemini:</strong> Vision + Language understanding<br> | |
| <strong>Whisper:</strong> State-of-the-art speech recognition<br> | |
| <strong>Sora:</strong> Text-to-video generation | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐งฌ Scientific Breakthroughs</div> | |
| <div class="box-content"> | |
| <strong>AlphaFold 2:</strong> Solved 50-year protein folding problem (Nobel Prize 2024)<br> | |
| <strong>ESMFold:</strong> Meta's protein predictor<br> | |
| <strong>Drug Discovery:</strong> Accelerating molecule design | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ป Code & Development</div> | |
| <div class="box-content"> | |
| <strong>GitHub Copilot:</strong> AI pair programmer (used by millions of devs)<br> | |
| <strong>Claude Code, Cursor:</strong> AI coding assistants<br> | |
| <strong>AlphaCode:</strong> Competitive programming solver | |
| </div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ Impact Statistics</div> | |
| • Paper has 100,000+ citations (one of most cited CS papers ever)<br> | |
| • Spawned multi-trillion dollar industry<br> | |
| • Every major AI lab now uses Transformer variants<br> | |
| • Considered the "ImageNet moment" for NLP | |
| </div> | |
| `, | |
| math: ` | |
| <h3>๐ Paper & Pain: Complete Mathematical Derivation</h3> | |
| <p>Let's derive every formula from the paper with step-by-step calculations.</p> | |
| <h3>1. Scaled Dot-Product Attention</h3> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 25px; border-radius: 8px;"> | |
| <strong>Attention(Q, K, V) = softmax(QKᵀ / √dₖ) V</strong> | |
| </div> | |
| <h4>Step-by-Step Derivation:</h4> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div> | |
| <strong>Create Q, K, V matrices:</strong><br> | |
| Given input X ∈ ℝ<sup>n×d</sup> (n tokens, d dimensions)<br> | |
| Q = XW<sup>Q</sup>, K = XW<sup>K</sup>, V = XW<sup>V</sup><br> | |
| where W<sup>Q</sup>, W<sup>K</sup> ∈ ℝ<sup>d×dₖ</sup> and W<sup>V</sup> ∈ ℝ<sup>d×dᵥ</sup> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div> | |
| <strong>Compute Attention Scores:</strong><br> | |
| scores = QKᵀ ∈ ℝ<sup>n×n</sup><br> | |
| Each score[i,j] = dot product of query_i and key_j<br> | |
| Measures "how much should position i attend to position j" | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div> | |
| <strong>Scale by √dₖ:</strong><br> | |
| scaled_scores = QKᵀ / √dₖ<br> | |
| <em>Why scale?</em> Dot products grow with dimension. If dₖ = 64:<br> | |
| E[qᵀk] = 0, Var[qᵀk] = dₖ = 64<br> | |
| Without scaling, softmax saturates → vanishing gradients! | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div> | |
| <strong>Apply Softmax:</strong><br> | |
| attention_weights = softmax(scaled_scores) ∈ ℝ<sup>n×n</sup><br> | |
| Each row sums to 1 (probability distribution over positions) | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div> | |
| <strong>Weighted Sum of Values:</strong><br> | |
| output = attention_weights × V ∈ ℝ<sup>n×dᵥ</sup><br> | |
| Each output[i] is a weighted combination of all values | |
| </div> | |
| </div> | |
| <h3>2. Multi-Head Attention</h3> | |
| <div class="formula" style="background: rgba(255, 107, 53, 0.08); padding: 20px; border-radius: 8px;"> | |
| MultiHead(Q, K, V) = Concat(head₁, ..., headₕ)W<sup>O</sup><br> | |
| where headᵢ = Attention(QWᵢ<sup>Q</sup>, KWᵢ<sup>K</sup>, VWᵢ<sup>V</sup>)<br><br> | |
| <strong>Paper's values:</strong><br> | |
| h = 8 heads, dₖ = dᵥ = d_model/h = 512/8 = 64<br> | |
| W<sup>O</sup> ∈ ℝ<sup>(h·dᵥ)×d_model</sup> = ℝ<sup>512×512</sup> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ก Why Multiple Heads?</div> | |
| Each head can learn <strong>different relationships</strong>:<br> | |
| • Head 1: Syntactic dependencies (subject-verb)<br> | |
| • Head 2: Pronoun resolution (he → John)<br> | |
| • Head 3: Semantic similarity<br> | |
| • Head 4-8: Other patterns<br> | |
| <br> | |
| <strong>Computation:</strong> Same cost as single-head with full d_model!<br> | |
| 8 heads × 64 dim = 1 head × 512 dim (same FLOPs) | |
| </div> | |
| <h3>3. Positional Encoding (Sinusoidal)</h3> | |
| <div class="formula" style="background: rgba(46, 204, 113, 0.08); padding: 20px; border-radius: 8px;"> | |
| PE(pos, 2i) = sin(pos / 10000^(2i/d_model))<br> | |
| PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))<br><br> | |
| pos = position in sequence (0, 1, 2, ...)<br> | |
| i = dimension index (0, 1, ..., d_model/2 - 1) | |
| </div> | |
| <h4>Worked Example (d_model = 4):</h4> | |
| <div class="formula" style="background: var(--surface); padding: 15px;"> | |
| For position pos = 0:<br> | |
| PE(0, 0) = sin(0) = 0<br> | |
| PE(0, 1) = cos(0) = 1<br> | |
| PE(0, 2) = sin(0) = 0<br> | |
| PE(0, 3) = cos(0) = 1<br> | |
| → PE₀ = [0, 1, 0, 1]<br><br> | |
| For position pos = 1:<br> | |
| PE(1, 0) = sin(1/10000⁰) = sin(1) ≈ 0.84<br> | |
| PE(1, 1) = cos(1/10000⁰) = cos(1) ≈ 0.54<br> | |
| PE(1, 2) = sin(1/100) ≈ 0.01<br> | |
| PE(1, 3) = cos(1/100) ≈ 1.00<br> | |
| → PE₁ = [0.84, 0.54, 0.01, 1.00] | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">๐ Key Property: Relative Positions</div> | |
| PE(pos + k) can be expressed as a <strong>linear function</strong> of PE(pos)!<br> | |
| This allows the model to easily learn relative position attention.<br> | |
| sin(a+b) = sin(a)cos(b) + cos(a)sin(b) ✓ | |
| </div> | |
| <h3>4. Feed-Forward Network</h3> | |
| <div class="formula"> | |
| FFN(x) = max(0, xW₁ + b₁)W₂ + b₂<br><br> | |
| W₁ ∈ ℝ<sup>d_model×d_ff</sup> = ℝ<sup>512×2048</sup> (expand 4×)<br> | |
| W₂ ∈ ℝ<sup>d_ff×d_model</sup> = ℝ<sup>2048×512</sup> (project back)<br><br> | |
| This is a 2-layer MLP with ReLU, applied to each position independently. | |
| </div> | |
| <h3>5. Layer Normalization</h3> | |
| <div class="formula"> | |
| LayerNorm(x) = γ ⊙ (x - μ) / √(σ² + ε) + β<br><br> | |
| μ = (1/d) Σᵢ xᵢ (mean across features)<br> | |
| σ² = (1/d) Σᵢ (xᵢ - μ)² (variance)<br> | |
| γ, β: learnable scale and shift (per feature) | |
| </div> | |
| <h3>6. Training Details from Paper</h3> | |
| <div class="info-box"> | |
| <div class="box-title">โ๏ธ Optimization Settings</div> | |
| <div class="box-content"> | |
| <strong>Optimizer:</strong> Adam with β₁=0.9, β₂=0.98, ε=10⁻⁹<br> | |
| <strong>Learning Rate Schedule:</strong><br> | |
| lr = d_model⁻⁰·⁵ × min(step⁻⁰·⁵, step × warmup_steps⁻¹·⁵)<br> | |
| warmup_steps = 4000<br><br> | |
| <strong>Regularization:</strong><br> | |
| • Dropout = 0.1 (on sublayers and embeddings)<br> | |
| • Label Smoothing = 0.1 | |
| </div> | |
| </div> | |
| <h3>7. Complexity Analysis</h3> | |
| <table> | |
| <tr> | |
| <th>Layer Type</th> | |
| <th>Complexity per Layer</th> | |
| <th>Sequential Ops</th> | |
| <th>Max Path Length</th> | |
| </tr> | |
| <tr> | |
| <td>Self-Attention</td> | |
| <td>O(n² · d)</td> | |
| <td>O(1)</td> | |
| <td>O(1)</td> | |
| </tr> | |
| <tr> | |
| <td>Recurrent (RNN)</td> | |
| <td>O(n · d²)</td> | |
| <td>O(n)</td> | |
| <td>O(n)</td> | |
| </tr> | |
| <tr> | |
| <td>Convolutional</td> | |
| <td>O(k · n · d²)</td> | |
| <td>O(1)</td> | |
| <td>O(log<sub>k</sub>(n))</td> | |
| </tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ฏ Key Insight from the Paper</div> | |
| Self-attention is the only architecture with <strong>O(1) path length</strong><br> | |
| between any two positions! This is why it handles long-range<br> | |
| dependencies so well. The tradeoff is O(n²) memory. | |
| </div> | |
| `, | |
| visualization: ` | |
| <h3>๐จ Interactive Transformer Visualization</h3> | |
| <p>Explore how attention mechanisms work in the Transformer architecture.</p> | |
| <div class="viz-container"> | |
| <canvas id="transformerViz" width="800" height="600"></canvas> | |
| </div> | |
| <div class="viz-controls"> | |
| <button class="btn-viz" onclick="visualizeAttention()">Show Attention Weights</button> | |
| <button class="btn-viz" onclick="visualizeMultiHead()">Multi-Head Attention</button> | |
| <button class="btn-viz" onclick="visualizePositional()">Positional Encoding</button> | |
| <button class="btn-viz" onclick="visualizeArchitecture()">Full Architecture</button> | |
| </div> | |
| <h3>Code: Self-Attention in PyTorch</h3> | |
| <div class="formula" style="font-family: 'Courier New', monospace; font-size: 0.9rem; background: #1e1e2e; padding: 20px; border-radius: 8px; overflow-x: auto;"> | |
| <pre style="margin: 0; color: #cdd6f4;"> | |
| <span style="color: #89b4fa;">import</span> torch | |
| <span style="color: #89b4fa;">import</span> torch.nn <span style="color: #89b4fa;">as</span> nn | |
| <span style="color: #89b4fa;">import</span> torch.nn.functional <span style="color: #89b4fa;">as</span> F | |
| <span style="color: #89b4fa;">import</span> math | |
| <span style="color: #89b4fa;">class</span> <span style="color: #f9e2af;">ScaledDotProductAttention</span>(nn.Module): | |
| <span style="color: #a6e3a1;">\"\"\"Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V\"\"\"</span> | |
| <span style="color: #89b4fa;">def</span> <span style="color: #f9e2af;">forward</span>(self, Q, K, V, mask=<span style="color: #fab387;">None</span>): | |
| d_k = Q.size(-1) | |
| <span style="color: #6c7086;"># Step 1: Compute attention scores</span> | |
| scores = torch.matmul(Q, K.transpose(-2, -1)) | |
| <span style="color: #6c7086;"># Step 2: Scale by sqrt(d_k)</span> | |
| scores = scores / math.sqrt(d_k) | |
| <span style="color: #6c7086;"># Step 3: Apply mask (for decoder)</span> | |
| <span style="color: #89b4fa;">if</span> mask <span style="color: #89b4fa;">is not</span> <span style="color: #fab387;">None</span>: | |
| scores = scores.masked_fill(mask == <span style="color: #fab387;">0</span>, <span style="color: #fab387;">-1e9</span>) | |
| <span style="color: #6c7086;"># Step 4: Apply softmax</span> | |
| attention_weights = F.softmax(scores, dim=-1) | |
| <span style="color: #6c7086;"># Step 5: Weighted sum of values</span> | |
| output = torch.matmul(attention_weights, V) | |
| <span style="color: #89b4fa;">return</span> output, attention_weights | |
| <span style="color: #89b4fa;">class</span> <span style="color: #f9e2af;">MultiHeadAttention</span>(nn.Module): | |
| <span style="color: #a6e3a1;">\"\"\"Multi-Head Attention from 'Attention Is All You Need'\"\"\"</span> | |
| <span style="color: #89b4fa;">def</span> <span style="color: #f9e2af;">__init__</span>(self, d_model=<span style="color: #fab387;">512</span>, n_heads=<span style="color: #fab387;">8</span>): | |
| <span style="color: #89b4fa;">super</span>().__init__() | |
| self.d_model = d_model | |
| self.n_heads = n_heads | |
| self.d_k = d_model // n_heads <span style="color: #6c7086;"># 64</span> | |
| <span style="color: #6c7086;"># Linear projections for Q, K, V</span> | |
| self.W_Q = nn.Linear(d_model, d_model) | |
| self.W_K = nn.Linear(d_model, d_model) | |
| self.W_V = nn.Linear(d_model, d_model) | |
| self.W_O = nn.Linear(d_model, d_model) | |
| self.attention = ScaledDotProductAttention() | |
| <span style="color: #89b4fa;">def</span> <span style="color: #f9e2af;">forward</span>(self, Q, K, V, mask=<span style="color: #fab387;">None</span>): | |
| batch_size = Q.size(<span style="color: #fab387;">0</span>) | |
| <span style="color: #6c7086;"># 1. Linear projection and split into h heads</span> | |
| Q = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) | |
| K = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) | |
| V = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2) | |
| <span style="color: #6c7086;"># 2. Apply attention to all heads in parallel</span> | |
| x, attn = self.attention(Q, K, V, mask) | |
| <span style="color: #6c7086;"># 3. Concat and final linear projection</span> | |
| x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model) | |
| <span style="color: #89b4fa;">return</span> self.W_O(x), attn | |
| <span style="color: #89b4fa;">class</span> <span style="color: #f9e2af;">PositionalEncoding</span>(nn.Module): | |
| <span style="color: #a6e3a1;">\"\"\"Sinusoidal Positional Encoding from the paper\"\"\"</span> | |
| <span style="color: #89b4fa;">def</span> <span style="color: #f9e2af;">__init__</span>(self, d_model=<span style="color: #fab387;">512</span>, max_len=<span style="color: #fab387;">5000</span>): | |
| <span style="color: #89b4fa;">super</span>().__init__() | |
| pe = torch.zeros(max_len, d_model) | |
| position = torch.arange(<span style="color: #fab387;">0</span>, max_len, dtype=torch.float).unsqueeze(<span style="color: #fab387;">1</span>) | |
| div_term = torch.exp(torch.arange(<span style="color: #fab387;">0</span>, d_model, <span style="color: #fab387;">2</span>).float() * (-math.log(<span style="color: #fab387;">10000.0</span>) / d_model)) | |
| pe[:, <span style="color: #fab387;">0</span>::<span style="color: #fab387;">2</span>] = torch.sin(position * div_term) | |
| pe[:, <span style="color: #fab387;">1</span>::<span style="color: #fab387;">2</span>] = torch.cos(position * div_term) | |
| pe = pe.unsqueeze(<span style="color: #fab387;">0</span>) <span style="color: #6c7086;"># [1, max_len, d_model]</span> | |
| self.register_buffer(<span style="color: #a6e3a1;">'pe'</span>, pe) | |
| <span style="color: #89b4fa;">def</span> <span style="color: #f9e2af;">forward</span>(self, x): | |
| <span style="color: #89b4fa;">return</span> x + self.pe[:, :x.size(<span style="color: #fab387;">1</span>)] | |
| <span style="color: #6c7086;"># Example usage:</span> | |
| batch_size, seq_len, d_model = <span style="color: #fab387;">2</span>, <span style="color: #fab387;">10</span>, <span style="color: #fab387;">512</span> | |
| x = torch.randn(batch_size, seq_len, d_model) | |
| mha = MultiHeadAttention(d_model=<span style="color: #fab387;">512</span>, n_heads=<span style="color: #fab387;">8</span>) | |
| pe = PositionalEncoding(d_model=<span style="color: #fab387;">512</span>) | |
| x = pe(x) <span style="color: #6c7086;"># Add positional encoding</span> | |
| output, attn_weights = mha(x, x, x) <span style="color: #6c7086;"># Self-attention</span> | |
| <span style="color: #89b4fa;">print</span>(f<span style="color: #a6e3a1;">"Output shape: {output.shape}"</span>) <span style="color: #6c7086;"># [2, 10, 512]</span> | |
| <span style="color: #89b4fa;">print</span>(f<span style="color: #a6e3a1;">"Attention shape: {attn_weights.shape}"</span>) <span style="color: #6c7086;"># [2, 8, 10, 10]</span> | |
| </pre> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ Run This Code!</div> | |
| Copy the code above and run it in a Jupyter notebook or Google Colab.<br> | |
| Experiment with different d_model, n_heads, and seq_len values! | |
| </div> | |
| ` | |
| }, | |
| "perceptron": { | |
| overview: ` | |
| <h3>What is a Perceptron?</h3> | |
| <p>The perceptron is the simplest neural network, invented in 1958. It's a binary linear classifier that makes predictions based on weighted inputs.</p> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Advantages</div> | |
| โข Simple and fast<br> | |
| โข Guaranteed convergence for linearly separable data<br> | |
| โข Interpretable weights | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Key Limitation</div> | |
| <strong>Cannot solve XOR:</strong> Limited to linear decision boundaries only | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>How Perceptron Works</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Weighted Sum:</strong> z = wโxโ + wโxโ + ... + b</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Step Function:</strong> Output = 1 if z โฅ 0, else 0</div> | |
| </div> | |
| <div class="formula"> | |
| Learning Rule: w_new = w_old + ฮฑ(y_true - y_pred)x | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Perceptron Learning Algorithm</h3> | |
| <p>The perceptron update rule is the simplest form of gradient descent.</p> | |
| <div class="formula"> | |
| For each misclassified sample (x, y):<br> | |
| w โ w + ฮฑ ร y ร x<br> | |
| b โ b + ฮฑ ร y | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Manual Training</div> | |
| <strong>Data:</strong> xโ = [1, 1], yโ = 1 | xโ = [0, 0], yโ = 0<br> | |
| <strong>Initial:</strong> w = [0, 0], b = 0, ฮฑ = 1<br> | |
| <br> | |
| <strong>Iteration 1 (xโ):</strong><br> | |
| z = 0ร1 + 0ร1 + 0 = 0 โ ลท = 1 โ (correct!)<br> | |
| <br> | |
| <strong>Iteration 2 (xโ):</strong><br> | |
| z = 0ร0 + 0ร0 + 0 = 0 โ ลท = 1 โ (wrong! y=0)<br> | |
| Update: w = [0,0] + 1ร(0-1)ร[0,0] = [0,0], b = 0 + 1ร(0-1) = -1<br> | |
| <br> | |
| Now z(xโ) = 0 + 0 - 1 = -1 โ ลท = 0 โ | |
| </div> | |
| <h3>Convergence Theorem</h3> | |
| <div class="formula"> | |
| If data is linearly separable with margin ฮณ and ||x|| โค R,<br> | |
| perceptron converges in at most (R/ฮณ)ยฒ updates. | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Educational</div> | |
| <div class="box-content"> | |
| Historical importance - first trainable neural model. Perfect for teaching ML fundamentals | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ Simple Classification</div> | |
| <div class="box-content"> | |
| Linearly separable problems: basic pattern recognition, simple binary decisions | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "mlp": { | |
| overview: ` | |
| <h3>Multi-Layer Perceptron (MLP)</h3> | |
| <p>MLP adds hidden layers between input and output, enabling non-linear decision boundaries and solving the XOR problem that single perceptrons cannot.</p> | |
| <h3>Why MLPs?</h3> | |
| <ul> | |
| <li><strong>Universal Approximation:</strong> Can approximate any continuous function</li> | |
| <li><strong>Non-Linear Learning:</strong> Solves complex problems</li> | |
| <li><strong>Feature Extraction:</strong> Hidden layers learn hierarchical features</li> | |
| </ul> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ก The XOR Breakthrough</div> | |
| Single perceptron: Cannot solve XOR<br> | |
| MLP with 1 hidden layer (2 neurons): Solves XOR!<br> | |
| This proves the power of depth. | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Architecture Components</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Input Layer:</strong> Raw features (no computation)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Hidden Layers:</strong> Extract progressively abstract features</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Output Layer:</strong> Final predictions</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Tabular Data</div> | |
| <div class="box-content">Credit scoring, fraud detection, customer churn, sales forecasting</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ญ Manufacturing</div> | |
| <div class="box-content">Quality control, predictive maintenance, demand forecasting</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Neural Network Forward Pass (Matrix Form)</h3> | |
| <p>Vectorization is key to modern deep learning. We process entire layers as matrix multiplications.</p> | |
| <div class="formula"> | |
| Layer 1: zโฝยนโพ = Wโฝยนโพx + bโฝยนโพ | aโฝยนโพ = ฯ(zโฝยนโพ)<br> | |
| Layer 2: zโฝยฒโพ = Wโฝยฒโพaโฝยนโพ + bโฝยฒโพ | aโฝยฒโพ = ฯ(zโฝยฒโพ)<br> | |
| ...<br> | |
| Layer L: ลท = Softmax(Wโฝแดธโพaโฝแดธโปยนโพ + bโฝแดธโพ) | |
| </div> | |
| <h3>Paper & Pain: Dimensionality Audit</h3> | |
| <p>Understanding tensor shapes is the #1 skill for debugging neural networks.</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Input x:</strong> [n_features, 1]</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Weights Wโฝยนโพ:</strong> [n_hidden, n_features]</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Bias bโฝยนโพ:</strong> [n_hidden, 1]</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Solving XOR</div> | |
| Input: [0,1], Target: 1<br> | |
| Layer 1 (2 neurons):<br> | |
| zโ = 10xโ + 10xโ - 5 | aโ = ฯ(zโ)<br> | |
| zโ = 10xโ + 10xโ - 15 | aโ = ฯ(zโ)<br> | |
| Layer 2 (1 neuron):<br> | |
| y = ฯ(20aโ - 20aโ - 10)<br> | |
| <strong>Try it on paper!</strong> This specific configuration correctly outputs XOR values. | |
| </div> | |
| ` | |
| }, | |
| "weight-init": { | |
| overview: ` | |
| <h3>Weight Initialization Strategies</h3> | |
| <table> | |
| <tr> | |
| <th>Method</th> | |
| <th>Best For</th> | |
| <th>Formula</th> | |
| </tr> | |
| <tr> | |
| <td>Xavier/Glorot</td> | |
| <td>Sigmoid, Tanh</td> | |
| <td>N(0, โ(2/(n_in+n_out)))</td> | |
| </tr> | |
| <tr> | |
| <td>He/Kaiming</td> | |
| <td>ReLU</td> | |
| <td>N(0, โ(2/n_in))</td> | |
| </tr> | |
| </table> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Never Initialize to Zero!</div> | |
| All neurons learn identical features (symmetry problem) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Key Principles</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Variance Preservation:</strong> Keep activation variance similar across layers</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Symmetry Breaking:</strong> Different weights force different features</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Critical for Deep Networks</div> | |
| <div class="box-content"> | |
| Proper initialization is essential for training networks >10 layers. Wrong init = training failure | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">โก Faster Convergence</div> | |
| <div class="box-content"> | |
| Good initialization reduces training time by 2-10ร, especially with modern optimizers | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Variance Preservation Principle</h3> | |
| <p>To prevent gradients from vanishing or exploding, we want the variance of the activations to remain constant across layers.</p> | |
| <div class="formula"> | |
| For a linear layer: y = ฮฃ wแตขxแตข<br> | |
| Var(y) = Var(ฮฃ wแตขxแตข) = ฮฃ Var(wแตขxแตข)<br> | |
| Assuming w and x are independent with mean 0:<br> | |
| Var(wแตขxแตข) = E[wแตขยฒ]E[xแตขยฒ] - E[wแตข]ยฒE[xแตข]ยฒ = Var(wแตข)Var(xแตข)<br> | |
| So, Var(y) = n_in ร Var(w) ร Var(x) | |
| </div> | |
| <h3>1. Xavier (Glorot) Initialization</h3> | |
| <p>Goal: Var(y) = Var(x) and Var(grad_out) = Var(grad_in)</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Forward Pass:</strong> n_in ร Var(w) = 1 โ Var(w) = 1/n_in</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Backward Pass:</strong> n_out ร Var(w) = 1 โ Var(w) = 1/n_out</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Compromise:</strong> Var(w) = 2 / (n_in + n_out)</div> | |
| </div> | |
| <h3>2. He (Kaiming) Initialization</h3> | |
| <p>For ReLU activation, half the neurons are inactive (output 0), which halves the variance. We must compensate.</p> | |
| <div class="formula"> | |
| Var(ReLU(y)) = 1/2 ร Var(y)<br> | |
| To keep Var(ReLU(y)) = Var(x):<br> | |
| 1/2 ร n_in ร Var(w) = 1<br> | |
| <strong>Var(w) = 2 / n_in</strong> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain Calculation</div> | |
| If n_in = 256 and you use ReLU:<br> | |
| Weight Std Dev = โ(2/256) = โ(1/128) โ <strong>0.088</strong><br> | |
| Initializing with std=1.0 or std=0.01 would cause immediate failure in a deep net! | |
| </div> | |
| ` | |
| }, | |
| "loss": { | |
| overview: ` | |
| <h3>Loss Functions Guide</h3> | |
| <table> | |
| <tr> | |
| <th>Task</th> | |
| <th>Loss Function</th> | |
| </tr> | |
| <tr> | |
| <td>Binary Classification</td> | |
| <td>Binary Cross-Entropy</td> | |
| </tr> | |
| <tr> | |
| <td>Multi-class</td> | |
| <td>Categorical Cross-Entropy</td> | |
| </tr> | |
| <tr> | |
| <td>Regression</td> | |
| <td>MSE or MAE</td> | |
| </tr> | |
| </table> | |
| `, | |
| concepts: ` | |
| <h3>Common Loss Functions</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>MSE:</strong> (1/n)ฮฃ(y - ลท)ยฒ - Penalizes large errors, sensitive to outliers</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>MAE:</strong> (1/n)ฮฃ|y - ลท| - Robust to outliers, constant gradient, slower convergence</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Huber Loss:</strong> MSE when |error| โค ฮด, MAE otherwise. Best of both โ smooth + robust to outliers</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>BCE (Binary Cross-Entropy):</strong> -[yยทlog(ลท) + (1-y)ยทlog(1-ลท)] - Used with Sigmoid</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div><strong>CCE (Categorical Cross-Entropy):</strong> -ฮฃ yยทlog(ลท) - Used with Softmax for multi-class</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">06</div> | |
| <div><strong>Hinge Loss:</strong> max(0, 1 - yยทลท) where y โ {-1, +1} - Margin-based, SVM-style</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Task-Dependent Selection</div> | |
| <div class="box-content"> | |
| Every ML task needs appropriate loss: classification (cross-entropy), regression (MSE/MAE), ranking (triplet loss) | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Custom Losses</div> | |
| <div class="box-content"> | |
| Business-specific objectives: Focal Loss (imbalanced data), Dice Loss (segmentation), Contrastive Loss (similarity learning) | |
| </div> | |
| </div> | |
| <h3>Loss Function Comparison</h3> | |
| <table> | |
| <tr><th>Loss</th><th>Type</th><th>Outlier Sensitivity</th><th>Key Property</th></tr> | |
| <tr><td>MSE</td><td>Regression</td><td>High</td><td>Penalizes large errors heavily</td></tr> | |
| <tr><td>MAE</td><td>Regression</td><td>Low</td><td>Robust, constant gradient</td></tr> | |
| <tr><td>Huber</td><td>Regression</td><td>Medium</td><td>Smooth + robust (MSE+MAE combo)</td></tr> | |
| <tr><td>BCE</td><td>Binary Class.</td><td>High</td><td>Strong gradients for wrong predictions</td></tr> | |
| <tr><td>CCE</td><td>Multi-class</td><td>High</td><td>Outputs probabilities via Softmax</td></tr> | |
| <tr><td>Hinge</td><td>Binary Class.</td><td>Medium</td><td>Margin-based, less probabilistic</td></tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ค Probable Interview Questions</div> | |
| 1. Difference between MSE and MAE?<br> | |
| 2. Why Huber loss is preferred sometimes?<br> | |
| 3. Why BCE with sigmoid?<br> | |
| 4. Why softmax with CCE?<br> | |
| 5. Why can't we use MSE for classification?<br> | |
| 6. What is Hinge loss and where is it used?<br> | |
| 7. Difference between loss function and evaluation metric?<br> | |
| 8. How does loss choice affect gradients?<br> | |
| 9. What is Focal Loss and when to use it?<br> | |
| 10. Can we design custom loss functions? | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Binary Cross-Entropy (BCE) Derivation</h3> | |
| <p>Why do we use logs? BCE is derived from Maximum Likelihood Estimation (MLE) assuming a Bernoulli distribution.</p> | |
| <div class="formula"> | |
| L(ลท, y) = -(y log(ลท) + (1-y) log(1-ลท)) | |
| </div> | |
| <h3>Huber Loss (Smooth MAE)</h3> | |
| <p>Combines MSE for small errors and MAE for large errors using threshold ฮด:</p> | |
| <div class="formula"> | |
| L = ยฝ(y - ลท)ยฒ when |y - ลท| โค ฮด<br> | |
| L = ฮด|y - ลท| - ยฝฮดยฒ otherwise | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Huber Intuition</div> | |
| <strong>Small error (|error| โค ฮด):</strong> Behaves like MSE โ smooth, differentiable<br> | |
| <strong>Large error (|error| > ฮด):</strong> Behaves like MAE โ doesn't blow up for outliers<br><br> | |
| Best of both worlds! Used when data contains mild outliers. | |
| </div> | |
| <h3>Hinge Loss (SVM-style)</h3> | |
| <div class="formula"> | |
| L = (1/n) ฮฃ max(0, 1 - yยทลท) where y โ {-1, +1} | |
| </div> | |
| <p>Margin-based loss: only penalizes predictions within the margin boundary. Used in SVMs and some neural network classifiers.</p> | |
| <h3>Paper & Pain: Why not MSE for Classification?</h3> | |
| <p>If we use MSE for sigmoid output, the gradient is:</p> | |
| <div class="formula"> | |
| โL/โw = (ลท - y) <strong>ฯ'(z)</strong> x | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ The Saturation Problem</div> | |
| If the model is very wrong (e.g., target 1, output 0.001), ฯ'(z) is near 0. <br> | |
| The gradient vanishes, and the model <strong>stops learning!</strong>. | |
| </div> | |
| <h3>The BCE Advantage</h3> | |
| <p>When using BCE, the ฯ'(z) term cancels out! The gradient becomes:</p> | |
| <div class="formula" style="font-size: 1.2rem; color: #00d4ff;"> | |
| โL/โw = (ลท - y) x | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">๐ก</div> | |
| <div>This is beautiful: the gradient depends <strong>only on the error</strong> (ลท-y), not on how saturated the neuron is. This enables much faster training.</div> | |
| </div> | |
| ` | |
| }, | |
| "optimizers": { | |
| overview: ` | |
| <h3>Optimizer Selection Guide</h3> | |
| <table> | |
| <tr> | |
| <th>Optimizer</th> | |
| <th>When to Use</th> | |
| </tr> | |
| <tr> | |
| <td>Adam/AdamW</td> | |
| <td><strong>Default choice</strong> - works 90% of time</td> | |
| </tr> | |
| <tr> | |
| <td>SGD + Momentum</td> | |
| <td>CNNs (better final accuracy with patience)</td> | |
| </tr> | |
| <tr> | |
| <td>RMSprop</td> | |
| <td>RNNs</td> | |
| </tr> | |
| </table> | |
| <div class="formula"> | |
| Adam: m_t = ฮฒโยทm + (1-ฮฒโ)ยทโL<br> | |
| v_t = ฮฒโยทv + (1-ฮฒโ)ยท(โL)ยฒ<br> | |
| w = w - ฮฑยทm_t/โ(v_t) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Optimizer Evolution</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>SGD:</strong> Simple but requires careful learning rate tuning</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Adam:</strong> Adaptive rates + momentum = works out-of-box</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Training Acceleration</div> | |
| <div class="box-content"> | |
| Modern optimizers (Adam) reduce training time by 5-10ร compared to basic SGD | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Architecture-Specific</div> | |
| <div class="box-content"> | |
| CNNs: SGD+Momentum | Transformers: AdamW | RNNs: RMSprop | Default: Adam | |
| </div> | |
| </div> | |
| <h3>Optimizer Comparison</h3> | |
| <table> | |
| <tr><th>Optimizer</th><th>Key Idea</th><th>Problem</th></tr> | |
| <tr><td>SGD</td><td>Simple, fast</td><td>Noisy convergence</td></tr> | |
| <tr><td>Momentum</td><td>Smooths updates</td><td>Needs tuning</td></tr> | |
| <tr><td>AdaGrad</td><td>Adaptive LR</td><td>LR shrinks too much</td></tr> | |
| <tr><td>RMSProp</td><td>Fixes AdaGrad</td><td>No momentum</td></tr> | |
| <tr><td><strong>Adam</strong></td><td><strong>Best of all</strong></td><td>Slightly more computation</td></tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ค Probable Interview Questions</div> | |
| 1. Difference between optimizer and gradient descent?<br> | |
| 2. Why does SGD oscillate?<br> | |
| 3. Why does AdaGrad fail in deep networks?<br> | |
| 4. How does RMSProp fix AdaGrad?<br> | |
| 5. Why is bias correction needed in Adam?<br> | |
| 6. What happens if learning rate is too high?<br> | |
| 7. When would you prefer SGD over Adam?<br> | |
| 8. What is momentum intuitively?<br> | |
| 9. Why is Adam the default choice?<br> | |
| 10. Can Adam overfit? | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Gradient Descent: The Foundation</h3> | |
| <p>All optimizers are variations of gradient descent. The goal: minimize the loss function L(w).</p> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 25px; border-radius: 8px;"> | |
| <strong>w = w - ฮฑ ร โL(w)</strong> | |
| </div> | |
| <h3>1. Stochastic Gradient Descent (SGD)</h3> | |
| <div class="formula"> | |
| w_{t+1} = w_t - ฮฑ ร โL(w_t)<br><br> | |
| Where ฮฑ = learning rate (typically 0.01 - 0.1) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: SGD Step</div> | |
| <strong>Current w:</strong> 2.0, <strong>Gradient:</strong> โL = 0.5, <strong>ฮฑ:</strong> 0.1<br><br> | |
| w_new = 2.0 - 0.1 ร 0.5 = 2.0 - 0.05 = <strong>1.95</strong><br> | |
| The weight moved slightly toward lower loss! | |
| </div> | |
| <h3>2. SGD with Momentum</h3> | |
| <p>Adds a "velocity" term to accelerate through flat regions and dampen oscillations.</p> | |
| <div class="formula"> | |
| v_{t+1} = ฮฒ ร v_t + โL(w_t)<br> | |
| w_{t+1} = w_t - ฮฑ ร v_{t+1}<br><br> | |
| Where ฮฒ = momentum coefficient (typically 0.9) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Momentum Step</div> | |
| <strong>v_t:</strong> 0.3, <strong>โL:</strong> 0.5, <strong>ฮฒ:</strong> 0.9, <strong>ฮฑ:</strong> 0.1<br><br> | |
| v_new = 0.9 ร 0.3 + 0.5 = 0.27 + 0.5 = <strong>0.77</strong><br> | |
| w_new = w - 0.1 ร 0.77 = <strong>larger step!</strong><br> | |
| Momentum accumulates past gradients for faster convergence. | |
| </div> | |
| <h3>3. AdaGrad (Adaptive Gradient)</h3> | |
| <p>Adapts learning rate per-parameter based on how frequently each parameter is updated.</p> | |
| <div class="formula"> | |
| <strong>Accumulated Gradient:</strong><br> | |
| G_t = G_{t-1} + (โL)ยฒ<br><br> | |
| <strong>Update Rule:</strong><br> | |
| w_{t+1} = w_t - ฮท / โ(G_t + ฮต) ร โL<br><br> | |
| Where ฮต = 1e-8 (numerical stability) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: AdaGrad Intuition</div> | |
| <strong>Frequent parameters</strong> โ G_t grows fast โ learning rate shrinks<br> | |
| <strong>Rare parameters</strong> โ G_t stays small โ learning rate stays large<br><br> | |
| <strong>Problem:</strong> G_t only accumulates (never forgets), so learning rate keeps shrinking and training may stop early!<br> | |
| <strong>This is exactly why RMSprop was invented โ</strong> | |
| </div> | |
| <h3>4. RMSprop (Root Mean Square Propagation)</h3> | |
| <p>Fixes AdaGrad's shrinking problem by using a <strong>decaying average</strong> of recent squared gradients instead of summing all.</p> | |
| <div class="formula"> | |
| v_t = ฮฒ ร v_{t-1} + (1-ฮฒ) ร (โL)ยฒ<br> | |
| w_{t+1} = w_t - ฮฑ ร โL / โ(v_t + ฮต)<br><br> | |
| ฮฒ = 0.9, ฮต = 1e-8 (numerical stability) | |
| </div> | |
| <h3>5. Adam (Adaptive Moment Estimation)</h3> | |
| <p>Combines momentum (from SGD) AND adaptive learning rates (from RMSprop). The most popular optimizer.</p> | |
| <div class="formula" style="background: rgba(255, 107, 53, 0.08); padding: 20px; border-radius: 8px;"> | |
| <strong>Step 1 - First Moment (Momentum):</strong><br> | |
| m_t = ฮฒโ ร m_{t-1} + (1-ฮฒโ) ร โL<br><br> | |
| <strong>Step 2 - Second Moment (RMSprop):</strong><br> | |
| v_t = ฮฒโ ร v_{t-1} + (1-ฮฒโ) ร (โL)ยฒ<br><br> | |
| <strong>Step 3 - Bias Correction:</strong><br> | |
| mฬ_t = m_t / (1 - ฮฒโแต)<br> | |
| vฬ_t = v_t / (1 - ฮฒโแต)<br><br> | |
| <strong>Step 4 - Update:</strong><br> | |
| w_{t+1} = w_t - ฮฑ ร mฬ_t / (โvฬ_t + ฮต) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Adam Step-by-Step</div> | |
| <strong>Hyperparameters:</strong> ฮฑ=0.001, ฮฒโ=0.9, ฮฒโ=0.999, ฮต=1e-8<br> | |
| <strong>t=2:</strong> โL = 0.5, mโ = 0.05, vโ = 0.00025<br><br> | |
| mโ = 0.9 ร 0.05 + 0.1 ร 0.5 = 0.045 + 0.05 = 0.095<br> | |
| vโ = 0.999 ร 0.00025 + 0.001 ร 0.25 = 0.000499<br><br> | |
| mฬโ = 0.095 / (1 - 0.81) = 0.095 / 0.19 = 0.50<br> | |
| vฬโ = 0.000499 / (1 - 0.998) = 0.2495<br><br> | |
| ฮw = 0.001 ร 0.50 / (โ0.2495 + 1e-8) โ <strong>0.001</strong> | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Why Bias Correction?</div> | |
| mโ = 0, vโ = 0 initialization biases early estimates toward zero.<br> | |
| Dividing by (1 - ฮฒแต) compensates for this, especially in early training steps. | |
| </div> | |
| ` | |
| }, | |
| "backprop": { | |
| overview: ` | |
| <h3>Forward & Backpropagation</h3> | |
| <p>The neural network training loop consists of two passes: <strong>forward propagation</strong> (compute predictions) and <strong>backpropagation</strong> (compute gradients for updates).</p> | |
| <h3>Forward Propagation</h3> | |
| <p>The process of moving inputs through the network to produce an output:</p> | |
| <div class="formula"> | |
| Input โ Weighted Sum โ Activation โ Output | |
| </div> | |
| <p>This happens: for every batch, in every epoch, before computing loss.</p> | |
| <h3>Training Pipeline</h3> | |
| <table> | |
| <tr><th>Component</th><th>Role</th></tr> | |
| <tr><td>Forward Propagation</td><td>Computes predictions</td></tr> | |
| <tr><td>Loss Function</td><td>Computes error</td></tr> | |
| <tr><td>Backpropagation</td><td>Computes gradients</td></tr> | |
| <tr><td>Gradient Descent</td><td>Updates weights</td></tr> | |
| </table> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Key Distinction</div> | |
| Backpropagation does <strong>NOT</strong> update weights โ it only computes gradients.<br> | |
| <strong>Gradient Descent</strong> (or any optimizer) does the actual weight update! | |
| </div> | |
| <h3>Training Terminologies</h3> | |
| <table> | |
| <tr><th>Term</th><th>Meaning</th><th>Example (1000 samples, batch=100)</th></tr> | |
| <tr><td>Batch</td><td>Subset of data</td><td>100 samples</td></tr> | |
| <tr><td>Batch Size</td><td>Samples per batch</td><td>100</td></tr> | |
| <tr><td>Steps per Epoch</td><td>Total / Batch Size</td><td>1000/100 = 10</td></tr> | |
| <tr><td>Iteration</td><td>One batch update</td><td>1 step</td></tr> | |
| <tr><td>Epoch</td><td>One full pass of dataset</td><td>10 iterations</td></tr> | |
| </table> | |
| `, | |
| concepts: ` | |
| <div class="formula"> | |
| Chain Rule:<br> | |
| โL/โw = โL/โy ร โy/โz ร โz/โw<br> | |
| <br> | |
| For layer l:<br> | |
| ฮดหก = (W^(l+1))^T ฮด^(l+1) โ ฯ'(z^l)<br> | |
| โL/โW^l = ฮด^l (a^(l-1))^T | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ง Universal Training Method</div> | |
| <div class="box-content"> | |
| Every modern neural network uses backprop - from CNNs to Transformers to GANs | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ง Automatic Differentiation</div> | |
| <div class="box-content"> | |
| PyTorch, TensorFlow implement automatic backprop - you define forward pass, framework does backward | |
| </div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ค Probable Interview Questions</div> | |
| 1. What is the role of bias in a perceptron?<br> | |
| 2. Why can't we use MSE for classification?<br> | |
| 3. Difference between loss function and evaluation metric?<br> | |
| 4. Why is mini-batch GD preferred?<br> | |
| 5. Does backpropagation update weights?<br> | |
| 6. Can gradient descent work without backpropagation?<br> | |
| 7. What happens if learning rate is too high?<br> | |
| 8. How many times does forward propagation occur per epoch?<br> | |
| 9. What happens if we remove bias?<br> | |
| 10. What is the chain rule and why is it essential for backprop? | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The 4 Fundamental Equations of Backprop</h3> | |
| <p>Backpropagation is essentially the chain rule applied iteratively. We define the error signal ฮด = โL/โz.</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Error at Output Layer (L):</strong><br> | |
| ฮดแดธ = โโL โ ฯ'(zแดธ)<br> | |
| <span class="formula-caption">Example for MSE: (aแดธ - y) โ ฯ'(zแดธ)</span></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Error at Layer l (Backwards):</strong><br> | |
| ฮดหก = ((Wหกโบยน)แต ฮดหกโบยน) โ ฯ'(zหก)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Gradient w.r.t Bias:</strong><br> | |
| โL / โbหก = ฮดหก</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Gradient w.r.t Weights:</strong><br> | |
| โL / โWหก = ฮดหก (aหกโปยน)แต</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain Walkthrough</div> | |
| Suppose single neuron: z = wx + b, Loss L = (ฯ(z) - y)ยฒ/2<br> | |
| 1. <strong>Forward:</strong> z=2, a=ฯ(2)โ0.88, y=1, L=0.007<br> | |
| 2. <strong>Backward:</strong><br> | |
| โL/โa = (a-y) = -0.12<br> | |
| โa/โz = ฯ(z)(1-ฯ(z)) = 0.88 * 0.12 = 0.1056<br> | |
| ฮด = โL/โz = -0.12 * 0.1056 = -0.01267<br> | |
| <strong>โL/โw = ฮด * x</strong> | <strong>โL/โb = ฮด</strong> | |
| </div> | |
| ` | |
| }, | |
| "regularization": { | |
| overview: ` | |
| <h3>Regularization Techniques</h3> | |
| <table> | |
| <tr> | |
| <th>Method</th> | |
| <th>How It Works</th> | |
| <th>When to Use</th> | |
| </tr> | |
| <tr> | |
| <td>L2 (Ridge)</td> | |
| <td>Adds ฮปฮฃwยฒ to loss</td> | |
| <td>Keeps all features, reduces magnitude</td> | |
| </tr> | |
| <tr> | |
| <td>L1 (Lasso)</td> | |
| <td>Adds ฮปฮฃ|w| to loss</td> | |
| <td>Feature selection (zeros out weights)</td> | |
| </tr> | |
| <tr> | |
| <td>Dropout</td> | |
| <td>Randomly drops neurons (p=0.5 typical)</td> | |
| <td><strong>Most effective for deep networks</strong></td> | |
| </tr> | |
| <tr> | |
| <td>Early Stopping</td> | |
| <td>Stop when validation loss increases</td> | |
| <td>Prevents overfitting during training</td> | |
| </tr> | |
| <tr> | |
| <td>Data Augmentation</td> | |
| <td>Artificially expand dataset</td> | |
| <td>Computer vision (rotations, flips, crops)</td> | |
| </tr> | |
| </table> | |
| <h3>Weight Initialization</h3> | |
| <p>Proper initialization prevents vanishing/exploding gradients from the very first step.</p> | |
| <table> | |
| <tr><th>Method</th><th>Formula</th><th>Best For</th></tr> | |
| <tr><td>Zero Init</td><td>All w = 0</td><td>โ Never use! Breaks symmetry</td></tr> | |
| <tr><td>Random</td><td>w ~ N(0, 0.01)</td><td>โ ๏ธ Vanishes in deep nets</td></tr> | |
| <tr><td><strong>Xavier (Glorot)</strong></td><td>w ~ N(0, โ(2/(n_in + n_out)))</td><td>โ Sigmoid, Tanh</td></tr> | |
| <tr><td><strong>He (Kaiming)</strong></td><td>w ~ N(0, โ(2/n_in))</td><td>โ ReLU (default)</td></tr> | |
| </table> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Best Practices</div> | |
| <div class="box-content"> | |
| โข Start with Dropout (0.5) for hidden layers<br> | |
| โข Add L2 if still overfitting (ฮป=0.01, 0.001)<br> | |
| โข Always use Early Stopping<br> | |
| โข Data Augmentation for images | |
| </div> | |
| </div> | |
| <h3>Dropout vs Batch Normalization</h3> | |
| <table> | |
| <tr><th>Feature</th><th>Dropout</th><th>Batch Normalization</th></tr> | |
| <tr><td>Purpose</td><td>Regularization</td><td>Faster training + mild regularization</td></tr> | |
| <tr><td>Mechanism</td><td>Randomly drops neurons</td><td>Normalizes layer inputs</td></tr> | |
| <tr><td>Training vs Test</td><td>Different behavior</td><td>Different behavior</td></tr> | |
| <tr><td>Combined?</td><td colspan="2">Yes, use BatchNorm <em>before</em> Dropout</td></tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ค Probable Interview Questions</div> | |
| 1. Why can't we initialize all weights to zero?<br> | |
| 2. Difference between Xavier and He initialization?<br> | |
| 3. What is the vanishing gradient problem?<br> | |
| 4. How does Dropout prevent overfitting?<br> | |
| 5. Can we use Dropout at test time?<br> | |
| 6. Why is He initialization used with ReLU?<br> | |
| 7. What happens if weights are too large initially?<br> | |
| 8. Does Batch Normalization eliminate the need for Dropout?<br> | |
| 9. L1 vs L2 regularization โ when to use each?<br> | |
| 10. What is the exploding gradient problem and how to fix it? | |
| </div> | |
| `, | |
| math: ` | |
| <h3>L2 Regularization (Weight Decay)</h3> | |
| <p>Add a penalty proportional to the squared magnitude of weights.</p> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 25px; border-radius: 8px;"> | |
| <strong>L_total = L_data + ฮป ร ฮฃ wยฒ</strong> | |
| </div> | |
| <h4>Gradient with L2:</h4> | |
| <div class="formula"> | |
| โL_total/โw = โL_data/โw + 2ฮปw<br><br> | |
| Update rule becomes:<br> | |
| w = w - ฮฑ(โL + 2ฮปw) = w(1 - 2ฮฑฮป) - ฮฑโL | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: L2 Effect</div> | |
| <strong>Without L2:</strong> w = 5.0, โL = 0.1, ฮฑ = 0.1<br> | |
| w_new = 5.0 - 0.1 ร 0.1 = 4.99<br><br> | |
| <strong>With L2 (ฮป=0.01):</strong><br> | |
| w_new = 5.0 ร (1 - 2ร0.1ร0.01) - 0.1 ร 0.1<br> | |
| w_new = 5.0 ร 0.998 - 0.01 = 4.99 - 0.01 = <strong>4.98</strong><br><br> | |
| The weight shrinks faster! Large weights shrink most. | |
| </div> | |
| <h3>L1 Regularization (Lasso)</h3> | |
| <p>Adds penalty proportional to absolute value of weights - encourages sparsity.</p> | |
| <div class="formula"> | |
| L_total = L_data + ฮป ร ฮฃ |w|<br><br> | |
| Gradient: โL/โw = โL_data + ฮป ร sign(w) | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก L1 vs L2</div> | |
| โข <strong>L1:</strong> Creates sparse weights (many zeros) โ Feature selection<br> | |
| โข <strong>L2:</strong> Small but non-zero weights โ More stable<br> | |
| โข <strong>Elastic Net:</strong> ฮปโ|w| + ฮปโwยฒ (both!) | |
| </div> | |
| <h3>Dropout Mathematics</h3> | |
| <p>Randomly set neurons to zero with probability p during training.</p> | |
| <div class="formula"> | |
| <strong>Training:</strong><br> | |
| r ~ Bernoulli(1-p) [mask of 0s and 1s]<br> | |
| ã = a ⊙ r [element-wise multiply]<br><br> | |
| <strong>Inference (scaling):</strong><br> | |
| ã = a × (1-p) [scale by keep probability] | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Dropout Example</div> | |
| <strong>Layer output (4 neurons):</strong> a = [1.0, 2.0, 0.5, 1.5]<br> | |
| <strong>Dropout p = 0.5:</strong> r = [1, 0, 1, 0] (random mask)<br><br> | |
| <strong>Training output:</strong> รฃ = [1.0, 0, 0.5, 0]<br> | |
| <strong>Inference output:</strong> รฃ = [0.5, 1.0, 0.25, 0.75]<br><br> | |
| During inference, we scale by (1-p)=0.5 to maintain expected value! | |
| </div> | |
| <h3>Why Dropout Works</h3> | |
| <div class="formula"> | |
| Dropout ≈ Training an ensemble of 2ⁿ sub-networks<br> | |
| (where n = number of neurons that can be dropped)<br><br> | |
| Each forward pass is a different architecture! | |
| </div> | |
| <h3>Weight Initialization Mathematics</h3> | |
| <h4>Xavier Initialization (for Sigmoid/Tanh)</h4> | |
| <div class="formula"> | |
| w ~ N(0, σ²) where σ² = 2 / (n_in + n_out)<br><br> | |
| Goal: Keep Var(output) ≈ Var(input) across layers | |
| </div> | |
| <h4>He Initialization (for ReLU)</h4> | |
| <div class="formula"> | |
| w ~ N(0, σ²) where σ² = 2 / n_in<br><br> | |
| ReLU zeros out ~50% of activations, so variance is halved → multiply by 2 to compensate! | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Why Zero Init Fails</div> | |
| If all weights = 0, every neuron computes the <strong>same output</strong>.<br> | |
| All gradients are <strong>identical</strong> โ All weights update the same way.<br> | |
| Result: All neurons stay identical forever! The network is as good as <strong>1 neuron</strong>.<br><br> | |
| <strong>Random Init:</strong> w ~ N(0, 0.01) works for shallow networks but gradients shrink exponentially in deep ones.<br> | |
| <strong>Xavier:</strong> Calibrates variance based on layer width โ stable gradients for Sigmoid/Tanh.<br> | |
| <strong>He:</strong> Accounts for ReLU zeroing out negative half โ default for modern networks. | |
| </div> | |
| ` | |
| }, | |
| "batch-norm": { | |
| overview: ` | |
| <h3>Batch Normalization</h3> | |
| <p>Normalizes layer inputs to have mean=0 and variance=1, stabilizing and accelerating training.</p> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Benefits</div> | |
| โข <strong>Faster Training:</strong> Allows higher learning rates<br> | |
| โข <strong>Reduces Vanishing Gradients:</strong> Better gradient flow<br> | |
| โข <strong>Regularization Effect:</strong> Adds slight noise<br> | |
| โข <strong>Less Sensitive to Init:</strong> Reduces initialization impact | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The 4 Steps of Batch Normalization</h3> | |
| <p>Calculated per mini-batch B = {xโ, ..., xโ}:</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Mini-Batch Mean:</strong> ฮผ_B = (1/m) ฮฃ xแตข</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Mini-Batch Variance:</strong> ฯยฒ_B = (1/m) ฮฃ (xแตข - ฮผ_B)ยฒ</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Normalize:</strong> xฬแตข = (xแตข - ฮผ_B) / โ(ฯยฒ_B + ฮต)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Scale and Shift:</strong> yแตข = ฮณ xฬแตข + ฮฒ</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Why ฮณ and ฮฒ?</div> | |
| If we only normalized to (0,1), we might restrict the representation power of the network. <br> | |
| ฮณ and ฮฒ allow the network to <strong>undo</strong> the normalization if that's optimal: <br> | |
| If ฮณ = โ(ฯยฒ) and ฮฒ = ฮผ, we get the original data back! | |
| </div> | |
| ` | |
| }, | |
| "cv-intro": { | |
| overview: ` | |
| <h3>Why Computer Vision Needs Special Architectures</h3> | |
| <p><strong>Problem:</strong> Images have huge dimensionality</p> | |
| <ul> | |
| <li>224ร224 RGB image = 150,528 input features</li> | |
| <li>Fully connected layer with 1000 neurons = 150M parameters!</li> | |
| <li>Result: Overfitting, slow training, memory issues</li> | |
| </ul> | |
| <h3>Solution: Convolutional Neural Networks</h3> | |
| <ul> | |
| <li><strong>Weight Sharing:</strong> Same filter applied everywhere (1000x fewer parameters)</li> | |
| <li><strong>Local Connectivity:</strong> Neurons see small patches</li> | |
| <li><strong>Translation Invariance:</strong> Detect cat anywhere in image</li> | |
| </ul> | |
| `, | |
| concepts: ` | |
| <h3>Why CNNs Beat Fully Connected</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Parameter Efficiency:</strong> 1000ร fewer parameters through weight sharing</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Translation Equivariance:</strong> Same object โ same activation regardless of position</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ธ Real-World CV</div> | |
| <div class="box-content"> | |
| Face ID, medical imaging (MRI/CT), autonomous drone navigation, manufacturing defect detection, and satellite imagery analysis | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Parameter Explosion Problem</h3> | |
| <p>Why do standard Neural Networks fail on images? Let's calculate the parameters for a small image.</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: MLP vs Images</div> | |
| 1. <strong>Input:</strong> 224 × 224 pixels with 3 color channels (RGB)<br> | |
| 2. <strong>Input Size:</strong> 224 × 224 × 3 = <strong>150,528 features</strong><br> | |
| 3. <strong>Hidden Layer:</strong> Suppose we want just 1000 neurons.<br> | |
| 4. <strong>Matrix size:</strong> [1000, 150528]<br> | |
| 5. <strong>Total Weights:</strong> 1000 × 150528 ≈ <strong>150 Million parameters</strong> for just ONE layer! | |
| </div> | |
| <h3>The CNN Solution: Weight Sharing</h3> | |
| <p>Instead of every neuron looking at every pixel, we use <strong>translation invariance</strong>. If an edge detector works in the top-left, it should work in the bottom-right.</p> | |
| <div class="formula"> | |
| Total Params = (Kernel_H ร Kernel_W ร Input_Channels) ร Num_Filters<br> | |
| <br> | |
| For a 3x3 filter: (3 ร 3 ร 3) ร 64 = <strong>1,728 parameters</strong><br> | |
| Reduction: 150M / 1.7k โ <strong>86,000ร more efficient!</strong> | |
| </div> | |
| ` | |
| }, | |
| "pooling": { | |
| overview: ` | |
| <h3>Pooling Layers</h3> | |
| <p>Pooling reduces spatial dimensions while retaining important information.</p> | |
| <table> | |
| <tr> | |
| <th>Type</th> | |
| <th>Operation</th> | |
| <th>Use Case</th> | |
| </tr> | |
| <tr> | |
| <td>Max Pooling</td> | |
| <td>Take maximum value</td> | |
| <td><strong>Most common</strong> - preserves strong activations</td> | |
| </tr> | |
| <tr> | |
| <td>Average Pooling</td> | |
| <td>Take average</td> | |
| <td>Smoother, less common (used in final layers)</td> | |
| </tr> | |
| <tr> | |
| <td>Global Pooling</td> | |
| <td>Pool entire feature map</td> | |
| <td>Replace FC layers (reduces parameters)</td> | |
| </tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">โ Benefits</div> | |
| โข Reduces spatial size (faster computation)<br> | |
| โข Adds translation invariance<br> | |
| โข Prevents overfitting<br> | |
| โข Typical: 2ร2 window, stride 2 (halves dimensions) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Pooling Mechanics</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Downsampling:</strong> Reduces HรW by pooling factor (typically 2ร)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>No Learnable Parameters:</strong> Fixed operation (max/average)</div> | |
| </div> | |
| <div class="formula"> | |
| Example: 4ร4 input โ 2ร2 max pooling โ 2ร2 output | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ Standard CNN Component</div> | |
| <div class="box-content"> | |
| Used after conv layers in AlexNet, VGG, and most classic CNNs to progressively reduce spatial dimensions | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Max Pooling: Winning Signal Selection</h3> | |
| <p>Pooling operations are non-parametric (no weights). They simply select or average values within a local window.</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: 2x2 Max Pooling</div> | |
| <strong>Input (4×4):</strong><br> | |
| [1 3 | 2 1]<br> | |
| [5 1 | 0 2]<br> | |
| -----------<br> | |
| [1 1 | 8 2]<br> | |
| [0 2 | 4 1]<br> | |
| <br> | |
| <strong>Output (2×2):</strong><br> | |
| Step 1: max(1, 3, 5, 1) = <strong>5</strong><br> | |
| Step 2: max(2, 1, 0, 2) = <strong>2</strong><br> | |
| Step 3: max(1, 1, 0, 2) = <strong>2</strong><br> | |
| Step 4: max(8, 2, 4, 1) = <strong>8</strong><br> | |
| <strong>Final:</strong> [5 2] / [2 8] | |
| </div> | |
| <h3>Backprop through Pooling</h3> | |
| <div class="list-item"> | |
| <div class="list-num">๐ก</div> | |
| <div><strong>Max Pooling:</strong> Gradient is routed ONLY to the neuron that had the maximum value. All others get 0.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">๐ก</div> | |
| <div><strong>Average Pooling:</strong> Gradient is distributed evenly among all neurons in the window.</div> | |
| </div> | |
| ` | |
| }, | |
| "cnn-basics": { | |
| overview: ` | |
| <h3>CNN Architecture Pattern</h3> | |
| <div class="formula"> | |
| Input → [Conv → ReLU → Pool] × N → Flatten → FC → Softmax | |
| </div> | |
| <h3>Typical Layering Strategy</h3> | |
| <ul> | |
| <li><strong>Early Layers:</strong> Detect low-level features (edges, textures) - small filters (3ร3)</li> | |
| <li><strong>Middle Layers:</strong> Combine into patterns, parts - more filters, same size</li> | |
| <li><strong>Deep Layers:</strong> High-level concepts (faces, objects) - many filters</li> | |
| <li><strong>Final FC Layers:</strong> Classification based on learned features</li> | |
| </ul> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ก Filter Progression</div> | |
| Layer 1: 32 filters (edges)<br> | |
| Layer 2: 64 filters (textures)<br> | |
| Layer 3: 128 filters (patterns)<br> | |
| Layer 4: 256 filters (parts)<br> | |
| Common pattern: double filters after each pooling | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Module Design Principles</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Spatial Reduction:</strong> Progressively downsample (224โ112โ56โ28...)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Channel Expansion:</strong> Increase filters as spatial dims decrease</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ All Modern Vision Models</div> | |
| <div class="box-content"> | |
| This pattern forms the backbone of ResNet, MobileNet, EfficientNet - fundamental CNN design | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>1. The Golden Formula for Output Size</h3> | |
| <p>Given Input (W), Filter Size (F), Padding (P), and Stride (S):</p> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0;"> | |
| Output Size = ⌊(W - F + 2P) / S⌋ + 1 | |
| </div> | |
| <h3>2. Parameter Count Calculation</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Parameters PER Filter:</strong> (F ร F ร C_in) + 1 (bias)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Total Parameters:</strong> N_filters ร ((F ร F ร C_in) + 1)</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain Calculation</div> | |
| <strong>Input:</strong> 224x224x3 | <strong>Layer:</strong> 64 filters of 3x3 | <strong>Stride:</strong> 1 | <strong>Padding:</strong> 1<br> | |
| 1. <strong>Output Size:</strong> (224 - 3 + 2(1))/1 + 1 = 224 (Same Padding)<br> | |
| 2. <strong>Params:</strong> 64 * (3 * 3 * 3 + 1) = 64 * 28 = <strong>1,792 parameters</strong><br> | |
| 3. <strong>FLOPs:</strong> 224 * 224 * 1792 โ <strong>90 Million operations</strong> per image! | |
| </div> | |
| ` | |
| }, | |
| "viz-filters": { | |
| overview: ` | |
| <h3>What CNNs Learn</h3> | |
| <p>CNN filters automatically learn hierarchical visual features:</p> | |
| <h3>Layer-by-Layer Visualization</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Layer 1:</strong> Edges and colors (horizontal, vertical, diagonal lines)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Layer 2:</strong> Textures and patterns (corners, curves, simple shapes)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Layer 3:</strong> Object parts (eyes, wheels, windows)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Layer 4-5:</strong> Whole objects (faces, cars, animals)</div> | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Visualization Techniques</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Activation Maximization:</strong> Find input that maximizes filter response</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Grad-CAM:</strong> Highlight important regions for predictions</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Model Interpretability</div> | |
| <div class="box-content"> | |
| Understanding what CNNs learn helps debug failures, build trust, and improve architecture design | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐จ Art & Style Transfer</div> | |
| <div class="box-content"> | |
| Filter visualizations inspired neural style transfer (VGG features) | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Activation Maximization</h3> | |
| <p>Find the input x* that maximizes a specific neuron's activation.</p> | |
| <div class="formula" style="font-size: 1.2rem; text-align: center; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 25px; border-radius: 8px;"> | |
| <strong>x* = argmax_x [a_ij(x) - ฮป||x||ยฒ]</strong> | |
| </div> | |
| <h4>Gradient Ascent on Input:</h4> | |
| <div class="formula"> | |
| x_{t+1} = x_t + ฮฑ ร โa_ij/โx<br><br> | |
| Where:<br> | |
| โข a_ij = activation of neuron (i,j) in layer l<br> | |
| โข ฮฑ = step size<br> | |
| โข ฮป||x||ยฒ = regularization to keep input natural-looking | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Generate What a Filter "Sees"</div> | |
| <strong>Goal:</strong> Find image that maximally activates Conv1 filter #5<br><br> | |
| 1. Start with x = random noise (224ร224ร3)<br> | |
| 2. Forward pass โ get activation aโ at filter 5<br> | |
| 3. Backward pass: โaโ /โx (gradient of activation w.r.t. input)<br> | |
| 4. Update: x = x + 0.01 ร โaโ /โx<br> | |
| 5. Repeat 100-500 times<br><br> | |
| <strong>Result:</strong> Image showing what pattern the filter detects! | |
| </div> | |
| <h3>Grad-CAM (Gradient-weighted Class Activation Mapping)</h3> | |
| <p>Highlight which regions of the image were important for a specific class prediction.</p> | |
| <div class="formula" style="background: rgba(255, 107, 53, 0.08); padding: 20px; border-radius: 8px;"> | |
| <strong>Step 1 - Global Average Pool the Gradients:</strong><br> | |
| αₖ = (1/Z) × Σᵢ Σⱼ (∂yᶜ/∂Aₖⁱʲ)<br><br> | |
| <strong>Step 2 - Weighted Sum of Feature Maps:</strong><br> | |
| L_Grad-CAM = ReLU(Σₖ αₖ × Aₖ)<br><br> | |
| Where:<br> | |
| • yᶜ = score for class c (before softmax)<br> | |
| • Aₖ = k-th feature map of last conv layer<br> | |
| • αₖ = importance weight of feature map k | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Grad-CAM Calculation</div> | |
| <strong>Last conv layer:</strong> 14ร14ร512 feature maps<br> | |
| <strong>Predicted class:</strong> "Dog" (class 5)<br><br> | |
| 1. Get gradient โyโ /โA for all 512 feature maps<br> | |
| 2. Average each gradient map: ฮฑโ = 0.8, ฮฑโ = 0.1, ฮฑโ = 0.5...<br> | |
| 3. Weighted sum: L = 0.8รAโ + 0.1รAโ + 0.5รAโ + ...<br> | |
| 4. Apply ReLU (keep positive contributions only)<br> | |
| 5. Upsample to input size (14ร14 โ 224ร224)<br><br> | |
| <strong>Result:</strong> Heatmap showing dog's face/body highlighted! | |
| </div> | |
| <h3>Saliency Maps (Vanilla Gradient)</h3> | |
| <div class="formula"> | |
| Saliency(x) = |โyแถ/โx|<br><br> | |
| Take absolute value of gradient of class score w.r.t. input pixels. | |
| </div> | |
| ` | |
| }, | |
| "lenet": { | |
| overview: ` | |
| <h3>LeNet-5 (1998) - The Pioneer</h3> | |
| <p>First successful CNN for digit recognition (MNIST). Introduced the Conv โ Pool โ Conv โ Pool pattern still used today.</p> | |
| <h3>Architecture</h3> | |
| <div class="formula"> | |
| Input 32×32 → Conv(6 filters, 5×5) → AvgPool → Conv(16 filters, 5×5) → AvgPool → FC(120) → FC(84) → FC(10) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Historical Impact</div> | |
| โข Used by US Postal Service for zip code recognition<br> | |
| โข Proved CNNs work for real-world tasks<br> | |
| โข Template for modern architectures | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Key Innovations</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Layered Architecture:</strong> Hierarchical feature extraction</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Shared Weights:</strong> Convolutional parameter sharing</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">โ๏ธ Handwriting Recognition</div> | |
| <div class="box-content"> | |
| USPS mail sorting, check processing, form digitization | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Educational Foundation</div> | |
| <div class="box-content"> | |
| Perfect starting point for learning CNNs - simple enough to understand, complex enough to be useful | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>LeNet-5 Complete Dimension Walkthrough</h3> | |
| <p>Follow the tensor shapes through each layer of LeNet-5.</p> | |
| <div class="formula" style="font-size: 1.0rem; margin: 20px 0; background: rgba(0, 212, 255, 0.08); padding: 20px; border-radius: 8px;"> | |
| <strong>Output Size Formula:</strong><br> | |
| W_out = (W_in - K + 2P) / S + 1<br> | |
| Where K=kernel, P=padding, S=stride | |
| </div> | |
| <h4>Layer-by-Layer Calculation:</h4> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div> | |
| <strong>Input:</strong> 32 ร 32 ร 1 (grayscale MNIST)<br> | |
| <span class="formula-caption">Original: 28ร28, padded to 32ร32</span> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div> | |
| <strong>C1 - Conv:</strong> 6 filters, 5ร5, stride 1, no padding<br> | |
| Output: (32 - 5 + 0)/1 + 1 = <strong>28 ร 28 ร 6</strong><br> | |
| Params: (5ร5ร1 + 1) ร 6 = <strong>156</strong> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div> | |
| <strong>S2 - AvgPool:</strong> 2ร2, stride 2<br> | |
| Output: 28/2 = <strong>14 ร 14 ร 6</strong><br> | |
| Params: 0 (or 12 with learnable coefficients) | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div> | |
| <strong>C3 - Conv:</strong> 16 filters, 5ร5<br> | |
| Output: (14 - 5)/1 + 1 = <strong>10 ร 10 ร 16</strong><br> | |
| Params: (5ร5ร6 + 1) ร 16 = <strong>2,416</strong> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div> | |
| <strong>S4 - AvgPool:</strong> 2ร2, stride 2<br> | |
| Output: 10/2 = <strong>5 ร 5 ร 16</strong><br> | |
| Params: 0 | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">06</div> | |
| <div> | |
| <strong>C5 - Conv:</strong> 120 filters, 5ร5<br> | |
| Output: (5 - 5)/1 + 1 = <strong>1 ร 1 ร 120</strong><br> | |
| Params: (5ร5ร16 + 1) ร 120 = <strong>48,120</strong> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">07</div> | |
| <div> | |
| <strong>F6 - FC:</strong> 120 โ 84<br> | |
| Params: 120 ร 84 + 84 = <strong>10,164</strong> | |
| </div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">08</div> | |
| <div> | |
| <strong>Output - FC:</strong> 84 โ 10<br> | |
| Params: 84 ร 10 + 10 = <strong>850</strong> | |
| </div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Total Parameter Count</div> | |
| <strong>Convolutional Layers:</strong><br> | |
| C1: 156 + C3: 2,416 + C5: 48,120 = 50,692<br><br> | |
| <strong>Fully Connected Layers:</strong><br> | |
| F6: 10,164 + Output: 850 = 11,014<br><br> | |
| <strong>Total:</strong> โ <strong>61,706 parameters</strong><br><br> | |
| Compare to AlexNet's 60 million - LeNet is 1000ร smaller! | |
| </div> | |
| <h3>Receptive Field Calculation</h3> | |
| <div class="formula"> | |
| After C1 (5ร5): Each neuron sees 5ร5 pixels<br> | |
| After S2: Sees 6ร6 pixels (pooling expands RF)<br> | |
| After C3: Sees 14ร14 pixels<br> | |
| After S4: Sees 16ร16 pixels<br> | |
| After C5: <strong>Sees entire 32ร32 input!</strong> | |
| </div> | |
| ` | |
| }, | |
| "alexnet": { | |
| overview: ` | |
| <h3>AlexNet (2012) - The Deep Learning Revolution</h3> | |
| <p>Won ImageNet 2012 by huge margin (15.3% vs 26.2% error), igniting the deep learning revolution.</p> | |
| <h3>Key Innovations</h3> | |
| <ul> | |
| <li><strong>ReLU Activation:</strong> Faster training than sigmoid/tanh</li> | |
| <li><strong>Dropout:</strong> Prevents overfitting (p=0.5)</li> | |
| <li><strong>Data Augmentation:</strong> Random crops/flips</li> | |
| <li><strong>GPU Training:</strong> Used 2 GTX580 GPUs</li> | |
| <li><strong>Deep:</strong> 8 layers (5 conv + 3 FC), 60M parameters</li> | |
| </ul> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก Why So Important?</div> | |
| First to show that deeper networks + more data + GPU compute = breakthrough performance | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Technical Contributions</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>ReLU:</strong> Solved vanishing gradients, enabled deeper networks</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Dropout:</strong> First major regularization for deep nets</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฏ ImageNet Challenge</div> | |
| <div class="box-content"> | |
| Shattered records on 1000-class classification, proving deep learning superiority | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Industry Catalyst</div> | |
| <div class="box-content"> | |
| Sparked AI renaissance - Google, Facebook, Microsoft pivoted to deep learning after AlexNet | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Paper & Pain: Parameter Counting</h3> | |
| <p>Understanding AlexNet's 60M parameters:</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Conv Layers:</strong> Only ~2.3 Million parameters. They do most of the work with small memory!</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>FC Layers:</strong> Over <strong>58 Million parameters</strong>. The first FC layer (FC6) takes 4096 × (6×6×256) ≈ 37M params!</div> | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ The Design Flaw</div> | |
| FC layers are the memory bottleneck. Modern models (ResNet, Inception) replace these with Global Average Pooling to save 90% parameters. | |
| </div> | |
| ` | |
| }, | |
| "vgg": { | |
| overview: ` | |
| <h3>VGGNet (2014) - The Power of Depth</h3> | |
| <p>VGG showed that depth matters - 16-19 layers using only small 3ร3 filters.</p> | |
| `, | |
| concepts: ` | |
| <h3>Small Filters, Receptive Field</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Uniformity:</strong> Uses 3ร3 filters everywhere with stride 1, padding 1.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Pooling Pattern:</strong> 2ร2 max pooling after every 2-3 conv layers.</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The 7×7 vs Stacked 3×3 Equivalence</h3> | |
| <p>Why stack 3x3 filters instead of one large filter?</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Paramount Efficiency</div> | |
| 1. <strong>Receptive Field:</strong> Two 3×3 layers cover a 5×5 area. Three 3×3 layers cover a 7×7 area.<br> | |
| 2. <strong>Param Count (C filters):</strong><br> | |
| • One 7×7 layer: 7² × C² = 49C² parameters.<br> | |
| • Three 3×3 layers: 3 × (3² × C²) = 27C² parameters.<br> | |
| <strong>Result:</strong> ~45% reduction in weights for the SAME "view" of the image! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ผ๏ธ Feature Backbone</div> | |
| VGG is the preferred architectural backbone for Neural Style Transfer and early GANs due to its simple, clean feature extraction properties. | |
| </div> | |
| ` | |
| }, | |
| "resnet": { | |
| overview: ` | |
| <h3>ResNet (2015) - Residual Connections</h3> | |
| <p><strong>Problem:</strong> Very deep networks (>20 layers) had degradation - training accuracy got worse!</p> | |
| <h3>Solution: Skip Connections</h3> | |
| <div class="formula"> | |
| Instead of learning H(x), learn residual F(x) = H(x) - x<br> | |
| Output: y = F(x) + x (shortcut connection) | |
| </div> | |
| <h3>Why Skip Connections Work</h3> | |
| <ul> | |
| <li><strong>Gradient Flow:</strong> Gradients flow directly through shortcuts</li> | |
| <li><strong>Identity Mapping:</strong> Easy to learn identity (just set F(x)=0)</li> | |
| <li><strong>Feature Reuse:</strong> Earlier features directly available to later layers</li> | |
| </ul> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ Impact</div> | |
| โข Enabled training of 152-layer networks (even 1000+ layers)<br> | |
| โข Won ImageNet 2015<br> | |
| โข Skip connections now used everywhere (U-Net, Transformers, etc.) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Identity & Projection Shortcuts</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Identity Shortcut:</strong> Used when dimensions match. y = F(x, {W}) + x</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Projection Shortcut (1ร1 Conv):</strong> Used when dimensions change. y = F(x, {W}) + W_s x</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Vanishing Gradient Solution</h3> | |
| <p>Why do skip connections help? Let's differentiate the output y = F(x) + x:</p> | |
| <div class="formula"> | |
| ∂y/∂x = ∂F/∂x + 1 | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Gradient Flow</div> | |
| The "+1" term acts as a <strong>gradient highway</strong>. Even if the weights in F(x) are small (causing ∂F/∂x → 0), the gradient can still flow through the +1 term. <br> | |
| This prevents the gradient from vanishing even in networks with 1000+ layers! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐๏ธ Modern Vision Backbones</div> | |
| <div class="box-content">ResNet is the default starting point for nearly all computer vision tasks today (Mask R-CNN, YOLO, etc.).</div> | |
| </div> | |
| ` | |
| }, | |
| "inception": { | |
| overview: ` | |
| <h3>Inception/GoogLeNet (2014) - Going Wider</h3> | |
| <p>Instead of going deeper, Inception modules go wider - using multiple filter sizes in parallel.</p> | |
| <h3>Inception Module</h3> | |
| <div class="formula"> | |
| Input โ [1ร1 conv] โ [3ร3 conv] โ [5ร5 conv] โ [3ร3 pool] โ Concatenate | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Core Innovations</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>1ร1 Bottlenecks:</strong> Dimensionality reduction before expensive convolutions.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Auxiliary Classifiers:</strong> Used during training to combat gradient vanishing in middle layers.</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>1ร1 Convolution Math (Network-in-Network)</h3> | |
| <p>A 1ร1 convolution acts like a channel-wise MLP. It maps input channels C to output channels C' using 1ร1รC parameters per filter.</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Compression</div> | |
| Input: 28×28×256 | Target: 28×28×512 with 3×3 filters.<br> | |
| <strong>Direct:</strong> 512 × (3×3×256) ≈ 1.1 Million params.<br> | |
| <strong>Inception (1×1 bottleneck to 64):</strong><br> | |
| Step 1 (1×1): 64 × (1×1×256) = 16k params.<br> | |
| Step 2 (3×3): 512 × (3×3×64) = 294k params.<br> | |
| <strong>Total:</strong> 310k params. <strong>~3.5× reduction in parameters!</strong> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐๏ธ Computational Efficiency</div> | |
| Inception designs are optimized for running deep networks on limited compute budgets. | |
| </div> | |
| ` | |
| }, | |
| "mobilenet": { | |
| overview: ` | |
| <h3>MobileNet - CNNs for Mobile Devices</h3> | |
| <p>Designed for mobile/embedded vision using depthwise separable convolutions.</p> | |
| <h3>Depthwise Separable Convolution</h3> | |
| <div class="formula"> | |
| Standard Conv = Depthwise Conv + Pointwise (1ร1) Conv | |
| </div> | |
| <h3>Computation Reduction</h3> | |
| <table> | |
| <tr> | |
| <th>Method</th> | |
| <th>Parameters</th> | |
| <th>FLOPs</th> | |
| </tr> | |
| <tr> | |
| <td>Standard 3ร3 Conv</td> | |
| <td>3ร3รC_inรC_out</td> | |
| <td>High</td> | |
| </tr> | |
| <tr> | |
| <td>Depthwise Separable</td> | |
| <td>3ร3รC_in + C_inรC_out</td> | |
| <td><strong>8-9ร less!</strong></td> | |
| </tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">✅ Applications</div> | |
| • Real-time mobile apps (camera filters, AR)<br> | |
| • Edge devices (drones, IoT)<br> | |
| • Latency-critical systems<br> | |
| • Good accuracy with 10-20× speedup | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Efficiency Factors</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Width Multiplier (α):</strong> Thins the network by reducing channels.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Resolution Multiplier (ρ):</strong> Reduces input image size.</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Depthwise Separable Math</h3> | |
| <p>Standard convolution complexity: F² × C_in × C_out × H × W</p> | |
| <p>Separable complexity: (F² × C_in + C_in × C_out) × H × W</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">📚 Paper & Pain: The 9× Speedup</div> | |
| Reduction ratio is roughly: 1/C_out + 1/F². <br> | |
| For 3x3 filters (F=3): Reduction is roughly **1/9th** the computation of standard conv! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฑ Edge Devices</div> | |
| <div class="box-content">Real-time object detection on smartphones, web browsers (TensorFlow.js), and IoT devices.</div> | |
| </div> | |
| ` | |
| }, | |
| "transfer-learning": { | |
| overview: ` | |
| <h3>Transfer Learning - Don't Train from Scratch!</h3> | |
| <p>Use pre-trained models (ImageNet) as feature extractors for your custom task.</p> | |
| <h3>Two Strategies</h3> | |
| <table> | |
| <tr> | |
| <th>Approach</th> | |
| <th>When to Use</th> | |
| <th>How</th> | |
| </tr> | |
| <tr> | |
| <td>Feature Extraction</td> | |
| <td><strong>Small dataset</strong> (&lt;10K images)</td> | |
| <td>Freeze all layers, train only final FC layer</td> | |
| </tr> | |
| <tr> | |
| <td>Fine-tuning</td> | |
| <td><strong>Medium dataset</strong> (10K-100K)</td> | |
| <td>Freeze early layers, train last few + FC</td> | |
| </tr> | |
| <tr> | |
| <td>Full Training</td> | |
| <td><strong>Large dataset</strong> (>1M images)</td> | |
| <td>Use pre-trained as initialization, train all</td> | |
| </tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">💡 Best Practices</div> | |
| โข Use pre-trained models when dataset < 100K images<br> | |
| โข Start with low learning rate (1e-4) for fine-tuning<br> | |
| โข Popular backbones: ResNet50, EfficientNet, ViT | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Why Transfer Learning Works</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Feature Hierarchy:</strong> Early layers learn universal features (edges, textures) that transfer across domains</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Domain Similarity:</strong> The more similar source and target domains, the better transfer</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Regularization Effect:</strong> Pre-trained weights act as strong priors, preventing overfitting</div> | |
| </div> | |
| <h3>Transfer Learning Quadrant</h3> | |
| <table> | |
| <tr> | |
| <th></th> | |
| <th>Similar Domain</th> | |
| <th>Different Domain</th> | |
| </tr> | |
| <tr> | |
| <td><strong>Large Data</strong></td> | |
| <td>Fine-tune all layers</td> | |
| <td>Fine-tune top layers</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Small Data</strong></td> | |
| <td>Feature extraction</td> | |
| <td>Feature extraction (risky)</td> | |
| </tr> | |
| </table> | |
| `, | |
| math: ` | |
| <h3>Learning Rate Strategies</h3> | |
| <p>Different layers need different learning rates during fine-tuning.</p> | |
| <div class="formula"> | |
| Discriminative Fine-tuning:<br> | |
| lr_layer_n = lr_base × decay^(L-n)<br> | |
| <br> | |
| Where L = total layers, n = layer index<br> | |
| Example: lr_base=1e-3, decay=0.9<br> | |
| Layer 1: 1e-3 × 0.9^9 ≈ 3.9e-4<br> | |
| Layer 10: 1e-3 × 0.9^0 = 1e-3 | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📚 Paper & Pain: Domain Shift</div> | |
| When source and target distributions differ:<br> | |
| • <strong>Covariate Shift:</strong> P(X) changes, P(Y|X) same<br> | |
| • <strong>Label Shift:</strong> P(Y) changes, P(X|Y) same<br> | |
| • <strong>Concept Shift:</strong> P(Y|X) changes<br> | |
| Transfer learning handles covariate shift well but struggles with concept shift. | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Medical Imaging</div> | |
| <div class="box-content"> | |
| Train on ImageNet, fine-tune for X-ray diagnosis with only 1000 labeled images. Achieves 90%+ accuracy vs 60% from scratch. | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Retail & E-commerce</div> | |
| <div class="box-content"> | |
| Product classification, visual search, inventory management using pre-trained ResNet/EfficientNet models. | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Satellite Imagery</div> | |
| <div class="box-content"> | |
| Land use classification, deforestation detection, urban planning using models pre-trained on aerial imagery. | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "localization": { | |
| overview: ` | |
| <h3>Object Localization</h3> | |
| <p>Predict both class and bounding box for a single object in image.</p> | |
| <h3>Multi-Task Loss</h3> | |
| <div class="formula"> | |
| Total Loss = L_classification + λ × L_bbox<br> | |
| <br> | |
| Where:<br> | |
| L_classification = Cross-Entropy<br> | |
| L_bbox = Smooth L1 or IoU loss<br> | |
| λ = balance term (typically 1-10) | |
| </div> | |
| <h3>Bounding Box Representation</h3> | |
| <ul> | |
| <li><strong>Option 1:</strong> (x_min, y_min, x_max, y_max)</li> | |
| <li><strong>Option 2:</strong> (x_center, y_center, width, height) ← Most common</li> | |
| </ul> | |
| `, | |
| concepts: ` | |
| <h3>Localization vs Detection</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Classification:</strong> What is in the image? → "Cat"</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Localization:</strong> Where is the single object? → "Cat at [100, 50, 200, 150]"</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Detection:</strong> Where are ALL objects? → Multiple bounding boxes</div> | |
| </div> | |
| <h3>Network Architecture</h3> | |
| <p>Modify a classification network (ResNet, VGG) by adding a regression head:</p> | |
| <div class="formula"> | |
| CNN Backbone → Feature Map → [Classification Head (1000 classes)]<br> | |
| → [Regression Head (4 coordinates)] | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Smooth L1 Loss (Huber Loss)</h3> | |
| <p>Combines L1 and L2 loss for robust bounding box regression.</p> | |
| <div class="formula"> | |
| SmoothL1(x) = { 0.5x² if |x| < 1<br> | |
| { |x| - 0.5 otherwise | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📚 Paper & Pain: Why Smooth L1?</div> | |
| • <strong>L2 Loss:</strong> Penalizes large errors too much (squared), sensitive to outliers<br> | |
| • <strong>L1 Loss:</strong> Robust to outliers but has discontinuous gradient at 0<br> | |
| • <strong>Smooth L1:</strong> Best of both worlds - quadratic near 0, linear for large errors | |
| </div> | |
| <h3>IoU Loss</h3> | |
| <div class="formula"> | |
| L_IoU = 1 - IoU(pred, target)<br> | |
| Where IoU = Intersection / Union | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Self-Driving Cars</div> | |
| <div class="box-content">Localize the primary vehicle ahead for adaptive cruise control</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ธ Photo Auto-Crop</div> | |
| <div class="box-content">Detect main subject and automatically crop to optimal composition</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Medical Imaging</div> | |
| <div class="box-content">Localize tumors, organs, or anomalies in X-rays and CT scans</div> | |
| </div> | |
| ` | |
| }, | |
| "rcnn": { | |
| overview: ` | |
| <h3>R-CNN Family Evolution</h3> | |
| <table> | |
| <tr> | |
| <th>Model</th> | |
| <th>Year</th> | |
| <th>Speed (FPS)</th> | |
| <th>Key Innovation</th> | |
| </tr> | |
| <tr> | |
| <td>R-CNN</td> | |
| <td>2014</td> | |
| <td>0.05</td> | |
| <td>Selective Search + CNN features</td> | |
| </tr> | |
| <tr> | |
| <td>Fast R-CNN</td> | |
| <td>2015</td> | |
| <td>0.5</td> | |
| <td>RoI Pooling (share conv features)</td> | |
| </tr> | |
| <tr> | |
| <td>Faster R-CNN</td> | |
| <td>2015</td> | |
| <td>7</td> | |
| <td>Region Proposal Network (RPN)</td> | |
| </tr> | |
| <tr> | |
| <td>Mask R-CNN</td> | |
| <td>2017</td> | |
| <td>5</td> | |
| <td>+ Instance Segmentation masks</td> | |
| </tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">💡 When to Use</div> | |
| Faster R-CNN: Best accuracy for detection (not real-time)<br> | |
| Mask R-CNN: Detection + instance segmentation | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Two-Stage Detection Pipeline</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Stage 1 - Region Proposal:</strong> Find ~2000 candidate regions that might contain objects</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Stage 2 - Classification:</strong> Classify each region and refine bounding box</div> | |
| </div> | |
| <h3>Region Proposal Network (RPN)</h3> | |
| <p>The key innovation of Faster R-CNN - learns to propose regions instead of using hand-crafted algorithms.</p> | |
| <div class="formula"> | |
| RPN Output per location:<br> | |
| • k anchor boxes × 4 coordinates = 4k regression outputs<br> | |
| • k anchor boxes × 2 objectness scores = 2k classification outputs<br> | |
| Typical k = 9 (3 scales × 3 aspect ratios) | |
| </div> | |
| `, | |
| math: ` | |
| <h3>RoI Pooling: Fixed-Size Feature Maps</h3> | |
| <p>Convert variable-size regions into fixed 7×7 feature maps for FC layers.</p> | |
| <div class="formula"> | |
| For each RoI of size H×W:<br> | |
| 1. Divide into 7×7 grid (cells of size H/7 × W/7)<br> | |
| 2. Max-pool each cell → single value<br> | |
| 3. Output: Fixed 7×7 feature map regardless of input size | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: RoI Align vs RoI Pool</div> | |
| <strong>Problem:</strong> RoI Pooling quantizes coordinates, causing misalignment.<br> | |
| <strong>Solution:</strong> RoI Align uses bilinear interpolation instead of rounding.<br> | |
| This is critical for Mask R-CNN where pixel-level accuracy matters! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Medical Imaging</div> | |
| <div class="box-content">High-accuracy tumor detection where speed is less critical than precision</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ท Photo Analysis</div> | |
| <div class="box-content">Face detection, scene understanding, object counting in static images</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ Scientific Research</div> | |
| <div class="box-content">Cell detection, particle tracking, microscopy image analysis</div> | |
| </div> | |
| ` | |
| }, | |
| "ssd": { | |
| overview: ` | |
| <h3>SSD (Single Shot MultiBox Detector)</h3> | |
| <p>Balances speed and accuracy by predicting boxes at multiple scales.</p> | |
| <h3>Key Ideas</h3> | |
| <ul> | |
| <li><strong>Multi-Scale:</strong> Predictions from different layers (early = small objects, deep = large)</li> | |
| <li><strong>Default Boxes (Anchors):</strong> Pre-defined boxes of various aspect ratios</li> | |
| <li><strong>Single Pass:</strong> No separate region proposal step</li> | |
| </ul> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Performance</div> | |
| SSD300: 59 FPS, 74.3% mAP<br> | |
| SSD512: 22 FPS, 76.8% mAP<br> | |
| <br> | |
| Sweet spot between YOLO (faster) and Faster R-CNN (more accurate) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Multi-Scale Feature Maps</h3> | |
| <p>SSD makes predictions at multiple layers, each detecting objects at different scales.</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Early Layers (38×38):</strong> Detect small objects (high resolution)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Middle Layers (19×19, 10×10):</strong> Detect medium objects</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Deep Layers (5×5, 3×3, 1×1):</strong> Detect large objects</div> | |
| </div> | |
| <h3>Default Boxes (Anchors)</h3> | |
| <p>At each feature map cell, SSD predicts offsets for k default boxes with different aspect ratios (1:1, 2:1, 1:2, 3:1, 1:3).</p> | |
| `, | |
| math: ` | |
| <h3>SSD Loss Function</h3> | |
| <p>Weighted sum of localization and confidence losses.</p> | |
| <div class="formula"> | |
| L(x, c, l, g) = (1/N) × [L_conf(x, c) + α × L_loc(x, l, g)]<br> | |
| <br> | |
| Where:<br> | |
| • L_conf = Softmax loss over class confidences<br> | |
| • L_loc = Smooth L1 loss over box coordinates<br> | |
| • α = Weight factor (typically 1)<br> | |
| • N = Number of matched default boxes | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Hard Negative Mining</div> | |
| Problem: Most default boxes are background (class imbalance).<br> | |
| Solution: Sort negative boxes by confidence loss, pick top ones so pos:neg = 1:3.<br> | |
| This focuses training on hard negatives, not easy ones. | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐น Video Analytics</div> | |
| <div class="box-content">Real-time object detection in security cameras, sports broadcasting</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ค Robotics</div> | |
| <div class="box-content">Object detection for manipulation tasks, obstacle avoidance</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฑ Mobile Apps</div> | |
| <div class="box-content">Lightweight models for on-device detection (MobileNet-SSD)</div> | |
| </div> | |
| ` | |
| }, | |
| "semantic-seg": { | |
| overview: ` | |
| <h3>Semantic Segmentation</h3> | |
| <p>Classify every pixel in the image (pixel-wise classification).</p> | |
| <h3>Popular Architectures</h3> | |
| <table> | |
| <tr> | |
| <th>Model</th> | |
| <th>Key Feature</th> | |
| </tr> | |
| <tr> | |
| <td>FCN</td> | |
| <td>Fully Convolutional (no FC layers)</td> | |
| </tr> | |
| <tr> | |
| <td>U-Net</td> | |
| <td>Skip connections from encoder to decoder</td> | |
| </tr> | |
| <tr> | |
| <td>DeepLab</td> | |
| <td>Atrous (dilated) convolutions + ASPP</td> | |
| </tr> | |
| </table> | |
| <div class="formula"> | |
| U-Net Pattern:<br> | |
| Input → Encoder (downsample) → Bottleneck → Decoder (upsample) → Pixel-wise Output<br> | |
| With skip connections from encoder to decoder at each level | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Key Concepts</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Encoder-Decoder:</strong> Downsample to capture context, upsample to recover spatial detail</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Skip Connections:</strong> Pass high-resolution features from encoder to decoder (U-Net)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Atrous Convolution:</strong> Expand receptive field without losing resolution (DeepLab)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>ASPP:</strong> Atrous Spatial Pyramid Pooling - capture multi-scale context</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Dice Loss for Segmentation</h3> | |
| <p>Better than cross-entropy for imbalanced classes (small objects).</p> | |
| <div class="formula"> | |
| Dice = 2 × |A ∩ B| / (|A| + |B|)<br> | |
| Dice Loss = 1 - Dice<br> | |
| <br> | |
| Where A = predicted mask, B = ground truth mask | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📚 Paper & Pain: Why Dice > Cross-Entropy?</div> | |
| If only 1% of pixels are foreground:<br> | |
| • Cross-Entropy: Model can get 99% accuracy by predicting all background!<br> | |
| • Dice: Penalizes missed foreground pixels heavily<br> | |
| • Often use combination: L = BCE + Dice | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฅ Medical Imaging</div> | |
| <div class="box-content">Tumor segmentation, organ delineation, cell analysis</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Autonomous Driving</div> | |
| <div class="box-content">Road segmentation, free space detection, drivable area</div> | |
| </div> | |
| ` | |
| }, | |
| "instance-seg": { | |
| overview: ` | |
| <h3>Instance Segmentation</h3> | |
| <p>Detect AND segment each individual object (combines object detection + semantic segmentation).</p> | |
| <h3>Difference from Semantic Segmentation</h3> | |
| <ul> | |
| <li><strong>Semantic:</strong> All "person" pixels get same label</li> | |
| <li><strong>Instance:</strong> Person #1, Person #2, Person #3 (separate instances)</li> | |
| </ul> | |
| <h3>Main Approach: Mask R-CNN</h3> | |
| <div class="formula"> | |
| Faster R-CNN + Segmentation Branch<br> | |
| <br> | |
| For each RoI:<br> | |
| 1. Bounding box regression<br> | |
| 2. Class prediction<br> | |
| 3. <strong>Binary mask for the object</strong> | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Mask R-CNN Architecture</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Backbone:</strong> ResNet-50/101 with Feature Pyramid Network (FPN)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>RPN:</strong> Region Proposal Network (same as Faster R-CNN)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>RoI Align:</strong> Better than RoI Pooling (no quantization)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Mask Head:</strong> Small FCN that outputs 28×28 binary mask per class</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Multi-Task Loss</h3> | |
| <p>Mask R-CNN optimizes three losses simultaneously:</p> | |
| <div class="formula"> | |
| L = L_cls + L_box + L_mask<br> | |
| <br> | |
| Where:<br> | |
| • L_cls = Classification loss (cross-entropy)<br> | |
| • L_box = Bounding box regression (smooth L1)<br> | |
| • L_mask = Binary cross-entropy per-pixel mask loss | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Key Insight: Decoupled Masks</div> | |
| Mask R-CNN predicts a binary mask for EACH class independently.<br> | |
| This avoids competition between classes and improves accuracy. | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ธ Photo Editing</div> | |
| <div class="box-content">Auto-select objects for editing, background removal, composition</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ค Robotics</div> | |
| <div class="box-content">Object manipulation - need exact shape, not just bounding box</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ Video Production</div> | |
| <div class="box-content">Rotoscoping, VFX, green screen replacement</div> | |
| </div> | |
| ` | |
| }, | |
| "face-recog": { | |
| overview: ` | |
| <h3>Face Recognition with Siamese Networks</h3> | |
| <p>Learn similarity between faces using metric learning instead of classification.</p> | |
| <h3>Triplet Loss Training</h3> | |
| <div class="formula"> | |
| Loss = max(||f(A) - f(P)||² - ||f(A) - f(N)||² + margin, 0)<br> | |
| <br> | |
| Where:<br> | |
| A = Anchor (reference face)<br> | |
| P = Positive (same person)<br> | |
| N = Negative (different person)<br> | |
| margin = minimum separation (e.g., 0.2) | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">💡 One-Shot Learning</div> | |
| After training, recognize new people with just 1-2 photos!<br> | |
| No retraining needed - just compare embeddings. | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Face Recognition Pipeline</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Face Detection:</strong> Find faces in image (MTCNN, RetinaFace)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Alignment:</strong> Normalize face orientation and scale</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Embedding:</strong> Extract 128/512-dim feature vector (FaceNet, ArcFace)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Matching:</strong> Compare embeddings with cosine similarity or L2 distance</div> | |
| </div> | |
| <h3>Key Models</h3> | |
| <table> | |
| <tr><th>Model</th><th>Key Innovation</th></tr> | |
| <tr><td>FaceNet</td><td>Triplet loss, 128-dim embedding</td></tr> | |
| <tr><td>ArcFace</td><td>Additive angular margin loss, SOTA accuracy</td></tr> | |
| <tr><td>DeepFace</td><td>Facebook's early success</td></tr> | |
| </table> | |
| `, | |
| math: ` | |
| <h3>Triplet Loss Intuition</h3> | |
| <p>Push same-person faces closer, different-person faces apart.</p> | |
| <div class="formula"> | |
| ||f(A) - f(P)||² + margin < ||f(A) - f(N)||² | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Hard Triplet Mining</div> | |
| Easy triplets: Random selection - margin already satisfied, loss=0<br> | |
| Hard triplets: Find P closest to anchor, N closest to anchor from different class<br> | |
| <strong>Training on hard triplets is critical for convergence!</strong> | |
| </div> | |
| <h3>ArcFace Angular Margin</h3> | |
| <div class="formula"> | |
| L = -log(e^(s·cos(θ + m)) / (e^(s·cos(θ + m)) + Σ e^(s·cos(θ_j))))<br> | |
| Where m = angular margin, s = scale factor | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฑ Phone Unlock</div> | |
| <div class="box-content">Face ID, biometric authentication</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Security</div> | |
| <div class="box-content">Access control, surveillance, identity verification</div> | |
| </div> | |
| ` | |
| }, | |
| "autoencoders": { | |
| overview: ` | |
| <h3>Autoencoders</h3> | |
| <p>Unsupervised learning to compress data into latent representation and reconstruct it.</p> | |
| <h3>Architecture</h3> | |
| <div class="formula"> | |
| Input → Encoder → Latent Code (bottleneck) → Decoder → Reconstruction<br> | |
| <br> | |
| Loss = ||Input - Reconstruction||² (MSE) | |
| </div> | |
| <h3>Variants</h3> | |
| <ul> | |
| <li><strong>Vanilla:</strong> Basic autoencoder</li> | |
| <li><strong>Denoising:</strong> Input corrupted, output clean (learns robust features)</li> | |
| <li><strong>Variational (VAE):</strong> Probabilistic latent space (for generation)</li> | |
| <li><strong>Sparse:</strong> Encourage sparse activations</li> | |
| </ul> | |
| `, | |
| concepts: ` | |
| <h3>Key Concepts</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Bottleneck:</strong> Force information compression by using fewer dimensions than input</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Reconstruction:</strong> Learn to recreate input - captures essential features</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Latent Space:</strong> Compressed representation captures data structure</div> | |
| </div> | |
| <h3>Variational Autoencoder (VAE)</h3> | |
| <p>Instead of encoding to a point, encode to a probability distribution (mean + variance).</p> | |
| <div class="formula"> | |
| Encoder outputs: μ (mean) and σ (standard deviation)<br> | |
| Sample: z = μ + σ × ε (where ε ~ N(0,1))<br> | |
| This is the "reparameterization trick" for backprop! | |
| </div> | |
| `, | |
| math: ` | |
| <h3>VAE Loss Function (ELBO)</h3> | |
| <p>VAE maximizes the Evidence Lower Bound:</p> | |
| <div class="formula"> | |
| L = E[log p(x|z)] - KL(q(z|x) || p(z))<br> | |
| <br> | |
| Where:<br> | |
| • First term: Reconstruction quality<br> | |
| • Second term: KL divergence regularization (push q toward N(0,1)) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📚 Paper & Pain: KL Divergence</div> | |
| For Gaussians:<br> | |
| KL = -0.5 × Σ(1 + log(σ²) - μ² - σ²)<br> | |
| This has a closed-form solution - no sampling needed! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐๏ธ Compression</div> | |
| <div class="box-content">Dimensionality reduction, data compression, feature extraction</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Anomaly Detection</div> | |
| <div class="box-content">High reconstruction error = anomaly (fraud detection, defect detection)</div> | |
| </div> | |
| ` | |
| }, | |
| "gans": { | |
| overview: ` | |
| <h3>GANs (Generative Adversarial Networks)</h3> | |
| <p>Two networks compete: Generator creates fake data, Discriminator tries to detect fakes.</p> | |
| <h3>The GAN Game</h3> | |
| <div class="formula"> | |
| Generator: Creates fake images from random noise<br> | |
| Goal: Fool discriminator<br> | |
| <br> | |
| Discriminator: Classifies real vs fake<br> | |
| Goal: Correctly identify fakes<br> | |
| <br> | |
| Minimax Loss:<br> | |
| min_G max_D E[log D(x)] + E[log(1 - D(G(z)))] | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Training Challenges</div> | |
| • Mode collapse (Generator produces limited variety)<br> | |
| • Training instability (careful tuning needed)<br> | |
| • Convergence issues<br> | |
| • Solutions: Wasserstein GAN, Spectral Normalization, StyleGAN improvements | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐จ Image Generation</div> | |
| <div class="box-content"> | |
| <strong>StyleGAN:</strong> Photorealistic faces, art generation<br> | |
| <strong>DCGAN:</strong> Bedroom images, object generation | |
| </div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Minimax Game Objective</h3> | |
| <p>The original GAN objective from Ian Goodfellow (2014) is a zero-sum game between Discriminator (D) and Generator (G).</p> | |
| <div class="formula" style="font-size: 1.1rem; padding: 20px;"> | |
| min_G max_D V(D, G) = E_x∼p_data[log D(x)] + E_z∼p_z[log(1 - D(G(z)))] | |
| </div> | |
| <h3>Paper & Pain: Finding the Optimal Discriminator</h3> | |
| <p>For a fixed Generator, the optimal Discriminator D* is:</p> | |
| <div class="formula"> | |
| D*(x) = p_data(x) / (p_data(x) + p_g(x)) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Theoretical Insight</div> | |
| When the Discriminator is optimal, the Generator's task is essentially to minimize the <strong>Jensen-Shannon Divergence (JSD)</strong> between the data distribution and the model distribution. <br> | |
| <strong>Problem:</strong> JSD is "flat" when distributions don't overlap, leading to vanishing gradients. This is why <strong>Wasserstein GAN (WGAN)</strong> was inventedโusing Earth Mover's distance instead! | |
| </div> | |
| <h3>Generator Gradient Problem</h3> | |
| <p>Early in training, D(G(z)) is near 0. The term log(1-D(G(z))) has a very small gradient. </p> | |
| <div class="list-item"> | |
| <div class="list-num">💡</div> | |
| <div><strong>Heuristic Fix:</strong> Instead of minimizing log(1-D(G(z))), we maximize <strong>log D(G(z))</strong>. This provides much stronger gradients early on!</div> | |
| </div> | |
| ` | |
| }, | |
| "diffusion": { | |
| overview: ` | |
| <h3>Diffusion Models</h3> | |
| <p>Learn to reverse a gradual noising process, generating high-quality images.</p> | |
| <h3>How Diffusion Works</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Forward Process:</strong> Gradually add Gaussian noise over T steps (x₀ → x₁ → ... → x_T = pure noise)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Reverse Process:</strong> Train neural network to denoise (x_T → x_{T-1} → ... → x₀ = clean image)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Generation:</strong> Start from random noise, iteratively denoise T steps</div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">✅ Advantages over GANs</div> | |
| • More stable training (no adversarial dynamics)<br> | |
| • Better sample quality and diversity<br> | |
| • Mode coverage (no mode collapse)<br> | |
| • Controllable generation (text-to-image) | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Key Components</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>U-Net Backbone:</strong> Encoder-decoder with skip connections predicts noise at each step</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Time Embedding:</strong> Tell the model which timestep it's at (sinusoidal encoding)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>CLIP Conditioning:</strong> Guide generation with text embeddings (Stable Diffusion)</div> | |
| </div> | |
| <h3>Latent Diffusion</h3> | |
| <p>Instead of diffusing in pixel space (expensive), work in VAE latent space (8× smaller).</p> | |
| <div class="formula"> | |
| Image (512×512×3) → VAE Encoder → Latent (64×64×4) → Diffuse → Decode | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Forward Process (Noising)</h3> | |
| <p>Add Gaussian noise according to a schedule ฮฒ_t:</p> | |
| <div class="formula"> | |
| q(x_t | x_{t-1}) = N(x_t; √(1-β_t) × x_{t-1}, β_t × I)<br> | |
| <br> | |
| Or in closed form for any t:<br> | |
| x_t = √(ᾱ_t) × x_0 + √(1-ᾱ_t) × ε<br> | |
| Where ᾱ_t = Π_{s=1}^t (1-β_s) | |
| </div> | |
| <h3>Training Objective</h3> | |
| <p>Simple noise prediction loss:</p> | |
| <div class="formula"> | |
| L = E[||ε - ε_θ(x_t, t)||²] | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Paper & Pain: Simplified Loss</div> | |
| The full variational bound is complex, but Ho et al. (2020) showed this simple MSE loss on noise prediction works just as well and is much easier to implement! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ผ๏ธ Text-to-Image</div> | |
| <div class="box-content"> | |
| <strong>Stable Diffusion:</strong> Open-source, runs on consumer GPUs<br> | |
| <strong>DALL-E 2:</strong> OpenAI's photorealistic generator<br> | |
| <strong>Midjourney:</strong> Artistic image generation | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "rnn": { | |
| overview: ` | |
| <h3>The Problem of Long-Term Dependencies</h3> | |
| <p>Imagine trying to predict the last word in the text: "I grew up in France... [200 words] ... I speak fluent <strong>French</strong>."</p> | |
| <p>Standard RNNs fail here. As the gap grows, the gradient from "French" has to travel back 200 steps to "France".</p> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ The Vanishing Gradient Failure</div> | |
| If the weight W < 1, the gradient shrinks exponentially: 0.9²⁰⁰ ≈ 0.0000000007.<br> | |
| The network literally "forgets" the context. | |
| </div> | |
| <h3>The LSTM Solution: "The Conveyor Belt"</h3> | |
| <p>The core idea of LSTM is the <strong>Cell State (C)</strong>. It runs straight down the entire chain, with only minor linear interactions. It's like a conveyor beltโinformation can flow along it unchanged.</p> | |
| `, | |
| concepts: ` | |
| <h3>Step-by-Step LSTM Walkthrough</h3> | |
| <p>LSTMs control the cell state via "Gates".</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Forget Gate layer:</strong> "What do we throw away?"<br> | |
| <span class="formula-caption">σ(W_f · [h_{t-1}, x_t] + b_f)</span><br> | |
| Output 0 = complete forget, 1 = keep everything.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Input Gate layer:</strong> "What new info do we add?"<br> | |
| 1. Sigmoid layer decides <strong>which values</strong> to update.<br> | |
| 2. Tanh layer creates <strong>new candidate values</strong> (~C_t).</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Update Cell State:</strong> The Critical Step<br> | |
| <span class="formula" style="font-size:1.1rem">C_t = f_t * C_{t-1} + i_t * ~C_t</span><br> | |
| We multiply old state by f_t (forgetting things) and add i_t * ~C_t (adding new things).</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Output Gate:</strong> "What do we reveal?"<br> | |
| We filter the cell state: h_t = o_t * tanh(C_t).</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">💡 Intuition: Sigmoid vs Tanh</div> | |
| • <strong>Sigmoid (0 to 1):</strong> Acts like a valve or gate. Open/Close.<br> | |
| • <strong>Tanh (-1 to 1):</strong> Creates content/information. Normalized centered data. | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Paper & Pain: Why LSTMs Don't Vanish</h3> | |
| <p>Let's look at the gradient flow in the Cell State equation:</p> | |
| <div class="formula"> | |
| C_t = f_t \cdot C_{t-1} + i_t \cdot \tilde{C}_t | |
| </div> | |
| <p>During backpropagation (BPTT), the derivative of C_t with respect to C_{t-1} is:</p> | |
| <div class="formula" style="color: #00ff88;"> | |
| \frac{\partial C_t}{\partial C_{t-1}} = f_t + \dots | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">✅ The Additive Gradient Highway</div> | |
| In standard RNNs, we had multiplicative gradients (W^t). <br> | |
| In LSTMs, the gradient is <strong>additive</strong>. If the Forget Gate f_t ≈ 1, the gradient passes through UNCHANGED (1.0).<br> | |
| The error signal can travel back 1000 steps without vanishing! | |
| </div> | |
| <h3>Paper & Pain: Manual "Echo" Task</h3> | |
| <p>Task: Input stream of 0s. If explicit "1" appears, remember it and output "1" 3 steps later.</p> | |
| <div class="list-item"> | |
| <div class="list-num">๐</div> | |
| <div><strong>Strategy:</strong> Set Input Gate open (1) only on the trigger. Keep the Forget Gate at 1 (retain everything) so the memory is maintained.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">๐งฎ</div> | |
| <div><strong>Weights:</strong> We can manually solve for weights that force i_t high only when x_t=1.</div> | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฃ๏ธ Sequence-to-Sequence</div> | |
| <div class="box-content"> | |
| <strong>Translation:</strong> Google Translate (pre-Transformer) used massive LSTM stacks (GNMT). | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">โ๏ธ Handwriting Generation</div> | |
| <div class="box-content"> | |
| <strong>Alex Graves (2013):</strong> LSTMs can generate realistic cursive handwriting by predicting pen coordinates. | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ต Music Composition</div> | |
| <div class="box-content"> | |
| Generating melody and chords where context (key signature, tempo) must be maintained for minutes. | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "bert": { | |
| overview: ` | |
| <h3>BERT: Bidirectional Encoder Representations from Transformers</h3> | |
| <p><strong>Paper:</strong> "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" (Devlin et al., 2018)</p> | |
| <p><strong>arXiv:</strong> <a href="https://arxiv.org/abs/1810.04805" target="_blank">1810.04805</a></p> | |
| <div class="callout insight"> | |
| <div class="callout-title">🎯 Key Innovation</div> | |
| BERT revolutionized NLP by introducing <strong>bidirectional pre-training</strong>. Unlike previous models (GPT, ELMo) that processed text left-to-right or combined shallow bidirectional representations, BERT deeply integrates left AND right context in all layers simultaneously. | |
| </div> | |
| <h3>Why BERT Matters</h3> | |
| <ul> | |
| <li><strong>Transfer Learning for NLP:</strong> Pre-train once on massive unlabeled data, fine-tune on many tasks</li> | |
| <li><strong>State-of-the-Art Results:</strong> Set new records on 11 NLP tasks including SQuAD, GLUE, and SWAG</li> | |
| <li><strong>Efficiency:</strong> Fine-tuning requires minimal task-specific architecture changes</li> | |
| <li><strong>Accessibility:</strong> Google released pre-trained models publicly</li> | |
| </ul> | |
| <h3>Pre-training Corpus</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Training Data</div> | |
| <div class="box-content"> | |
| <strong>BooksCorpus:</strong> 800M words from 11,038 unpublished books<br> | |
| <strong>English Wikipedia:</strong> 2,500M words (text passages only, no lists/tables/headers)<br> | |
| <strong>Total:</strong> ~3.3 billion words | |
| </div> | |
| </div> | |
| <h3>Pre-training Tasks</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Masked Language Modeling (MLM):</strong> Randomly mask 15% of tokens and predict them using bidirectional context. Example: "The cat [MASK] on the mat" → predict "sat"</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Next Sentence Prediction (NSP):</strong> Given sentence pairs (A, B), predict if B actually follows A in the corpus. Helps with tasks like QA and NLI that require understanding sentence relationships.</div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก The BERT Fine-tuning Paradigm</div> | |
| 1. <strong>Pre-train</strong> BERT on BooksCorpus + Wikipedia (days/weeks on TPUs)<br> | |
| 2. <strong>Download</strong> pre-trained weights from Google<br> | |
| 3. <strong>Add</strong> task-specific head (1 layer for classification/QA/NER)<br> | |
| 4. <strong>Fine-tune</strong> entire model on your dataset (hours on single GPU)<br> | |
| 5. <strong>Achieve SOTA</strong> with as few as 3,600 labeled examples! | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>BERT Architecture</h3> | |
| <p>BERT uses a multi-layer bidirectional Transformer encoder based on Vaswani et al. (2017).</p> | |
| <h3>Model Variants</h3> | |
| <table> | |
| <tr> | |
| <th>Model</th> | |
| <th>Layers (L)</th> | |
| <th>Hidden Size (H)</th> | |
| <th>Attention Heads (A)</th> | |
| <th>Parameters</th> | |
| </tr> | |
| <tr> | |
| <td>BERT<sub>BASE</sub></td> | |
| <td>12</td> | |
| <td>768</td> | |
| <td>12</td> | |
| <td>110M</td> | |
| </tr> | |
| <tr> | |
| <td>BERT<sub>LARGE</sub></td> | |
| <td>24</td> | |
| <td>1024</td> | |
| <td>16</td> | |
| <td>340M</td> | |
| </tr> | |
| </table> | |
| <p><em>Note: BERT<sub>BASE</sub> was designed to match GPT's size for comparison.</em></p> | |
| <h3>Input Representation</h3> | |
| <p>BERT's input embedding is the sum of three components:</p> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Token Embeddings:</strong> WordPiece tokenization with 30,000 token vocabulary. Handles unknown words by splitting into subwords (e.g., "playing" → "play" + "##ing")</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Segment Embeddings:</strong> Learned embedding to distinguish sentence A from sentence B (E<sub>A</sub> or E<sub>B</sub>)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Position Embeddings:</strong> Learned positional encodings (unlike Transformers' sinusoidal), supports sequences up to 512 tokens</div> | |
| </div> | |
| <div class="formula"> | |
| Input = Token_Embedding + Segment_Embedding + Position_Embedding | |
| </div> | |
| <h3>Special Tokens</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ท๏ธ Special Token Usage</div> | |
| <div class="box-content"> | |
| <strong>[CLS]:</strong> Prepended to every input. Final hidden state used for classification tasks<br> | |
| <strong>[SEP]:</strong> Separates sentence pairs and marks sequence end<br> | |
| <strong>[MASK]:</strong> Replaces masked tokens during pre-training (not used during fine-tuning) | |
| </div> | |
| </div> | |
| <h4>Example Input Format</h4> | |
| <div class="formula"> | |
| [CLS] My dog is cute [SEP] He likes playing [SEP]<br> | |
| <br> | |
| Tokens: [CLS] My dog is cute [SEP] He likes play ##ing [SEP]<br> | |
| Segments: E_A E_A E_A E_A E_A E_A E_B E_B E_B E_B E_B<br> | |
| Positions: 0 1 2 3 4 5 6 7 8 9 10 | |
| </div> | |
| <h3>Fine-tuning for Different Tasks</h3> | |
| <table> | |
| <tr><th>Task Type</th><th>Input Format</th><th>Output</th></tr> | |
| <tr> | |
| <td>Classification</td> | |
| <td>[CLS] text [SEP]</td> | |
| <td>[CLS] representation → classifier</td> | |
| </tr> | |
| <tr> | |
| <td>Sentence Pair</td> | |
| <td>[CLS] sent A [SEP] sent B [SEP]</td> | |
| <td>[CLS] representation → classifier</td> | |
| </tr> | |
| <tr> | |
| <td>Question Answering</td> | |
| <td>[CLS] question [SEP] passage [SEP]</td> | |
| <td>Start/End span vectors over passage tokens</td> | |
| </tr> | |
| <tr> | |
| <td>Token Classification</td> | |
| <td>[CLS] text [SEP]</td> | |
| <td>Each token representation → label</td> | |
| </tr> | |
| </table> | |
| `, | |
| math: ` | |
| <h3>Pre-training Objective</h3> | |
| <p>BERT simultaneously optimizes two unsupervised tasks:</p> | |
| <div class="formula"> | |
| L = L<sub>MLM</sub> + L<sub>NSP</sub> | |
| </div> | |
| <h3>Masked Language Modeling (MLM)</h3> | |
| <div class="callout insight"> | |
| <div class="callout-title">📄 Paper & Pain: The Masking Strategy</div> | |
| <strong>Problem:</strong> Standard left-to-right language modeling can't capture bidirectional context.<br> | |
| <strong>Solution:</strong> Randomly mask 15% of tokens and predict them using full context.<br><br> | |
| <strong>However:</strong> [MASK] token doesn't appear during fine-tuning!<br> | |
| <strong>Clever Fix:</strong> Of the 15% selected tokens:<br> | |
| • 80% → Replace with [MASK]<br> | |
| • 10% → Replace with random token<br> | |
| • 10% → Keep unchanged<br><br> | |
| This forces the model to maintain context representations for ALL tokens! | |
| </div> | |
| <h4>MLM Loss Derivation</h4> | |
| <p>Let's work through the MLM objective step by step:</p> | |
| <div class="formula"> | |
| Given input sequence: x = [x₁, x₂, ..., x_n]<br> | |
| Masked sequence: x̃ = [x̃₁, x̃₂, ..., x̃_n]<br> | |
| <br> | |
| Let M = {i₁, i₂, ..., i_m} be indices of masked tokens<br> | |
| <br> | |
| For each masked position i ∈ M:<br> | |
| h_i = BERT(x̃)_i (hidden state at position i)<br> | |
| logits_i = W · h_i + b (W ∈ ℝ^(V×H), vocab size V)<br> | |
| P(x_i | x̃) = softmax(logits_i)<br> | |
| <br> | |
| Cross-entropy loss per token:<br> | |
| L_i = -log P(x_i | x̃)<br> | |
| <br> | |
| Total MLM loss:<br> | |
| L_MLM = (1/|M|) Σ_{i∈M} L_i<br> | |
| L_MLM = -(1/|M|) Σ_{i∈M} log P(x_i | x̃) | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">๐ Worked Example: MLM Calculation</div> | |
| <strong>Input:</strong> "The cat sat on the mat"<br> | |
| <strong>After masking (15%):</strong> "The [MASK] sat on the mat"<br> | |
| <strong>Target:</strong> Predict "cat" at position 2<br><br> | |
| <strong>Step 1:</strong> Forward pass through BERT<br> | |
| h₂ = BERT(x̃)₂ ∈ ℝ^768 (for BERT_BASE)<br><br> | |
| <strong>Step 2:</strong> Project to vocabulary space<br> | |
| logits₂ = W · h₂ + b ∈ ℝ^30000<br><br> | |
| <strong>Step 3:</strong> Compute probabilities<br> | |
| P(w | x̃) = exp(logits₂[w]) / Σ_v exp(logits₂[v])<br><br> | |
| <strong>Step 4:</strong> Compute loss (assume P("cat"|x̃) = 0.73)<br> | |
| L = -log(0.73) = 0.315 | |
| </div> | |
| <h3>Next Sentence Prediction (NSP)</h3> | |
| <p>Binary classification task to understand sentence relationships.</p> | |
| <div class="formula"> | |
| Input: [CLS] sentence_A [SEP] sentence_B [SEP]<br> | |
| <br> | |
| Let C = final hidden state of [CLS] token ∈ ℝ^H<br> | |
| <br> | |
| P(IsNext = True) = σ(W_NSP · C)<br> | |
| where σ = sigmoid function, W_NSP ∈ ℝ^(1×H)<br> | |
| <br> | |
| Binary cross-entropy loss:<br> | |
| L_NSP = -[y·log(ŷ) + (1-y)·log(1-ŷ)]<br> | |
| where y = 1 if B follows A, else 0 | |
| </div> | |
| <h4>NSP Training Data Generation</h4> | |
| <ul> | |
| <li><strong>50% IsNext:</strong> B actually follows A in corpus</li> | |
| <li><strong>50% NotNext:</strong> B sampled randomly from another document</li> | |
| </ul> | |
| <h3>Fine-tuning Math: Question Answering (SQuAD)</h3> | |
| <div class="formula"> | |
| Input: [CLS] question [SEP] paragraph [SEP]<br> | |
| <br> | |
| Let T_i = final hidden state for token i in paragraph<br> | |
| <br> | |
| Start position logits: S_i = W_start · T_i<br> | |
| End position logits: E_i = W_end · T_i<br> | |
| <br> | |
| P(start = i) = softmax(S)_i<br> | |
| P(end = j) = softmax(E)_j<br> | |
| <br> | |
| Answer span = tokens from position i to j<br> | |
| <br> | |
| Training loss:<br> | |
| L = -log P(start = i*) - log P(end = j*)<br> | |
| where i*, j* are ground truth positions | |
| </div> | |
| `, | |
| applications: ` | |
| <h3>SQuAD Benchmark Performance</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Stanford Question Answering Dataset (SQuAD)</div> | |
| <div class="box-content"> | |
| <strong>SQuAD 1.1:</strong> 100,000+ question-answer pairs on 500+ Wikipedia articles. Every question has an answer span in the passage.<br><br> | |
| <strong>SQuAD 2.0:</strong> Adds 50,000+ unanswerable questions. Models must determine when no answer exists.<br><br> | |
| <strong>Evaluation Metrics:</strong><br> | |
| โข <strong>EM (Exact Match):</strong> % of predictions matching ground truth exactly<br> | |
| โข <strong>F1:</strong> Token-level overlap between prediction and ground truth | |
| </div> | |
| </div> | |
| <h3>SQuAD 1.1 Results</h3> | |
| <table> | |
| <tr><th>Model</th><th>EM</th><th>F1</th></tr> | |
| <tr><td>Human Performance</td><td>82.3</td><td>91.2</td></tr> | |
| <tr><td>BERT<sub>BASE</sub></td><td>80.8</td><td>88.5</td></tr> | |
| <tr><td>BERT<sub>LARGE</sub></td><td><strong>84.1</strong></td><td><strong>90.9</strong></td></tr> | |
| </table> | |
| <p><em>BERT<sub>LARGE</sub> surpassed human performance on EM!</em></p> | |
| <h3>SQuAD 2.0 Results</h3> | |
| <table> | |
| <tr><th>Model</th><th>EM</th><th>F1</th></tr> | |
| <tr><td>Human Performance</td><td>86.9</td><td>89.5</td></tr> | |
| <tr><td>BERT<sub>BASE</sub></td><td>73.7</td><td>76.3</td></tr> | |
| <tr><td>BERT<sub>LARGE</sub></td><td><strong>78.7</strong></td><td><strong>81.9</strong></td></tr> | |
| </table> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก Example SQuAD Question</div> | |
| <strong>Passage:</strong> "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France."<br><br> | |
| <strong>Question:</strong> "In what country is Normandy located?"<br><br> | |
| <strong>BERT Answer:</strong> "France" ✓<br> | |
| <strong>Start Token:</strong> position 32<br> | |
| <strong>End Token:</strong> position 32 | |
| </div> | |
| <h3>GLUE Benchmark (General Language Understanding Evaluation)</h3> | |
| <p>BERT set new state-of-the-art on all 9 GLUE tasks:</p> | |
| <table> | |
| <tr><th>Task</th><th>Metric</th><th>Previous SOTA</th><th>BERT<sub>LARGE</sub></th></tr> | |
| <tr><td>MNLI (NLI)</td><td>Acc</td><td>86.6</td><td><strong>86.7</strong></td></tr> | |
| <tr><td>QQP (Paraphrase)</td><td>F1</td><td>66.1</td><td><strong>72.1</strong></td></tr> | |
| <tr><td>QNLI (QA/NLI)</td><td>Acc</td><td>87.4</td><td><strong>92.7</strong></td></tr> | |
| <tr><td>SST-2 (Sentiment)</td><td>Acc</td><td>93.2</td><td><strong>94.9</strong></td></tr> | |
| <tr><td>CoLA (Acceptability)</td><td>Matthew's</td><td>35.0</td><td><strong>60.5</strong></td></tr> | |
| </table> | |
| <h3>Additional Applications</h3> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Google Search</div> | |
| <div class="box-content"> | |
| In October 2019, Google began using BERT for 1 in 10 English search queries, calling it the biggest leap in 5 years. BERT helps understand search intent and context. | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ท๏ธ Named Entity Recognition (NER)</div> | |
| <div class="box-content"> | |
| BERT excels at identifying entities (person, location, organization) in text by treating it as token classification. Each token gets a label (B-PER, I-PER, B-LOC, etc.). | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Text Classification</div> | |
| <div class="box-content"> | |
| Sentiment analysis, topic classification, spam detection - all benefit from BERT's contextual understanding. Simply use [CLS] representation with a classifier. | |
| </div> | |
| </div> | |
| <h3>Using BERT: Quick Code Example</h3> | |
| <div class="formula"> | |
| # Using Hugging Face Transformers<br> | |
| from transformers import BertTokenizer, BertForQuestionAnswering<br> | |
| import torch<br> | |
| <br> | |
| # Load pre-trained model and tokenizer<br> | |
| tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')<br> | |
| model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')<br> | |
| <br> | |
| # Example<br> | |
| question = "What is BERT?"<br> | |
| context = "BERT is a bidirectional Transformer for NLP."<br> | |
| <br> | |
| # Tokenize and get answer<br> | |
| inputs = tokenizer(question, context, return_tensors='pt')<br> | |
| outputs = model(**inputs)<br> | |
| <br> | |
| start_idx = torch.argmax(outputs.start_logits)<br> | |
| end_idx = torch.argmax(outputs.end_logits)<br> | |
| answer = tokenizer.convert_tokens_to_string(<br> | |
| tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx+1])<br> | |
| )<br> | |
| print(answer) # "a bidirectional Transformer for NLP" | |
| </div> | |
| ` | |
| }, | |
| "gpt": { | |
| overview: ` | |
| <h3>GPT (Generative Pre-trained Transformer)</h3> | |
| <p>Decoder-only Transformer trained to predict next token (autoregressive language modeling).</p> | |
| <h3>GPT Evolution</h3> | |
| <table> | |
| <tr> | |
| <th>Model</th> | |
| <th>Params</th> | |
| <th>Training Data</th> | |
| <th>Capability</th> | |
| </tr> | |
| <tr> | |
| <td>GPT-1</td> | |
| <td>117M</td> | |
| <td>BooksCorpus</td> | |
| <td>Basic text generation</td> | |
| </tr> | |
| <tr> | |
| <td>GPT-2</td> | |
| <td>1.5B</td> | |
| <td>WebText (40GB)</td> | |
| <td>Coherent paragraphs</td> | |
| </tr> | |
| <tr> | |
| <td>GPT-3</td> | |
| <td>175B</td> | |
| <td>570GB text</td> | |
| <td>Few-shot learning</td> | |
| </tr> | |
| <tr> | |
| <td>GPT-4</td> | |
| <td>~1.8T</td> | |
| <td>Multi-modal</td> | |
| <td>Reasoning, coding, images</td> | |
| </tr> | |
| </table> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ Emergent Abilities</div> | |
| As models scale, new capabilities emerge:<br> | |
| โข In-context learning (learn from prompts)<br> | |
| โข Chain-of-thought reasoning<br> | |
| โข Code generation<br> | |
| โข Multi-step problem solving | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>GPT Architecture</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Decoder Only:</strong> Uses causal (masked) attention - can only see past tokens</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Autoregressive:</strong> Generate one token at a time, feed back as input</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Pre-training:</strong> Next token prediction on massive text corpus</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>RLHF:</strong> Reinforcement Learning from Human Feedback (ChatGPT)</div> | |
| </div> | |
| <h3>In-Context Learning</h3> | |
| <p>GPT-3+ can learn from examples in the prompt without updating weights!</p> | |
| <div class="formula"> | |
| Zero-shot: "Translate to French: Hello" → "Bonjour"<br> | |
| Few-shot: "cat→chat, dog→chien, house→?" → "maison" | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Causal Language Modeling</h3> | |
| <p>GPT is trained to maximize the likelihood of the next token:</p> | |
| <div class="formula"> | |
| L = -Σ log P(x_t | x_{<t})<br> | |
| <br> | |
| Where P(x_t | x_{<t}) = softmax(h_t × W_vocab) | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📄 Paper & Pain: Scaling Laws</div> | |
| Performance scales predictably with compute, data, and parameters:<br> | |
| L ∝ N^(-0.076) for model size N<br> | |
| This is why OpenAI trained GPT-3 (175B) and GPT-4 (1.8T)! | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ ChatGPT & Assistants</div> | |
| <div class="box-content"> | |
| Conversational AI, customer support, tutoring, brainstorming | |
| </div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ป Code Generation</div> | |
| <div class="box-content"> | |
| GitHub Copilot, code completion, bug fixing, documentation | |
| </div> | |
| </div> | |
| ` | |
| }, | |
| "vit": { | |
| overview: ` | |
| <h3>Vision Transformer (ViT)</h3> | |
| <p>Apply Transformer architecture directly to images by treating them as sequences of patches.</p> | |
| <h3>How ViT Works</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Patchify:</strong> Split 224×224 image into 16×16 patches (14×14 = 196 patches)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Linear Projection:</strong> Flatten each patch → linear embedding (like word embeddings)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Positional Encoding:</strong> Add position information</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">04</div> | |
| <div><strong>Transformer Encoder:</strong> Standard Transformer (self-attention, FFN)</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">05</div> | |
| <div><strong>Classification:</strong> Use [CLS] token for final prediction</div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">💡 When ViT Shines</div> | |
| • <strong>Large Datasets:</strong> Needs 10M+ images (or pre-training on ImageNet-21K)<br> | |
| • <strong>Transfer Learning:</strong> Pre-trained ViT beats CNNs on many tasks<br> | |
| • <strong>Long-Range Dependencies:</strong> Global attention vs CNN's local receptive field | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>ViT vs CNN Comparison</h3> | |
| <table> | |
| <tr><th>Aspect</th><th>CNN</th><th>ViT</th></tr> | |
| <tr><td>Inductive Bias</td><td>Locality, translation invariance</td><td>Minimal - learns from data</td></tr> | |
| <tr><td>Data Efficiency</td><td>Better with small datasets</td><td>Needs large datasets</td></tr> | |
| <tr><td>Receptive Field</td><td>Local (grows with depth)</td><td>Global from layer 1</td></tr> | |
| <tr><td>Scalability</td><td>Diminishing returns</td><td>Scales well with compute</td></tr> | |
| </table> | |
| <h3>Key Innovations</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>No Convolutions:</strong> Pure attention - "An Image is Worth 16x16 Words"</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Learnable Position:</strong> Position embeddings are learned, not sinusoidal</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Patch Embedding</h3> | |
| <p>Convert image patches to token embeddings:</p> | |
| <div class="formula"> | |
| z_0 = [x_cls; x_p^1 E; x_p^2 E; ...; x_p^N E] + E_pos<br> | |
| <br> | |
| Where:<br> | |
| • x_p^i = flattened patch (16×16×3 = 768 dimensions)<br> | |
| • E = learnable linear projection<br> | |
| • E_pos = position embedding | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">📄 Paper & Pain: Computation</div> | |
| ViT-Base: 12 layers, 768 hidden, 12 heads ~ 86M params<br> | |
| Self-attention cost: O(n²·d) where n=196 patches<br> | |
| This is why ViT is efficient for images (196 tokens) vs text (1000+ tokens) | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ผ๏ธ Image Classification</div> | |
| <div class="box-content">SOTA on ImageNet with pre-training. Google/DeepMind use for internal systems.</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Object Detection</div> | |
| <div class="box-content">DETR, DINO - Transformer-based detection replacing Faster R-CNN</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ฌ Video Understanding</div> | |
| <div class="box-content">VideoViT, TimeSformer - extend patches to 3D (space + time)</div> | |
| </div> | |
| ` | |
| }, | |
| "seq2seq": { | |
| overview: ` | |
| <h3>Seq2Seq with Attention</h3> | |
| <p>The architecture that revolutionized Machine Translation (before Transformers).</p> | |
| <h3>The Bottleneck Problem</h3> | |
| <p>Standard Encoder-Decoder models try to compress the entire sentence "I love deep learning" into a single vector (Context Vector).</p> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Information Bottleneck</div> | |
| For long sentences (e.g., 50 words), the fixed-size vector forgets early details. Performance degrades rapidly with length. | |
| </div> | |
| <h3>The Attention Solution</h3> | |
| <p><strong>Idea:</strong> Don't just look at the last state. Let the Decoder "look back" at ALL Encoder states at every step.</p> | |
| `, | |
| concepts: ` | |
| <h3>Visualizing Attention (Alammar Style)</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Encoder States:</strong> We keep all hidden states $h_1, h_2, h_3, h_4$.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Alignment Scores:</strong> Decoder asks: "How relevant is $h_i$ to what I'm translating now?"</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Context Vector:</strong> Weighted sum of relevant states. If attention is strong on $h_2$, the context vector looks a lot like $h_2$.</div> | |
| </div> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ก The "Searchlight" Analogy</div> | |
| Attention is like a searchlight. When generating "รฉtudiant" (student), the light shines brightly on "student" in the input sentence. | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Attention Math (Bahdanau / Luong)</h3> | |
| <p>Let Decoder state be $s_{t-1}$ and Encoder states be $h_j$.</p> | |
| <div class="formula"> | |
| 1. Score: $e_{tj} = score(s_{t-1}, h_j)$ (e.g., Dot Product)<br> | |
| 2. Weights: $\alpha_{tj} = softmax(e_{tj})$<br> | |
| 3. Context: $c_t = \sum \alpha_{tj} h_j$<br> | |
| 4. Output: $s_t = RNN(s_{t-1}, [y_{t-1}, c_t])$ | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">📄 Paper & Pain: Dot Product vs Additive</div> | |
| • <strong>Dot Product (Luong):</strong> $s^T h$ (Fast, matrix mult)<br> | |
| • <strong>Additive (Bahdanau):</strong> $v^T tanh(W_1 s + W_2 h)$ (More parameters, originally better for large dim) | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ฃ๏ธ Neural Machine Translation</div> | |
| <div class="box-content">Google Translate (2016) switched to GNMT (Attention-based), reducing errors by 60%.</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Text Summarization</div> | |
| <div class="box-content">Focusing on key sentences in a long document to generate a headline.</div> | |
| </div> | |
| ` | |
| }, | |
| "research-papers": { | |
| overview: ` | |
| <h3>Seminal Papers Library</h3> | |
| <p>A curated collection of the most impactful papers in Deep Learning history, with "Paper & Pain" insights.</p> | |
| <div class="callout insight"> | |
| <div class="callout-title">๐ How to Read Papers</div> | |
| Don't just read the abstract. Look for the <strong>Objective Function</strong> and the <strong>Architecture Diagram</strong>. That's where the truth lies. | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Computer Vision Hall of Fame</h3> | |
| <div class="list-item"> | |
| <div class="list-num">2012</div> | |
| <div><strong>AlexNet</strong> (Krizhevsky et al.)<br> | |
| <em>"ImageNet Classification with Deep Convolutional Neural Networks"</em><br> | |
| <span class="formula-caption">Insight: Relied on TWO GPUs because 3GB of GPU memory wasn't enough — the architecture was split across the two cards.</span><br> | |
| <a href="https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf" target="_blank" style="color: #ff6b35;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2014</div> | |
| <div><strong>VGGNet</strong> (Simonyan & Zisserman)<br> | |
| <em>"Very Deep Convolutional Networks for Large-Scale Image Recognition"</em><br> | |
| <span class="formula-caption">Insight: 3x3 filters are all you need. Two 3x3 layers have the same receptive field as one 5x5 but fewer parameters.</span><br> | |
| <a href="https://arxiv.org/pdf/1409.1556.pdf" target="_blank" style="color: #ff6b35;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2015</div> | |
| <div><strong>U-Net</strong> (Ronneberger et al.)<br> | |
| <em>"Convolutional Networks for Biomedical Image Segmentation"</em><br> | |
| <span class="formula-caption">Insight: Skip connections concatenating features from encoder to decoder allow precise localization.</span><br> | |
| <a href="https://arxiv.org/pdf/1505.04597.pdf" target="_blank" style="color: #ff6b35;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2015</div> | |
| <div><strong>ResNet</strong> (He et al.)<br> | |
| <em>"Deep Residual Learning for Image Recognition"</em><br> | |
| <span class="formula-caption">Insight: It's easier to learn 0 than Identity. $f(x) = H(x) - x$.</span><br> | |
| <a href="https://arxiv.org/pdf/1512.03385.pdf" target="_blank" style="color: #ff6b35;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2016</div> | |
| <div><strong>YOLO</strong> (Redmon et al.)<br> | |
| <em>"You Only Look Once: Unified, Real-Time Object Detection"</em><br> | |
| <span class="formula-caption">Insight: Treated detection as a <strong>regression</strong> problem, not classification. Single forward pass.</span><br> | |
| <a href="https://arxiv.org/pdf/1506.02640.pdf" target="_blank" style="color: #ff6b35;">๐ Read PDF</a></div> | |
| </div> | |
| <hr style="border-color: #333; margin: 20px 0;"> | |
| <h3>NLP & GenAI Hall of Fame</h3> | |
| <div class="list-item"> | |
| <div class="list-num">2014</div> | |
| <div><strong>GANs</strong> (Goodfellow et al.)<br> | |
| <em>"Generative Adversarial Networks"</em><br> | |
| <span class="formula-caption">Insight: Training a generator by fighting a discriminator. The minimax game: $\min_G \max_D V(D, G)$.</span><br> | |
| <a href="https://arxiv.org/pdf/1406.2661.pdf" target="_blank" style="color: #a371f7;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2017</div> | |
| <div><strong>Attention Is All You Need</strong> (Vaswani et al.)<br> | |
| <em>"Transformers"</em><br> | |
| <span class="formula-caption">Insight: Sinusoidal Positional Embeddings allow the model to generalize to lengths unseen during training.</span><br> | |
| <a href="https://arxiv.org/pdf/1706.03762.pdf" target="_blank" style="color: #00d4ff;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2018</div> | |
| <div><strong>BERT</strong> (Devlin et al.)<br> | |
| <em>"Pre-training of Deep Bidirectional Transformers"</em><br> | |
| <span class="formula-caption">Insight: Masked LM (Cloze task) is inefficient (only 15% signal) but crucial for bidirectionality.</span><br> | |
| <a href="https://arxiv.org/pdf/1810.04805.pdf" target="_blank" style="color: #00d4ff;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2020</div> | |
| <div><strong>GPT-3</strong> (Brown et al.)<br> | |
| <em>"Language Models are Few-Shot Learners"</em><br> | |
| <span class="formula-caption">Insight: Scale is all you need. 175B parameters enable emergent behavior like in-context learning.</span><br> | |
| <a href="https://arxiv.org/pdf/2005.14165.pdf" target="_blank" style="color: #00d4ff;">๐ Read PDF</a></div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2020</div> | |
| <div><strong>DDPM</strong> (Ho et al.)<br> | |
| <em>"Denoising Diffusion Probabilistic Models"</em><br> | |
| <span class="formula-caption">Insight: Predicting the noise $\epsilon$ is mathematically equivalent to predicting the score function (gradient of data density).</span><br> | |
| <a href="https://arxiv.org/pdf/2006.11239.pdf" target="_blank" style="color: #a371f7;">๐ Read PDF</a></div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>The Formulas That Changed AI</h3> | |
| <p><strong>ResNet Residual:</strong></p> | |
| <div class="formula">y = F(x, \{W_i\}) + x</div> | |
| <p><strong>Scaled Dot-Product Attention:</strong></p> | |
| <div class="formula">Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V</div> | |
| <p><strong>Diffusion Reverse Process:</strong></p> | |
| <div class="formula">p_\theta(x_{t-1}|x_t) = \mathcal{N}(x_{t-1}; \mu_\theta(x_t, t), \Sigma_\theta(x_t, t))</div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Impact</div> | |
| <div class="box-content">These papers form the foundation of ChatGPT, Midjourney, Self-Driving Cars, and Facial Recognition.</div> | |
| </div> | |
| ` | |
| }, | |
| "gnn": { | |
| overview: ` | |
| <h3>Graph Neural Networks (GNNs)</h3> | |
| <p>Deep learning on non-Euclidean data structures like social networks, molecules, and knowledge graphs.</p> | |
| <h3>Key Concepts</h3> | |
| <div class="list-item"> | |
| <div class="list-num">01</div> | |
| <div><strong>Graph Structure:</strong> Nodes (entities) and Edges (relationships).</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">02</div> | |
| <div><strong>Message Passing:</strong> Nodes exchange information with neighbors.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">03</div> | |
| <div><strong>Aggregation:</strong> Combine incoming messages (Sum, Mean, Max).</div> | |
| </div> | |
| <div class="callout tip"> | |
| <div class="callout-title">๐ก Why GNNs?</div> | |
| Standard CNNs expect a fixed grid (euclidean). Graphs have arbitrary size and topology. GNNs are permutation invariant! | |
| </div> | |
| `, | |
| concepts: ` | |
| <h3>Message Passing Neural Networks (MPNN)</h3> | |
| <p>The core framework for most GNNs.</p> | |
| <div class="list-item"> | |
| <div class="list-num">1</div> | |
| <div><strong>Message Function:</strong> Compute message from neighbor to node.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">2</div> | |
| <div><strong>Aggregation Function:</strong> Sum all messages from neighbors.</div> | |
| </div> | |
| <div class="list-item"> | |
| <div class="list-num">3</div> | |
| <div><strong>Update Function:</strong> Update node state based on aggregated messages.</div> | |
| </div> | |
| `, | |
| math: ` | |
| <h3>Graph Convolution Network (GCN)</h3> | |
| <p>The "Hello World" of GNNs (Kipf & Welling, 2017).</p> | |
| <div class="formula"> | |
| H^{(l+1)} = ฯ(D^{-1/2} A D^{-1/2} H^{(l)} W^{(l)}) | |
| </div> | |
| <p>Where:</p> | |
| <ul> | |
| <li><strong>A:</strong> Adjacency Matrix (connections)</li> | |
| <li><strong>D:</strong> Degree Matrix (number of connections)</li> | |
| <li><strong>H:</strong> Node Features</li> | |
| <li><strong>W:</strong> Learnable Weights</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">โ ๏ธ Over-smoothing</div> | |
| If GNN is too deep, all node representations become indistinguishable. Usually 2-4 layers are enough. | |
| </div> | |
| `, | |
| applications: ` | |
| <div class="info-box"> | |
| <div class="box-title">๐ Drug Discovery</div> | |
| <div class="box-content">Predicting molecular properties, protein folding (AlphaFold)</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Traffic Prediction</div> | |
| <div class="box-content">Road networks, estimating travel times (Google Maps)</div> | |
| </div> | |
| <div class="info-box"> | |
| <div class="box-title">๐ Recommender Systems</div> | |
| <div class="box-content">Pinterest (PinSage), User-Item graphs</div> | |
| </div> | |
| ` | |
| } | |
| }; | |
/**
 * Build the HTML for one curriculum module view: a back button, header,
 * tab bar, and six tab panes (overview / concepts / visualization / math /
 * applications / summary). Panes use the module's entry in MODULE_CONTENT
 * when present, otherwise generic boilerplate text.
 *
 * Element ids are all derived from module.id: "<id>-module" for the
 * container, "<id>-overview" ... "<id>-summary" for the panes, and
 * "<id>-canvas" for the visualization canvas.
 *
 * @param {{id: string, icon: string, title: string, description: string}} module
 *   One entry from the global `modules` catalogue.
 * @returns {string} HTML fragment for a single `.module` container.
 */
function createModuleHTML(module) {
// Missing ids simply fall through to the `||` fallback markup below.
const content = MODULE_CONTENT[module.id] || {};
return `
<div class="module" id="${module.id}-module">
<button class="btn-back" onclick="switchTo('dashboard')">โ Back to Dashboard</button>
<header>
<h1>${module.icon} ${module.title}</h1>
<p class="subtitle">${module.description}</p>
</header>
<div class="tabs">
<button class="tab-btn active" onclick="switchTab(event, '${module.id}-overview')">Overview</button>
<button class="tab-btn" onclick="switchTab(event, '${module.id}-concepts')">Key Concepts</button>
<button class="tab-btn" onclick="switchTab(event, '${module.id}-visualization')">๐ Visualization</button>
<button class="tab-btn" onclick="switchTab(event, '${module.id}-math')">Math</button>
<button class="tab-btn" onclick="switchTab(event, '${module.id}-applications')">Applications</button>
<button class="tab-btn" onclick="switchTab(event, '${module.id}-summary')">Summary</button>
</div>
<div id="${module.id}-overview" class="tab active">
<div class="section">
<h2>๐ Overview</h2>
${content.overview || `
<p>Complete coverage of ${module.title.toLowerCase()}. Learn the fundamentals, mathematics, real-world applications, and implementation details.</p>
<div class="info-box">
<div class="box-title">Learning Objectives</div>
<div class="box-content">
โ Understand core concepts and theory<br>
โ Master mathematical foundations<br>
โ Learn practical applications<br>
โ Implement and experiment
</div>
</div>
`}
</div>
</div>
<div id="${module.id}-concepts" class="tab">
<div class="section">
<h2>๐ฏ Key Concepts</h2>
${content.concepts || `
<p>Fundamental concepts and building blocks for ${module.title.toLowerCase()}.</p>
<div class="callout insight">
<div class="callout-title">๐ก Main Ideas</div>
This section covers the core ideas you need to understand before diving into mathematics.
</div>
`}
</div>
</div>
<div id="${module.id}-visualization" class="tab">
<div class="section">
<h2>๐ Interactive Visualization</h2>
<p>Visual representation to help understand ${module.title.toLowerCase()} concepts intuitively.</p>
<div id="${module.id}-viz" class="viz-container">
<canvas id="${module.id}-canvas" width="800" height="400" style="border: 1px solid rgba(0, 212, 255, 0.3); border-radius: 8px; background: rgba(0, 212, 255, 0.02);"></canvas>
</div>
<div class="viz-controls">
<button onclick="drawVisualization('${module.id}')" class="btn-viz">๐ Refresh Visualization</button>
<button onclick="toggleVizAnimation('${module.id}')" class="btn-viz">โถ๏ธ Animate</button>
<button onclick="downloadViz('${module.id}')" class="btn-viz">โฌ๏ธ Save Image</button>
</div>
</div>
</div>
<div id="${module.id}-math" class="tab">
<div class="section">
<h2>๐ Mathematical Foundation</h2>
${content.math || `
<p>Rigorous mathematical treatment of ${module.title.toLowerCase()}.</p>
<div class="formula">
Mathematical formulas and derivations go here
</div>
`}
</div>
</div>
<div id="${module.id}-applications" class="tab">
<div class="section">
<h2>๐ Real-World Applications</h2>
${content.applications || `
<p>How ${module.title.toLowerCase()} is used in practice across different industries.</p>
<div class="info-box">
<div class="box-title">Use Cases</div>
<div class="box-content">
Common applications and practical examples
</div>
</div>
`}
</div>
</div>
<div id="${module.id}-summary" class="tab">
<div class="section">
<h2>โ Summary</h2>
<div class="info-box">
<div class="box-title">Key Takeaways</div>
<div class="box-content">
โ Essential concepts covered<br>
โ Mathematical foundations understood<br>
โ Real-world applications identified<br>
โ Ready for implementation
</div>
</div>
</div>
</div>
</div>
`;
}
/**
 * Populate the dashboard: one clickable card per entry in the global
 * `modules` array (appended to #modulesGrid) plus the corresponding
 * module detail view (appended to #modulesContainer).
 * Called once at startup; has DOM side effects only.
 */
function initDashboard() {
  const grid = document.getElementById("modulesGrid");
  const container = document.getElementById("modulesContainer");
  modules.forEach((module, index) => {
    const card = document.createElement("div");
    // Add staggered animation class (8 delay steps, cycled)
    const staggerClass = `stagger stagger-${(index % 8) + 1}`;
    card.className = `card hover-glow ${staggerClass}`;
    card.style.borderColor = module.color;
    card.onclick = () => switchTo(module.id + "-module");
    card.innerHTML = `
<div class="card-icon">${module.icon}</div>
<h3>${module.title}</h3>
<p>${module.description}</p>
<span class="category-label">${module.category}</span>
`;
    grid.appendChild(card);
    // Fix: the original `container.innerHTML += moduleHTML` re-serialized
    // and re-parsed ALL previously appended modules on every iteration
    // (O(n^2) over the catalogue) and reset any state on those nodes.
    // insertAdjacentHTML parses only the new fragment.
    container.insertAdjacentHTML('beforeend', createModuleHTML(module));
  });
}
/**
 * Navigate between top-level views: hide every dashboard/module panel and
 * activate only the element whose id equals `target`.
 * Silently no-ops on the activation step when the id does not exist.
 * @param {string} target - element id of the view to show.
 */
function switchTo(target) {
  const panels = document.querySelectorAll('.dashboard, .module');
  for (const panel of panels) {
    panel.classList.remove('active');
  }
  const next = document.getElementById(target);
  if (next) {
    next.classList.add('active');
  }
}
/**
 * Tab switcher for one module view: activates the clicked tab button and
 * its pane, then (after a short delay so the pane is laid out) triggers
 * the canvas visualization belonging to that pane.
 * @param {MouseEvent} e - click event from the inline onclick handler.
 * @param {string} tabId - pane id of the form "<moduleId>-<section>",
 *   e.g. "resnet-math" or "nn-basics-concepts".
 */
function switchTab(e, tabId) {
  const module = e.target.closest('.module');
  if (!module) return;
  module.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
  module.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
  const tab = document.getElementById(tabId);
  if (tab) tab.classList.add('active');
  e.target.classList.add('active');
  // Trigger visualization when tabs are clicked
  setTimeout(() => {
    // Fix: recover the module id by stripping the trailing "-<section>".
    // The original used tabId.split('-')[0], which truncated hyphenated
    // module ids ("nn-basics-math" -> "nn" instead of "nn-basics"), so
    // the draw* routines could never find "<id>-canvas" for those modules.
    const moduleId = tabId.slice(0, tabId.lastIndexOf('-'));
    if (tabId.endsWith('-concepts') || tabId.endsWith('-visualization')) {
      drawConceptsVisualization(moduleId);
    } else if (tabId.endsWith('-math')) {
      drawMathVisualization(moduleId);
    } else if (tabId.endsWith('-applications')) {
      drawApplicationVisualization(moduleId);
    }
  }, 150);
}
// Visualization Functions - Concepts Tab
/**
 * Clear a module's concept canvas and dispatch to the topic-specific
 * drawing routine; falls back to a generic placeholder for unknown ids.
 * @param {string} moduleId - module id; the canvas is looked up as
 *   `${moduleId}-canvas` (created by createModuleHTML).
 */
function drawConceptsVisualization(moduleId) {
const canvas = document.getElementById(moduleId + '-canvas');
if (!canvas) return; // pane not rendered yet, nothing to draw on
const ctx = canvas.getContext('2d');
// Reset to the page background color before drawing.
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.fillStyle = '#0f1419';
ctx.fillRect(0, 0, canvas.width, canvas.height);
// Topic -> renderer lookup table.
// NOTE(review): every identifier here is evaluated when the literal is
// built, so each draw* helper must be defined somewhere in this file or
// this function throws a ReferenceError — verify all referenced helpers
// (e.g. drawNeuronAnimation, drawBERTProcess) actually exist.
const vizMap = {
'nn-basics': drawNeuronAnimation,
'perceptron': drawDecisionBoundary,
'mlp': drawNetworkGraph,
'activation': drawActivationFunctions,
'weight-init': drawWeightDistribution,
'loss': drawLossLandscape,
'optimizers': drawConvergencePaths,
'backprop': drawGradientFlow,
'regularization': drawOverfitComparison,
'batch-norm': drawBatchNormalization,
'cv-intro': drawImageMatrix,
'conv-layer': drawConvolutionAnimation,
'pooling': drawPoolingDemo,
'cnn-basics': drawCNNArchitecture,
'viz-filters': drawLearnedFilters,
'lenet': drawLeNetArchitecture,
'alexnet': drawAlexNetArchitecture,
'vgg': drawVGGArchitecture,
'resnet': drawResNetArchitecture,
'inception': drawInceptionModule,
'mobilenet': drawMobileNetArchitecture,
'transfer-learning': drawTransferLearning,
'localization': drawBoundingBoxes,
'rcnn': drawRCNNPipeline,
'yolo': drawYOLOGrid,
'ssd': drawSSDDetector,
'semantic-seg': drawSemanticSegmentation,
'instance-seg': drawInstanceSegmentation,
'face-recog': drawFaceEmbeddings,
'autoencoders': drawAutoencoderArchitecture,
'gans': drawGANsGame,
'diffusion': drawDiffusionProcess,
'rnn': drawRNNUnrolled,
'transformers': drawAttentionMatrix,
'bert': drawBERTProcess,
'gpt': drawGPTGeneration,
'vit': drawVisionTransformer,
'gnn': drawGraphNetwork,
'seq2seq': drawSeq2SeqAttention,
'research-papers': drawDefaultVisualization
};
// Unknown module ids get the generic placeholder panel.
if (vizMap[moduleId]) {
vizMap[moduleId](ctx, canvas);
} else {
drawDefaultVisualization(ctx, canvas);
}
}
// Default Visualization
/**
 * Generic placeholder panel shown on the concepts canvas for modules
 * without a registered custom drawing routine.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawDefaultVisualization(ctx, canvas) {
  const cx = canvas.width / 2;
  const cy = canvas.height / 2;
  // Framed box centered on the canvas.
  const box = { x: cx - 120, y: cy - 60, w: 240, h: 120 };
  ctx.fillStyle = 'rgba(0, 212, 255, 0.2)';
  ctx.fillRect(box.x, box.y, box.w, box.h);
  ctx.strokeStyle = '#00d4ff';
  ctx.lineWidth = 2;
  ctx.strokeRect(box.x, box.y, box.w, box.h);
  // Three centered text lines: title, subtitle, hint.
  ctx.textAlign = 'center';
  ctx.fillStyle = '#00d4ff';
  ctx.font = 'bold 18px Arial';
  ctx.fillText('๐ Interactive Visualization', cx, cy - 20);
  ctx.font = '13px Arial';
  ctx.fillText('Custom visualization for this topic', cx, cy + 20);
  ctx.font = '11px Arial';
  ctx.fillStyle = '#00ff88';
  ctx.fillText('Click Refresh to render', cx, cy + 45);
}
// Default Math Visualization
/**
 * Generic placeholder panel for the math canvas of modules that have no
 * custom math drawing routine (orange theme).
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawDefaultMathVisualization(ctx, canvas) {
  const cx = canvas.width / 2;
  const cy = canvas.height / 2;
  // Framed box centered on the canvas.
  const box = { x: cx - 120, y: cy - 60, w: 240, h: 120 };
  ctx.fillStyle = 'rgba(255, 107, 53, 0.2)';
  ctx.fillRect(box.x, box.y, box.w, box.h);
  ctx.strokeStyle = '#ff6b35';
  ctx.lineWidth = 2;
  ctx.strokeRect(box.x, box.y, box.w, box.h);
  // Three centered text lines: title, subtitle, hint.
  ctx.textAlign = 'center';
  ctx.fillStyle = '#ff6b35';
  ctx.font = 'bold 18px Arial';
  ctx.fillText('๐ Mathematical Formulas', cx, cy - 20);
  ctx.font = '13px Arial';
  ctx.fillText('Visual equation derivations', cx, cy + 20);
  ctx.font = '11px Arial';
  ctx.fillStyle = '#00ff88';
  ctx.fillText('Click Visualize to render', cx, cy + 45);
}
// Default Application Visualization
/**
 * Generic placeholder panel for the applications canvas of modules that
 * have no custom application drawing routine (green theme).
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawDefaultApplicationVisualization(ctx, canvas) {
  const cx = canvas.width / 2;
  const cy = canvas.height / 2;
  // Framed box centered on the canvas.
  const box = { x: cx - 120, y: cy - 60, w: 240, h: 120 };
  ctx.fillStyle = 'rgba(0, 255, 136, 0.2)';
  ctx.fillRect(box.x, box.y, box.w, box.h);
  ctx.strokeStyle = '#00ff88';
  ctx.lineWidth = 2;
  ctx.strokeRect(box.x, box.y, box.w, box.h);
  // Three centered text lines: title, subtitle, hint.
  ctx.textAlign = 'center';
  ctx.fillStyle = '#00ff88';
  ctx.font = 'bold 18px Arial';
  ctx.fillText('๐ Real-World Applications', cx, cy - 20);
  ctx.font = '13px Arial';
  ctx.fillText('Practical use cases and examples', cx, cy + 20);
  ctx.font = '11px Arial';
  ctx.fillStyle = '#ffa500';
  ctx.fillText('Click Show Applications to render', cx, cy + 45);
}
// Activation Functions Visualization
/**
 * Plot ReLU, Sigmoid and Tanh on a shared grid with axes and a legend.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawActivationFunctions(ctx, canvas) {
  const width = canvas.width;
  const height = canvas.height;
  const centerX = width / 2;
  const centerY = height / 2;
  const scale = 40; // pixels per unit on both axes
  // Light vertical grid lines
  ctx.strokeStyle = 'rgba(0, 212, 255, 0.1)';
  ctx.lineWidth = 1;
  for (let i = -5; i <= 5; i += 1) {
    const x = centerX + i * scale;
    ctx.beginPath();
    ctx.moveTo(x, centerY - 5 * scale);
    ctx.lineTo(x, centerY + 5 * scale);
    ctx.stroke();
  }
  // Axes
  ctx.strokeStyle = '#00d4ff';
  ctx.lineWidth = 2;
  ctx.beginPath();
  ctx.moveTo(centerX - 6 * scale, centerY);
  ctx.lineTo(centerX + 6 * scale, centerY);
  ctx.stroke();
  ctx.beginPath();
  ctx.moveTo(centerX, centerY - 6 * scale);
  ctx.lineTo(centerX, centerY + 6 * scale);
  ctx.stroke();
  // Curves. Fix: sample on an integer index instead of `x += 0.1` — float
  // accumulation drifted past 5 and dropped the final sample, and the
  // first-point test `x === -5` relied on exact float equality.
  const functions = [
    { name: 'ReLU', color: '#ff6b35', fn: x => Math.max(0, x) },
    { name: 'Sigmoid', color: '#00ff88', fn: x => 1 / (1 + Math.exp(-x)) },
    { name: 'Tanh', color: '#ffa500', fn: x => Math.tanh(x) }
  ];
  const STEPS = 100; // samples across x in [-5, 5]
  functions.forEach(func => {
    ctx.strokeStyle = func.color;
    ctx.lineWidth = 2;
    ctx.beginPath();
    for (let i = 0; i <= STEPS; i++) {
      const x = -5 + (10 * i) / STEPS;
      const canvasX = centerX + x * scale;
      const canvasY = centerY - func.fn(x) * scale;
      if (i === 0) ctx.moveTo(canvasX, canvasY);
      else ctx.lineTo(canvasX, canvasY);
    }
    ctx.stroke();
  });
  // Legend: one color swatch + name per function
  ctx.font = 'bold 12px Arial';
  functions.forEach((func, i) => {
    ctx.fillStyle = func.color;
    ctx.fillRect(10, 10 + i * 20, 10, 10);
    ctx.fillStyle = '#e4e6eb';
    ctx.fillText(func.name, 25, 19 + i * 20);
  });
}
// Neural Network Graph
/**
 * Draw a small fully-connected network (2-3-3-1) with translucent edges
 * and labeled layers.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawNetworkGraph(ctx, canvas) {
  const layers = [2, 3, 3, 1]; // neurons per layer, input -> output
  const width = canvas.width;
  const height = canvas.height;
  const layerWidth = width / (layers.length + 1);
  ctx.fillStyle = 'rgba(0, 212, 255, 0.05)';
  ctx.fillRect(0, 0, width, height);
  // Fix: precompute every neuron position once. The original recomputed
  // the entire next layer's positions inside the per-neuron loop, and
  // accumulated a `neuronPositions` array it never read.
  const positionsByLayer = layers.map((count, layerIdx) =>
    Array.from({ length: count }, (_, i) => ({
      x: (layerIdx + 1) * layerWidth,
      y: (height / (count + 1)) * (i + 1),
    }))
  );
  positionsByLayer.forEach((positions, layerIdx) => {
    const next = positionsByLayer[layerIdx + 1];
    if (next) {
      // Edges to the following layer, drawn under the neuron dots.
      ctx.strokeStyle = 'rgba(0, 212, 255, 0.2)';
      ctx.lineWidth = 1;
      positions.forEach(p => {
        next.forEach(q => {
          ctx.beginPath();
          ctx.moveTo(p.x, p.y);
          ctx.lineTo(q.x, q.y);
          ctx.stroke();
        });
      });
    }
    // Neuron dots
    ctx.fillStyle = '#00d4ff';
    positions.forEach(p => {
      ctx.beginPath();
      ctx.arc(p.x, p.y, 8, 0, Math.PI * 2);
      ctx.fill();
    });
  });
  // Layer labels (aligned with the 4 fixed layers above)
  const labels = ['Input', 'Hidden 1', 'Hidden 2', 'Output'];
  ctx.fillStyle = '#e4e6eb';
  ctx.font = 'bold 12px Arial';
  ctx.textAlign = 'center';
  labels.forEach((label, i) => {
    ctx.fillText(label, layerWidth * (i + 1), height - 10);
  });
}
// Convolution Animation
/**
 * Convolution sketch: an input panel, a sliding 3x3-style filter whose
 * position oscillates with the clock, and a randomized 5x5 feature map.
 * Nondeterministic by design (Date.now / Math.random).
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawConvolutionAnimation(ctx, canvas) {
  const width = canvas.width;
  // Helper: 150x150 filled + outlined panel.
  const panel = (x, y, fill, stroke) => {
    ctx.fillStyle = fill;
    ctx.fillRect(x, y, 150, 150);
    ctx.strokeStyle = stroke;
    ctx.lineWidth = 2;
    ctx.strokeRect(x, y, 150, 150);
  };
  // Input image panel (left)
  panel(20, 20, 'rgba(0, 212, 255, 0.1)', '#00d4ff');
  // Sliding filter: x position oscillates over time
  const filterPos = 60 + Math.sin(Date.now() / 1000) * 40;
  ctx.fillStyle = 'rgba(255, 107, 53, 0.1)';
  ctx.fillRect(filterPos, 60, 60, 60);
  ctx.strokeStyle = '#ff6b35';
  ctx.lineWidth = 3;
  ctx.strokeRect(filterPos, 60, 60, 60);
  // Output panel (right)
  panel(width - 170, 20, 'rgba(0, 255, 136, 0.1)', '#00ff88');
  // Randomized 5x5 feature-map cells inside the output panel
  for (let col = 0; col < 5; col++) {
    for (let row = 0; row < 5; row++) {
      const alpha = (Math.random() * 100) / 100;
      ctx.fillStyle = `rgba(0, 212, 255, ${alpha})`;
      ctx.fillRect(width - 160 + col * 25, 30 + row * 25, 20, 20);
    }
  }
  // Labels
  ctx.fillStyle = '#e4e6eb';
  ctx.font = 'bold 12px Arial';
  ctx.textAlign = 'left';
  ctx.fillText('Input Image', 20, 190);
  ctx.fillText('Filter', filterPos, 140);
  ctx.fillText('Feature Map', width - 170, 190);
}
// Loss Landscape
/**
 * Heat map of a convex quadratic loss surface with a gradient-descent
 * path walking from an offset start point to the minimum at the center.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawLossLandscape(ctx, canvas) {
  const width = canvas.width;
  const height = canvas.height;
  // Colored 20px tiles: loss = nx^2 + ny^2 in normalized coordinates.
  for (let x = 0; x < width; x += 20) {
    for (let y = 0; y < height; y += 20) {
      const nx = (x - width / 2) / (width / 4);
      const ny = (y - height / 2) / (height / 4);
      const loss = nx * nx + ny * ny;
      const intensity = Math.min(255, loss * 50);
      ctx.fillStyle = `rgb(${intensity}, ${100}, ${255 - intensity})`;
      ctx.fillRect(x, y, 20, 20);
    }
  }
  // Descent path: move 15px toward the minimum on every step.
  // Fix: the original recomputed the step from the *start* point each
  // iteration and never advanced, so all 20 lineTo calls collapsed onto
  // one coordinate and no path was visible.
  ctx.strokeStyle = '#00ff88';
  ctx.lineWidth = 2;
  ctx.beginPath();
  let curX = width / 2 + 80;
  let curY = height / 2 + 80;
  ctx.moveTo(curX, curY);
  for (let i = 0; i < 20; i++) {
    const angle = Math.atan2(curY - height / 2, curX - width / 2);
    curX -= Math.cos(angle) * 15;
    curY -= Math.sin(angle) * 15;
    ctx.lineTo(curX, curY);
  }
  ctx.stroke();
  // Mark the global minimum
  ctx.fillStyle = '#00ff88';
  ctx.beginPath();
  ctx.arc(width / 2, height / 2, 8, 0, Math.PI * 2);
  ctx.fill();
}
// YOLO Grid
/**
 * YOLO-style 7x7 detection grid with two mock detections drawn as
 * confidence-tinted boxes labeled with their confidence percentage.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawYOLOGrid(ctx, canvas) {
  const width = canvas.width;
  const height = canvas.height;
  const gridSize = 7;
  const cellW = width / gridSize;
  const cellH = height / gridSize;
  // Grid lines: one vertical + one horizontal per index.
  ctx.strokeStyle = 'rgba(0, 212, 255, 0.3)';
  ctx.lineWidth = 1;
  for (let i = 0; i <= gridSize; i++) {
    ctx.beginPath();
    ctx.moveTo(i * cellW, 0);
    ctx.lineTo(i * cellW, height);
    ctx.stroke();
    ctx.beginPath();
    ctx.moveTo(0, i * cellH);
    ctx.lineTo(width, i * cellH);
    ctx.stroke();
  }
  // Mock detections in grid-cell units with confidences.
  const detections = [
    { x: 2, y: 2, w: 2, h: 2, conf: 0.95 },
    { x: 4, y: 5, w: 1.5, h: 1.5, conf: 0.87 }
  ];
  for (const { x, y, w, h, conf } of detections) {
    const px = x * cellW;
    const py = y * cellH;
    const pw = w * cellW;
    const ph = h * cellH;
    ctx.fillStyle = `rgba(255, 107, 53, ${conf * 0.5})`;
    ctx.fillRect(px, py, pw, ph);
    ctx.strokeStyle = '#ff6b35';
    ctx.lineWidth = 2;
    ctx.strokeRect(px, py, pw, ph);
    ctx.fillStyle = '#ff6b35';
    ctx.font = 'bold 12px Arial';
    ctx.fillText((conf * 100).toFixed(0) + '%', px + 5, py + 15);
  }
}
// Attention Matrix
/**
 * Toy 8x8 self-attention heat map: weight decays exponentially with the
 * distance between token positions; token labels w0..w7 along the bottom.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawAttentionMatrix(ctx, canvas) {
  const size = 8;
  const cell = Math.min(canvas.width, canvas.height) / size;
  for (let i = 0; i < size; i++) {
    for (let j = 0; j < size; j++) {
      // exp(-|i-j| / 2): 1 on the diagonal, fading off-diagonal
      const weight = Math.exp(-Math.abs(i - j) / 2);
      const level = Math.floor(weight * 255);
      ctx.fillStyle = `rgb(${level}, 100, ${200 - level})`;
      ctx.fillRect(i * cell, j * cell, cell, cell);
    }
  }
  // Token labels centered under each column
  ctx.fillStyle = '#e4e6eb';
  ctx.font = '10px Arial';
  ctx.textAlign = 'center';
  for (let i = 0; i < size; i++) {
    ctx.fillText('w' + i, i * cell + cell / 2, canvas.height - 5);
  }
}
// Math Visualization
/**
 * Render the math-tab canvas for a module, dispatching to a topic-specific
 * routine or the generic math placeholder.
 * @param {string} moduleId - canvas looked up as `${moduleId}-math-canvas`.
 */
function drawMathVisualization(moduleId) {
// NOTE(review): createModuleHTML (above) only emits `${id}-canvas` inside
// the visualization tab — no `${id}-math-canvas` element appears in the
// markup visible in this file, so this early return may always fire.
// Confirm the math canvases are created elsewhere.
const canvas = document.getElementById(moduleId + '-math-canvas');
if (!canvas) return;
const ctx = canvas.getContext('2d');
// Reset to the page background color before drawing.
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.fillStyle = '#0f1419';
ctx.fillRect(0, 0, canvas.width, canvas.height);
// Topic -> renderer lookup; entries are thunks closing over ctx/canvas.
const mathVizMap = {
'nn-basics': () => drawNNMath(ctx, canvas),
'activation': () => drawActivationDerivatives(ctx, canvas),
'loss': () => drawLossComparison(ctx, canvas),
'optimizers': () => drawOptimizerSteps(ctx, canvas),
'backprop': () => drawChainRule(ctx, canvas),
'conv-layer': () => drawConvolutionMath(ctx, canvas),
'pooling': () => drawPoolingMath(ctx, canvas),
'regularization': () => drawRegularizationMath(ctx, canvas),
'transformers': () => drawAttentionMath(ctx, canvas),
'rnn': () => drawRNNMath(ctx, canvas),
'gnn': () => drawGNNMath(ctx, canvas)
};
// Unknown module ids get the generic math placeholder panel.
if (mathVizMap[moduleId]) {
mathVizMap[moduleId]();
} else {
drawDefaultMathVisualization(ctx, canvas);
}
}
// Application Visualization
/**
 * Render the applications-tab canvas for a module, dispatching to a
 * topic-specific routine or the generic applications placeholder.
 * @param {string} moduleId - canvas looked up as `${moduleId}-app-canvas`.
 */
function drawApplicationVisualization(moduleId) {
// NOTE(review): as with the math canvas, no `${id}-app-canvas` element is
// generated by the markup visible in this file, so this early return may
// always fire. Confirm the app canvases are created elsewhere.
const canvas = document.getElementById(moduleId + '-app-canvas');
if (!canvas) return;
const ctx = canvas.getContext('2d');
// Reset to the page background color before drawing.
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.fillStyle = '#0f1419';
ctx.fillRect(0, 0, canvas.width, canvas.height);
// Topic -> renderer lookup; entries are thunks closing over ctx/canvas.
const appVizMap = {
'nn-basics': () => drawNNApplications(ctx, canvas),
'cnn-basics': () => drawCNNApplications(ctx, canvas),
'conv-layer': () => drawConvolutionApplications(ctx, canvas),
'yolo': () => drawYOLOApplications(ctx, canvas),
'semantic-seg': () => drawSegmentationApplications(ctx, canvas),
'instance-seg': () => drawInstanceSegmentationApps(ctx, canvas),
'face-recog': () => drawFaceRecognitionApps(ctx, canvas),
'transformers': () => drawTransformerApps(ctx, canvas),
'bert': () => drawBERTApplications(ctx, canvas),
'gpt': () => drawGPTApplications(ctx, canvas),
'gans': () => drawGANApplications(ctx, canvas),
'diffusion': () => drawDiffusionApplications(ctx, canvas),
'gnn': () => drawGNNApplications(ctx, canvas)
};
// Unknown module ids get the generic applications placeholder panel.
if (appVizMap[moduleId]) {
appVizMap[moduleId]();
} else {
drawDefaultApplicationVisualization(ctx, canvas);
}
}
// Math visualization helper functions
/**
 * Text panel summarizing the forward pass of a single neural layer.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawNNMath(ctx, canvas) {
  const midX = canvas.width / 2;
  ctx.textAlign = 'center';
  // Title, then two explanatory lines, each in its own color.
  ctx.fillStyle = '#00d4ff';
  ctx.font = 'bold 18px Arial';
  ctx.fillText('Forward Pass: y = ฯ(Wx + b)', midX, 50);
  ctx.font = '14px Arial';
  ctx.fillStyle = '#00ff88';
  ctx.fillText('Linear combination + Non-linearity', midX, 100);
  ctx.fillStyle = '#ffa500';
  ctx.fillText('W: weights, b: bias, ฯ: activation', midX, 150);
}
/**
 * Plot the sigmoid derivative ฯ'(x) = ฯ(x)(1-ฯ(x)) over x in [-5, 5],
 * vertically magnified x10 so the 0.25 peak is visible.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawActivationDerivatives(ctx, canvas) {
  const width = canvas.width;
  const height = canvas.height;
  const centerX = width / 2;
  const centerY = height / 2;
  const scale = 40; // pixels per unit
  // Light vertical grid lines
  ctx.strokeStyle = 'rgba(0, 212, 255, 0.2)';
  ctx.lineWidth = 1;
  for (let i = -5; i <= 5; i += 1) {
    ctx.beginPath();
    ctx.moveTo(centerX + i * scale, centerY - 5 * scale);
    ctx.lineTo(centerX + i * scale, centerY + 5 * scale);
    ctx.stroke();
  }
  // Curve. Fixes vs original: sigmoid is computed once per sample (the
  // original evaluated 1/(1+exp(-x)) twice), and sampling uses an integer
  // index instead of `x += 0.1` (float accumulation drifted past 5 and
  // the first-point test `x === -5` relied on exact float equality).
  ctx.strokeStyle = '#00ff88';
  ctx.lineWidth = 3;
  ctx.beginPath();
  const STEPS = 100;
  for (let i = 0; i <= STEPS; i++) {
    const x = -5 + (10 * i) / STEPS;
    const s = 1 / (1 + Math.exp(-x));
    const y = s * (1 - s);
    const canvasX = centerX + x * scale;
    const canvasY = centerY - y * scale * 10; // x10: derivative peaks at 0.25
    if (i === 0) ctx.moveTo(canvasX, canvasY);
    else ctx.lineTo(canvasX, canvasY);
  }
  ctx.stroke();
  ctx.fillStyle = '#00ff88';
  ctx.font = 'bold 14px Arial';
  ctx.textAlign = 'center';
  ctx.fillText("Sigmoid Derivative: ฯ'(x) = ฯ(x)(1-ฯ(x))", canvas.width / 2, 30);
}
/**
 * Side-by-side comparison panels for the two canonical losses:
 * MSE (regression, left) vs Cross-Entropy (classification, right).
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawLossComparison(ctx, canvas) {
  const width = canvas.width;
  const height = canvas.height;
  // Fix: set the alignment explicitly. The original never set textAlign
  // and so inherited whatever the previously-run draw routine left on the
  // shared 2D context (e.g. 'left' after drawConvolutionAnimation), even
  // though the text coordinates below are clearly chosen for centering.
  ctx.textAlign = 'center';
  // MSE
  ctx.fillStyle = 'rgba(0, 212, 255, 0.2)';
  ctx.fillRect(20, 60, width / 2 - 30, height - 100);
  ctx.fillStyle = '#00d4ff';
  ctx.font = 'bold 14px Arial';
  ctx.fillText('MSE Loss', width / 4, 45);
  ctx.font = '12px Arial';
  ctx.fillText('L = (1/n)ฮฃ(y-ลท)ยฒ', width / 4, 90);
  ctx.fillText('Regression', width / 4, 115);
  // Cross-Entropy
  ctx.fillStyle = 'rgba(255, 107, 53, 0.2)';
  ctx.fillRect(width / 2 + 10, 60, width / 2 - 30, height - 100);
  ctx.fillStyle = '#ff6b35';
  ctx.font = 'bold 14px Arial';
  ctx.fillText('Cross-Entropy Loss', width * 3 / 4, 45);
  ctx.font = '12px Arial';
  ctx.fillText('L = -ฮฃ(yยทlog(ลท))', width * 3 / 4, 90);
  ctx.fillText('Classification', width * 3 / 4, 115);
}
/**
 * Three-column text summary of optimizer update rules:
 * SGD, Momentum, and Adam.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawOptimizerSteps(ctx, canvas) {
  // Cleanup: the original also read canvas.height into an unused
  // `centerY` local; only the width is needed for the column layout.
  const width = canvas.width;
  ctx.textAlign = 'center';
  ctx.fillStyle = '#00d4ff';
  ctx.font = 'bold 16px Arial';
  ctx.fillText('SGD', width / 4, 50);
  ctx.font = '12px Arial';
  ctx.fillText('w = w - ฮฑยทโL', width / 4, 100);
  ctx.fillStyle = '#00ff88';
  ctx.font = 'bold 16px Arial';
  ctx.fillText('Momentum', width / 2, 50);
  ctx.font = '12px Arial';
  ctx.fillText('v = ฮฒยทv + (1-ฮฒ)ยทโL', width / 2, 100);
  ctx.fillStyle = '#ffa500';
  ctx.font = 'bold 16px Arial';
  ctx.fillText('Adam', width * 3 / 4, 50);
  ctx.font = '12px Arial';
  ctx.fillText('Adaptive learning rate', width * 3 / 4, 100);
}
/**
 * Text panel showing the chain rule as applied by backpropagation.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawChainRule(ctx, canvas) {
  const midX = canvas.width / 2;
  // Title plus two explanation rows, each with its own color/font/row.
  const rows = [
    { color: '#00ff88', font: 'bold 16px Arial', text: 'Backpropagation Chain Rule', y: 50 },
    { color: '#00d4ff', font: '12px Arial', text: 'dL/dW = dL/dลท ยท dลท/da ยท da/dz ยท dz/dW', y: 100 },
    { color: '#ffa500', font: '12px Arial', text: 'Compute gradient by multiplying partial derivatives', y: 150 },
  ];
  ctx.textAlign = 'center';
  for (const { color, font, text, y } of rows) {
    ctx.fillStyle = color;
    ctx.font = font;
    ctx.fillText(text, midX, y);
  }
}
function drawConvolutionMath(ctx, canvas) {
    // Title, the discrete 2-D convolution formula, and a one-line intuition.
    const midX = canvas.width / 2;
    ctx.fillStyle = '#ff6b35';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Convolution Operation', midX, 50);
    ctx.font = '12px Arial';
    ctx.fillStyle = '#00d4ff';
    ctx.fillText('y[i,j] = ฮฃ ฮฃ w[m,n] * x[i+m,j+n] + b', midX, 100);
    ctx.fillStyle = '#00ff88';
    ctx.fillText('Sliding window element-wise multiplication and summation', midX, 150);
}
function drawPoolingMath(ctx, canvas) {
    // Two columns (max vs average pooling) plus a shared footnote.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const columns = [
        { color: '#00ff88', title: 'Max Pooling', formula: 'y = max(neighborhood)', x: w / 3 },
        { color: '#00d4ff', title: 'Average Pooling', formula: 'y = avg(neighborhood)', x: w * 2 / 3 },
    ];
    for (const col of columns) {
        ctx.fillStyle = col.color;
        ctx.font = 'bold 14px Arial';
        ctx.fillText(col.title, col.x, 50);
        ctx.font = '12px Arial';
        ctx.fillText(col.formula, col.x, 100);
    }
    ctx.fillStyle = '#ffa500';
    ctx.font = '11px Arial';
    ctx.fillText('Reduces spatial dimensions', w / 2, 150);
}
function drawRegularizationMath(ctx, canvas) {
    // L1 and L2 penalty formulas with a shared explanation line.
    const midX = canvas.width / 2;
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    const rows = [
        { color: '#00d4ff', text: 'L1 Regularization: L = Loss + ฮปฮฃ|w|', y: 60 },
        { color: '#00ff88', text: 'L2 Regularization: L = Loss + ฮปฮฃwยฒ', y: 110 },
        { color: '#ffa500', text: 'Prevents overfitting by penalizing large weights', y: 160 },
    ];
    for (const r of rows) {
        ctx.fillStyle = r.color;
        ctx.fillText(r.text, midX, r.y);
    }
}
function drawAttentionMath(ctx, canvas) {
    // Scaled dot-product attention: title, formula, and intuition.
    const midX = canvas.width / 2;
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Attention Mechanism', midX, 50);
    ctx.font = '12px Arial';
    ctx.fillStyle = '#00ff88';
    ctx.fillText('Attention(Q,K,V) = softmax(QK^T/โd_k) ยท V', midX, 100);
    ctx.fillStyle = '#ffa500';
    ctx.fillText('Query-Key matching determines how much to focus on each value', midX, 150);
}
function drawRNNMath(ctx, canvas) {
    // RNN recurrence: title, hidden-state update formula, and a one-line gloss.
    const midX = canvas.width / 2;
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('RNN Hidden State Update', midX, 50);
    ctx.font = '12px Arial';
    ctx.fillStyle = '#00ff88';
    ctx.fillText('h_t = ฯ(W_hยทh_(t-1) + W_xยทx_t + b)', midX, 100);
    ctx.fillStyle = '#ffa500';
    ctx.fillText('Processes sequences step-by-step with recurrent connections', midX, 150);
}
// Application visualization helper functions
function drawNNApplications(ctx, canvas) {
    // Real-world neural-network use cases: headline row (bold) and detail row.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ฑ Stock Price Prediction', w / 4, 60],
        ['#00ff88', 'bold 14px Arial', '๐ฅ Medical Diagnosis', w / 2, 60],
        ['#ffa500', 'bold 14px Arial', '๐ฎ Game AI', w * 3 / 4, 60],
        ['#ff6b35', '12px Arial', 'Fraud Detection', w / 4, 120],
        ['#00d4ff', '12px Arial', 'Recommendation Systems', w / 2, 120],
        ['#00ff88', '12px Arial', 'Credit Scoring', w * 3 / 4, 120],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawCNNApplications(ctx, canvas) {
    // Headline CNN use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', 'Image Classification', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', 'Object Detection', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Deep Learning Backbone', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawConvolutionApplications(ctx, canvas) {
    // Convolution use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ท Image Feature Extraction', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', '๐ Edge Detection', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Foundation of Computer Vision', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawYOLOApplications(ctx, canvas) {
    // YOLO use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#ff6b35', 'bold 14px Arial', '๐ Autonomous Driving', w / 3, 60],
        ['#00d4ff', 'bold 14px Arial', '๐น Real-time Video Detection', w * 2 / 3, 60],
        ['#00ff88', '12px Arial', 'Ultra-fast inference for live applications', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawSegmentationApplications(ctx, canvas) {
    // Semantic-segmentation use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ฅ Medical Imaging', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', '๐ Autonomous Vehicles', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Pixel-level understanding of scenes', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawInstanceSegmentationApps(ctx, canvas) {
    // Instance-segmentation use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ฅ Person Detection & Tracking', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', '๐ Object Instance Counting', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Separates overlapping objects', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawFaceRecognitionApps(ctx, canvas) {
    // Face-recognition use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#ffa500', 'bold 14px Arial', '๐ฑ Phone Unlock', w / 3, 60],
        ['#00d4ff', 'bold 14px Arial', '๐ Security Systems', w * 2 / 3, 60],
        ['#00ff88', '12px Arial', 'Identity verification and access control', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawTransformerApps(ctx, canvas) {
    // Transformer use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ฌ ChatGPT / LLMs', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', '๐ Machine Translation', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Foundation of modern NLP and beyond', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawBERTApplications(ctx, canvas) {
    // BERT use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#00d4ff', 'bold 14px Arial', '๐ Semantic Search', w / 3, 60],
        ['#00ff88', 'bold 14px Arial', 'โ Question Answering', w * 2 / 3, 60],
        ['#ffa500', '12px Arial', 'Deep language understanding', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawGPTApplications(ctx, canvas) {
    // GPT use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#ff6b35', 'bold 14px Arial', 'โ๏ธ Text Generation', w / 3, 60],
        ['#00d4ff', 'bold 14px Arial', '๐ก Idea Assistance', w * 2 / 3, 60],
        ['#00ff88', '12px Arial', 'Powerful autoregressive language models', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawGANApplications(ctx, canvas) {
    // GAN use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#ff6b35', 'bold 14px Arial', '๐จ Image Generation', w / 3, 60],
        ['#00d4ff', 'bold 14px Arial', '๐ญ Style Transfer', w * 2 / 3, 60],
        ['#00ff88', '12px Arial', 'Creative content generation and enhancement', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
function drawDiffusionApplications(ctx, canvas) {
    // Diffusion-model use cases plus a summary footer.
    const w = canvas.width;
    ctx.textAlign = 'center';
    const items = [
        ['#ffa500', 'bold 14px Arial', '๐ผ๏ธ Image Synthesis', w / 3, 60],
        ['#00d4ff', 'bold 14px Arial', '๐ฌ Stable Diffusion', w * 2 / 3, 60],
        ['#00ff88', '12px Arial', 'State-of-the-art generative AI', w / 2, 150],
    ];
    for (const [color, font, label, x, y] of items) {
        ctx.fillStyle = color;
        ctx.font = font;
        ctx.fillText(label, x, y);
    }
}
// Missing visualization stub functions
// Stub: the neuron-level view currently reuses the static network graph.
function drawNeuronAnimation(ctx, canvas) {
    drawNetworkGraph(ctx, canvas);
}
function drawDecisionBoundary(ctx, canvas) {
    // Horizontal decision boundary through the canvas middle, with 20 random
    // sample points colored by which side of the boundary they fall on.
    // Fix: removed unused local `centerX` from the original.
    const centerY = canvas.height / 2;
    ctx.strokeStyle = '#ff6b35';
    ctx.lineWidth = 3;
    ctx.beginPath();
    ctx.moveTo(0, centerY);
    ctx.lineTo(canvas.width, centerY);
    ctx.stroke();
    // Random scatter; class (color) is determined by position, not a model.
    for (let i = 0; i < 20; i++) {
        const x = Math.random() * canvas.width;
        const y = Math.random() * canvas.height;
        ctx.fillStyle = y < centerY ? '#00d4ff' : '#00ff88';
        ctx.beginPath();
        ctx.arc(x, y, 5, 0, Math.PI * 2);
        ctx.fill();
    }
}
function drawWeightDistribution(ctx, canvas) {
    // Bell curve centered on the canvas, labelled as an init-scheme weight distribution.
    const cx = canvas.width / 2;
    const cy = canvas.height / 2;
    ctx.strokeStyle = '#00d4ff';
    ctx.lineWidth = 2;
    ctx.beginPath();
    let firstPoint = true;
    for (let x = -100; x <= 100; x += 2) {
        const px = cx + x;
        const py = cy - Math.exp(-(x * x) / 500) * 80;
        if (firstPoint) {
            ctx.moveTo(px, py);
            firstPoint = false;
        } else {
            ctx.lineTo(px, py);
        }
    }
    ctx.stroke();
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Weight Distribution (Xavier/He Init)', cx, 50);
}
// Stub: the convergence-path view reuses the loss-landscape drawing.
function drawConvergencePaths(ctx, canvas) {
    drawLossLandscape(ctx, canvas);
}
// Stub: the gradient-flow view reuses the chain-rule diagram.
function drawGradientFlow(ctx, canvas) {
    drawChainRule(ctx, canvas);
}
function drawOverfitComparison(ctx, canvas) {
    // Left half: jagged curve chasing noise (no regularization).
    // Right half: smooth, lower-amplitude curve (regularized).
    const w = canvas.width;
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Without Regularization', w / 4, 40);
    ctx.fillStyle = '#ff6b35';
    ctx.fillText('With Regularization', w * 3 / 4, 40);
    ctx.lineWidth = 2;
    // Plot a curve over [0, w/2 - 20), offset horizontally into its half.
    const plotCurve = (color, offsetX, yAt) => {
        ctx.strokeStyle = color;
        ctx.beginPath();
        for (let x = 0; x < w / 2 - 20; x += 5) {
            const y = yAt(x);
            if (x === 0) ctx.moveTo(x + offsetX, y);
            else ctx.lineTo(x + offsetX, y);
        }
        ctx.stroke();
    };
    plotCurve('#00d4ff', 20, (x) => 100 + Math.sin(x / 10) * 30 + Math.random() * 20);
    plotCurve('#ff6b35', w / 2 + 20, (x) => 100 + Math.sin(x / 20) * 15);
}
function drawBatchNormalization(ctx, canvas) {
    // Title plus labels for the before/after distributions.
    const w = canvas.width;
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillStyle = '#00d4ff';
    ctx.fillText('Batch Normalization: ฮผ=0, ฯยฒ=1', w / 2, 50);
    ctx.fillStyle = '#ffa500';
    ctx.fillText('Input Distribution', w / 4, 100);
    ctx.fillStyle = '#00ff88';
    ctx.fillText('Normalized Distribution', w * 3 / 4, 100);
}
function drawImageMatrix(ctx, canvas) {
    // 10x10 grid of random-intensity cells, illustrating an image as a matrix.
    const cell = 20;
    for (let col = 0; col < 10; col++) {
        for (let row = 0; row < 10; row++) {
            ctx.fillStyle = `rgba(0, 212, 255, ${Math.random()})`;
            ctx.fillRect(col * cell + 100, row * cell + 100, cell, cell);
        }
    }
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Image as Matrix (Pixel Values)', canvas.width / 2, 50);
}
function drawPoolingDemo(ctx, canvas) {
    // 2x2 max pooling on a 4x4 input: input matrix on the left, pooled output
    // on the right.
    const cellSize = 30;
    const matrix = [[12, 20, 30, 0], [8, 12, 2, 0], [34, 70, 37, 4], [112, 100, 25, 12]];
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Max Pooling Demo (2x2)', canvas.width / 2, 30);
    // Draw input matrix
    for (let i = 0; i < 4; i++) {
        for (let j = 0; j < 4; j++) {
            ctx.strokeStyle = '#00d4ff';
            ctx.strokeRect(50 + j * cellSize, 50 + i * cellSize, cellSize, cellSize);
            ctx.fillStyle = '#e4e6eb';
            ctx.font = '10px Arial';
            ctx.fillText(matrix[i][j], 50 + j * cellSize + cellSize / 2, 50 + i * cellSize + cellSize / 2 + 4);
        }
    }
    // Fix: derive the pooled output from `matrix` instead of hard-coding it,
    // so the demo stays correct if the input values ever change.
    const pooled = Array.from({ length: 2 }, (_, i) =>
        Array.from({ length: 2 }, (_, j) => Math.max(
            matrix[2 * i][2 * j], matrix[2 * i][2 * j + 1],
            matrix[2 * i + 1][2 * j], matrix[2 * i + 1][2 * j + 1]
        ))
    );
    // Draw output (max pooled)
    for (let i = 0; i < 2; i++) {
        for (let j = 0; j < 2; j++) {
            ctx.strokeStyle = '#00ff88';
            ctx.strokeRect(250 + j * cellSize * 1.5, 70 + i * cellSize * 1.5, cellSize * 1.5, cellSize * 1.5);
            ctx.fillStyle = '#00ff88';
            ctx.font = 'bold 12px Arial';
            ctx.fillText(pooled[i][j], 250 + j * cellSize * 1.5 + cellSize * 0.75, 70 + i * cellSize * 1.5 + cellSize * 0.75 + 5);
        }
    }
}
function drawCNNArchitecture(ctx, canvas) {
    // Classic CNN pipeline (Input -> Conv/Pool x2 -> FC -> Output), drawn as
    // labelled blocks.
    // Fix: the original duplicated the x-positions in two parallel lists
    // (labels and `blocks`); use a single stage list as the source of truth.
    const stages = ['Input', 'Conv', 'Pool', 'Conv', 'Pool', 'FC', 'Output'];
    const xAt = (i) => 60 + i * 100;
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 12px Arial';
    ctx.textAlign = 'center';
    stages.forEach((label, i) => {
        ctx.fillText(label, xAt(i), 200);
    });
    stages.forEach((_, i) => {
        // Input is tallest (100); middle stages taper (80 - 10*i); FC/Output are 60.
        const height = i === 0 ? 100 : (i < stages.length - 2 ? 80 - i * 10 : 60);
        ctx.strokeStyle = '#00d4ff';
        ctx.strokeRect(xAt(i) - 30, 100, 60, height);
    });
}
function drawLearnedFilters(ctx, canvas) {
    // Four labelled 3x3 random "filters", from low-level edges to objects.
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('CNN Learned Filters', canvas.width / 2, 30);
    ['Edges', 'Textures', 'Patterns', 'Objects'].forEach((label, idx) => {
        const cx = (idx + 1) * canvas.width / 5;
        ctx.fillStyle = '#ff6b35';
        ctx.font = 'bold 12px Arial';
        ctx.fillText(label, cx, 80);
        // Random 3x3 weight grid standing in for the learned kernel.
        for (let row = 0; row < 3; row++) {
            for (let col = 0; col < 3; col++) {
                ctx.fillStyle = `rgba(0, 212, 255, ${Math.random()})`;
                ctx.fillRect(cx - 20 + col * 12, 100 + row * 12, 10, 10);
            }
        }
    });
}
// Architecture stubs: each reuses the generic CNN pipeline diagram until a
// dedicated per-architecture visualization is implemented.
function drawLeNetArchitecture(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawAlexNetArchitecture(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawVGGArchitecture(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawResNetArchitecture(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawInceptionModule(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawMobileNetArchitecture(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawTransferLearning(ctx, canvas) { drawCNNArchitecture(ctx, canvas); }
function drawBoundingBoxes(ctx, canvas) {
    // Faint backdrop standing in for the analyzed image, plus two labelled
    // detection boxes with confidence scores.
    ctx.fillStyle = 'rgba(0, 212, 255, 0.1)';
    ctx.fillRect(50, 50, 300, 300);
    ctx.lineWidth = 3;
    ctx.font = 'bold 12px Arial';
    const detections = [
        { color: '#ff6b35', box: [100, 100, 150, 150], label: 'Dog 95%', lx: 105, ly: 95 },
        { color: '#00ff88', box: [180, 200, 100, 80], label: 'Cat 87%', lx: 185, ly: 195 },
    ];
    for (const d of detections) {
        ctx.strokeStyle = d.color;
        ctx.strokeRect(...d.box);
        ctx.fillStyle = d.color;
        ctx.fillText(d.label, d.lx, d.ly);
    }
}
// Detector stubs: both reuse the generic bounding-box demo for now.
function drawRCNNPipeline(ctx, canvas) { drawBoundingBoxes(ctx, canvas); }
function drawSSDDetector(ctx, canvas) { drawBoundingBoxes(ctx, canvas); }
function drawSemanticSegmentation(ctx, canvas) {
    // 20x20 mosaic of randomly chosen class colors: every cell gets a label,
    // illustrating pixel-wise classification.
    const cell = 15;
    const palette = ['rgba(0, 212, 255, 0.5)', 'rgba(255, 107, 53, 0.5)', 'rgba(0, 255, 136, 0.5)'];
    for (let col = 0; col < 20; col++) {
        for (let row = 0; row < 20; row++) {
            ctx.fillStyle = palette[Math.floor(Math.random() * palette.length)];
            ctx.fillRect(col * cell + 100, row * cell + 50, cell, cell);
        }
    }
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Pixel-wise Classification', canvas.width / 2, 30);
}
// Stub: instance segmentation currently reuses the semantic-segmentation mosaic.
function drawInstanceSegmentation(ctx, canvas) { drawSemanticSegmentation(ctx, canvas); }
function drawFaceEmbeddings(ctx, canvas) {
    // Five randomly placed points standing in for face embeddings in 2-D space.
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Face Embedding Space', canvas.width / 2, 30);
    for (let i = 0; i < 5; i++) {
        const px = 100 + Math.random() * (canvas.width - 200);
        const py = 100 + Math.random() * 200;
        ctx.fillStyle = '#00d4ff';
        ctx.beginPath();
        ctx.arc(px, py, 10, 0, Math.PI * 2);
        ctx.fill();
    }
}
function drawAutoencoderArchitecture(ctx, canvas) {
    // Hourglass layout: wide input/output blocks narrowing through the latent
    // bottleneck in the middle.
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 12px Arial';
    ctx.textAlign = 'center';
    const stages = ['Input', 'Encoder', 'Latent', 'Decoder', 'Output'];
    stages.forEach((label, idx) => {
        const x = (idx + 1) * canvas.width / 6;
        ctx.fillText(label, x, 50);
        // Bottleneck (Latent) is shortest; the two ends are tallest.
        let height = 70;
        if (idx === 2) height = 40;
        else if (idx === 0 || idx === 4) height = 100;
        ctx.strokeStyle = '#00d4ff';
        ctx.strokeRect(x - 30, 100, 60, height);
    });
}
function drawGANsGame(ctx, canvas) {
    // Generator and discriminator boxes with an arrow for the sample flow.
    const genX = canvas.width / 3;
    const discX = canvas.width * 2 / 3;
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillStyle = '#ff6b35';
    ctx.fillText('Generator', genX, 50);
    ctx.fillStyle = '#00d4ff';
    ctx.fillText('Discriminator', discX, 50);
    // Generator box
    ctx.strokeStyle = '#ff6b35';
    ctx.strokeRect(genX - 50, 100, 100, 100);
    // Discriminator box
    ctx.strokeStyle = '#00d4ff';
    ctx.strokeRect(discX - 50, 100, 100, 100);
    // Flow arrow: generated samples feed the discriminator.
    ctx.strokeStyle = '#00ff88';
    ctx.lineWidth = 2;
    ctx.beginPath();
    ctx.moveTo(genX + 50, 150);
    ctx.lineTo(discX - 50, 150);
    ctx.stroke();
}
function drawDiffusionProcess(ctx, canvas) {
    // Row of boxes fading from pure noise (left, transparent) to a clear
    // image (right, opaque).
    const steps = 5;
    const stepWidth = canvas.width / (steps + 1);
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Diffusion Process: From Noise to Image', canvas.width / 2, 30);
    for (let step = 0; step < steps; step++) {
        const x = (step + 1) * stepWidth;
        const noise = 1 - (step / steps);
        ctx.fillStyle = `rgba(0, 212, 255, ${1 - noise})`;
        ctx.fillRect(x - 40, 100, 80, 80);
        ctx.strokeStyle = '#00d4ff';
        ctx.strokeRect(x - 40, 100, 80, 80);
    }
}
function drawRNNUnrolled(ctx, canvas) {
    // Time-unrolled RNN: five cells joined by hidden-state arrows.
    const cells = 5;
    const cellWidth = canvas.width / (cells + 1);
    ctx.fillStyle = '#e4e6eb';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Unrolled RNN', canvas.width / 2, 30);
    for (let t = 0; t < cells; t++) {
        const x = (t + 1) * cellWidth;
        ctx.strokeStyle = '#00d4ff';
        ctx.strokeRect(x - 30, 100, 60, 60);
        // Recurrent link to the next timestep (none after the last cell).
        if (t < cells - 1) {
            ctx.strokeStyle = '#ff6b35';
            ctx.lineWidth = 2;
            ctx.beginPath();
            ctx.moveTo(x + 30, 130);
            ctx.lineTo(x + cellWidth - 30, 130);
            ctx.stroke();
        }
    }
}
// Transformer-family stubs: all three reuse the attention-matrix visualization.
function drawBERTProcess(ctx, canvas) { drawAttentionMatrix(ctx, canvas); }
function drawGPTGeneration(ctx, canvas) { drawAttentionMatrix(ctx, canvas); }
function drawVisionTransformer(ctx, canvas) { drawAttentionMatrix(ctx, canvas); }
// Generic entry point used by module cards; delegates to the concept renderer.
function drawVisualization(moduleId) {
    drawConceptsVisualization(moduleId);
}
// Animation and download utilities
// Handle returned by requestAnimationFrame for the running visualization
// loop, or null when idle; toggleVizAnimation() uses it to cancel the loop.
let animationFrameId = null;
function toggleVizAnimation(moduleId, evt = window.event) {
    // Start/stop the animation loop for a module's visualization canvas and
    // update the triggering button's label/background accordingly.
    // Fix: the original read the implicit global `event`, which is
    // non-standard. `evt` is optional for backward compatibility — existing
    // inline handlers call toggleVizAnimation(id) and still get window.event;
    // new callers should pass the event explicitly.
    const btn = evt.target;
    window.vizAnimating = !window.vizAnimating;
    if (window.vizAnimating) {
        btn.textContent = 'โน๏ธ Stop';
        btn.style.background = 'linear-gradient(135deg, #ff4444, #cc0000)';
        animateVisualization(moduleId);
    } else {
        btn.textContent = 'โถ๏ธ Animate';
        btn.style.background = '';
        // Cancel any frame already scheduled by animateVisualization().
        if (animationFrameId) {
            cancelAnimationFrame(animationFrameId);
            animationFrameId = null;
        }
    }
}
function animateVisualization(moduleId) {
    // Paint one frame on the module's canvas, then schedule the next frame
    // until window.vizAnimating is cleared by toggleVizAnimation().
    if (!window.vizAnimating) return;
    const canvas = document.getElementById(moduleId + '-canvas');
    if (!canvas) return;
    const ctx = canvas.getContext('2d');
    // Repaint the dark background from scratch each frame.
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.fillStyle = '#0f1419';
    ctx.fillRect(0, 0, canvas.width, canvas.height);
    // Module-specific animated renderers; anything unlisted gets the default
    // pulsing-network animation.
    const animatedVizMap = {
        'nn-basics': drawAnimatedNetwork,
        'perceptron': drawAnimatedDecisionBoundary,
        'mlp': drawAnimatedMLP,
        'activation': drawAnimatedActivations,
        'conv-layer': drawAnimatedConvolution,
        'gnn': drawAnimatedGNN,
        'transformers': drawAnimatedAttention,
        'backprop': drawAnimatedGradientFlow,
        'gans': drawAnimatedGAN,
        'diffusion': drawAnimatedDiffusion,
        'rnn': drawAnimatedRNN
    };
    const renderer = animatedVizMap[moduleId] || drawDefaultAnimation;
    renderer(ctx, canvas, Date.now());
    animationFrameId = requestAnimationFrame(() => animateVisualization(moduleId));
}
// Default animation for modules without specific animations
function drawDefaultAnimation(ctx, canvas, time) {
    // Pulsing feed-forward network (3-4-4-2) with animated signal dots
    // travelling along every connection. `time` is a millisecond timestamp.
    // Fix: removed unused local `centerY` from the original.
    const centerX = canvas.width / 2;
    const pulse = Math.sin(time / 300) * 0.3 + 0.7; // neuron opacity, 0.4..1.0
    const layers = [3, 4, 4, 2];
    const layerWidth = canvas.width / (layers.length + 1);
    layers.forEach((neurons, layerIdx) => {
        const x = (layerIdx + 1) * layerWidth;
        const layerHeight = canvas.height / (neurons + 1);
        for (let i = 0; i < neurons; i++) {
            const y = (i + 1) * layerHeight;
            // Per-neuron phase offset so a layer doesn't pulse in lockstep.
            const radius = 12 + Math.sin(time / 200 + layerIdx + i) * 3;
            // Draw neuron
            ctx.fillStyle = `rgba(0, 212, 255, ${pulse})`;
            ctx.beginPath();
            ctx.arc(x, y, radius, 0, Math.PI * 2);
            ctx.fill();
            // Draw connections (and travelling dots) to every next-layer neuron.
            if (layerIdx < layers.length - 1) {
                const nextLayerHeight = canvas.height / (layers[layerIdx + 1] + 1);
                const nextX = (layerIdx + 2) * layerWidth;
                for (let j = 0; j < layers[layerIdx + 1]; j++) {
                    const nextY = (j + 1) * nextLayerHeight;
                    const signalProgress = ((time / 500) + layerIdx * 0.5) % 1;
                    ctx.strokeStyle = `rgba(0, 212, 255, ${0.3 + signalProgress * 0.3})`;
                    ctx.lineWidth = 1;
                    ctx.beginPath();
                    ctx.moveTo(x + radius, y);
                    ctx.lineTo(nextX - 12, nextY);
                    ctx.stroke();
                    // Animated signal dot interpolated along the connection.
                    const dotX = x + radius + (nextX - 12 - x - radius) * signalProgress;
                    const dotY = y + (nextY - y) * signalProgress;
                    ctx.fillStyle = '#00ff88';
                    ctx.beginPath();
                    ctx.arc(dotX, dotY, 3, 0, Math.PI * 2);
                    ctx.fill();
                }
            }
        }
    });
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('๐ Neural Network Animation', centerX, 25);
}
// Animated GNN with message passing
function drawAnimatedGNN(ctx, canvas, time) {
    // Fixed graph layout; green message dots slide along edges while node
    // radii pulse with time.
    ctx.fillStyle = '#9900ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Graph Neural Network - Message Passing', canvas.width / 2, 30);
    const nodes = [
        { x: 100, y: 100 }, { x: 200, y: 60 }, { x: 320, y: 120 },
        { x: 150, y: 200 }, { x: 400, y: 80 }, { x: 450, y: 180 }
    ];
    const edges = [[0, 1], [0, 3], [1, 2], [1, 4], [2, 3], [2, 4], [4, 5]];
    // Edge segments first so nodes and messages render on top.
    ctx.strokeStyle = 'rgba(153, 0, 255, 0.4)';
    ctx.lineWidth = 2;
    for (const [a, b] of edges) {
        ctx.beginPath();
        ctx.moveTo(nodes[a].x, nodes[a].y);
        ctx.lineTo(nodes[b].x, nodes[b].y);
        ctx.stroke();
    }
    // Each edge's message is phase-shifted so they don't move in unison.
    const messageProgress = (time / 1000) % 1;
    ctx.fillStyle = '#00ff88';
    edges.forEach(([a, b], idx) => {
        const frac = (messageProgress + idx * 0.15) % 1;
        const mx = nodes[a].x + (nodes[b].x - nodes[a].x) * frac;
        const my = nodes[a].y + (nodes[b].y - nodes[a].y) * frac;
        ctx.beginPath();
        ctx.arc(mx, my, 5, 0, Math.PI * 2);
        ctx.fill();
    });
    // Nodes with a shared pulsing radius and their index labels.
    const pulse = Math.sin(time / 300) * 5 + 15;
    nodes.forEach((node, i) => {
        ctx.fillStyle = '#9900ff';
        ctx.beginPath();
        ctx.arc(node.x, node.y, pulse, 0, Math.PI * 2);
        ctx.fill();
        ctx.fillStyle = 'white';
        ctx.font = '12px Arial';
        ctx.textAlign = 'center';
        ctx.fillText(i, node.x, node.y + 4);
    });
}
// Animated attention matrix
function drawAnimatedAttention(ctx, canvas, time) {
    // 5x5 self-attention grid whose cell weights oscillate with time; the
    // diagonal starts strongest (tokens attend most to themselves).
    const words = ['The', 'cat', 'sat', 'on', 'mat'];
    const cellSize = 50;
    const startX = (canvas.width - words.length * cellSize) / 2;
    const startY = 80;
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Self-Attention Animation', canvas.width / 2, 30);
    // Column headers along the top; row labels down the left side.
    ctx.font = '12px Arial';
    words.forEach((word, i) => {
        ctx.fillStyle = '#e4e6eb';
        ctx.fillText(word, startX + i * cellSize + cellSize / 2, startY - 10);
        ctx.save();
        ctx.translate(startX - 20, startY + i * cellSize + cellSize / 2);
        ctx.fillText(word, 0, 0);
        ctx.restore();
    });
    // Animated weight cells with their numeric values overlaid.
    for (let row = 0; row < words.length; row++) {
        for (let col = 0; col < words.length; col++) {
            const baseWeight = row === col ? 0.8 : 0.2 + Math.abs(row - col) * 0.1;
            const animatedWeight = baseWeight + Math.sin(time / 500 + row + col) * 0.2;
            const alpha = Math.max(0.1, Math.min(1, animatedWeight));
            ctx.fillStyle = `rgba(0, 212, 255, ${alpha})`;
            ctx.fillRect(startX + col * cellSize + 2, startY + row * cellSize + 2, cellSize - 4, cellSize - 4);
            ctx.fillStyle = '#e4e6eb';
            ctx.font = '10px Arial';
            ctx.fillText(animatedWeight.toFixed(2), startX + col * cellSize + cellSize / 2, startY + row * cellSize + cellSize / 2 + 4);
        }
    }
}
// Animated gradient flow for backprop
function drawAnimatedGradientFlow(ctx, canvas, time) {
    // Forward activations light up left-to-right (cyan); gradient dots then
    // sweep right-to-left (orange), half a cycle out of phase.
    ctx.fillStyle = '#ff6b35';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Backpropagation - Gradient Flow', canvas.width / 2, 30);
    const layers = [2, 4, 4, 1];
    const gapX = canvas.width / (layers.length + 1);
    // Forward pass: a layer "activates" once the sweep has reached it.
    const fwdPhase = (time / 2000) % 1;
    layers.forEach((neurons, layerIdx) => {
        const x = (layerIdx + 1) * gapX;
        const gapY = canvas.height / (neurons + 1);
        for (let i = 0; i < neurons; i++) {
            const lit = fwdPhase > layerIdx / layers.length;
            const r = lit ? 15 + Math.sin(time / 200) * 3 : 12;
            ctx.fillStyle = lit ? '#00d4ff' : 'rgba(0, 212, 255, 0.3)';
            ctx.beginPath();
            ctx.arc(x, (i + 1) * gapY, r, 0, Math.PI * 2);
            ctx.fill();
        }
    });
    // Backward pass: orange dots travel right-to-left through each layer gap.
    const bwdPhase = ((time / 2000) + 0.5) % 1;
    for (let layerIdx = layers.length - 2; layerIdx >= 0; layerIdx--) {
        const x1 = (layerIdx + 1) * gapX;
        const x2 = (layerIdx + 2) * gapX;
        const active = bwdPhase > (layers.length - 2 - layerIdx) / (layers.length - 1);
        if (active) {
            // Interpolate the dot's position within this gap.
            const dotX = x2 - (x2 - x1) * ((bwdPhase * (layers.length - 1)) % 1);
            ctx.fillStyle = '#ff6b35';
            ctx.beginPath();
            ctx.arc(dotX, canvas.height / 2, 8, 0, Math.PI * 2);
            ctx.fill();
        }
    }
    ctx.fillStyle = '#e4e6eb';
    ctx.font = '12px Arial';
    ctx.fillText('Forward: Blue โ | Backward: Orange โ', canvas.width / 2, canvas.height - 20);
}
// Animated network for nn-basics
// Reuses the generic pulsing-network animation.
function drawAnimatedNetwork(ctx, canvas, time) {
    drawDefaultAnimation(ctx, canvas, time);
}
// Animated decision boundary for perceptron
function drawAnimatedDecisionBoundary(ctx, canvas, time) {
    // Boundary line through the canvas center, rotating slowly over time,
    // with two fixed clusters of class-colored sample points.
    const cx = canvas.width / 2;
    const cy = canvas.height / 2;
    ctx.fillStyle = '#ff6b35';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Perceptron Decision Boundary', canvas.width / 2, 30);
    const angle = time / 2000;
    const reach = 200;
    const dx = Math.cos(angle) * reach;
    const dy = Math.sin(angle) * reach;
    ctx.strokeStyle = '#ff6b35';
    ctx.lineWidth = 3;
    ctx.beginPath();
    ctx.moveTo(cx - dx, cy - dy);
    ctx.lineTo(cx + dx, cy + dy);
    ctx.stroke();
    // Fixed samples: class 1 cluster top-left, class 0 cluster bottom-right.
    const samples = [
        { x: 100, y: 80, c: 1 }, { x: 150, y: 100, c: 1 }, { x: 120, y: 150, c: 1 },
        { x: 400, y: 200, c: 0 }, { x: 450, y: 180, c: 0 }, { x: 380, y: 250, c: 0 }
    ];
    for (const s of samples) {
        ctx.fillStyle = s.c === 1 ? '#00d4ff' : '#00ff88';
        ctx.beginPath();
        ctx.arc(s.x, s.y, 8, 0, Math.PI * 2);
        ctx.fill();
    }
}
// MLP animation: reuses the generic pulsing-network animation.
function drawAnimatedMLP(ctx, canvas, time) {
    drawDefaultAnimation(ctx, canvas, time);
}
function drawAnimatedActivations(ctx, canvas, time) {
    // Static activation curves plus a white marker sweeping the input range
    // x in [-4, 4], with a dashed vertical guide at the current input.
    drawActivationFunctions(ctx, canvas);
    const input = Math.sin(time / 500) * 4;
    const cx = canvas.width / 2;
    const cy = canvas.height / 2;
    const scale = 40;
    const markerX = cx + input * scale;
    ctx.fillStyle = '#ffffff';
    ctx.beginPath();
    ctx.arc(markerX, cy, 6, 0, Math.PI * 2);
    ctx.fill();
    ctx.strokeStyle = '#ffffff';
    ctx.setLineDash([5, 5]);
    ctx.beginPath();
    ctx.moveTo(markerX, 0);
    ctx.lineTo(markerX, canvas.height);
    ctx.stroke();
    ctx.setLineDash([]);
}
// Convolution animation: delegates to the sliding-window convolution demo
// (`time` is unused here; the delegate manages its own animation state).
function drawAnimatedConvolution(ctx, canvas, time) {
    drawConvolutionAnimation(ctx, canvas);
}
/**
 * Animate the GAN training loop: the generator box lights up during
 * phases 0-1, the discriminator during phases 2-3, while the generated
 * "fake" sample pulses in opacity between them.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 * @param {number} time - elapsed ms; one full 4-phase cycle every 4 s.
 */
function drawAnimatedGAN(ctx, canvas, time) {
    // Title
    ctx.fillStyle = '#ffaa00';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('GAN Training Animation', canvas.width / 2, 30);

    // Phase 0-1: generator's turn. Phase 2-3: discriminator's turn.
    const phase = Math.floor(time / 1000) % 4;
    const genActive = phase <= 1;
    const discActive = phase >= 2;

    // Generator box
    ctx.fillStyle = genActive ? '#00ff88' : 'rgba(0, 255, 136, 0.3)';
    ctx.fillRect(50, 100, 100, 80);
    ctx.fillStyle = '#e4e6eb';
    ctx.font = '12px Arial';
    ctx.fillText('Generator', 100, 145);

    // Generated ("fake") sample, pulsing in opacity.
    const alpha = Math.sin(time / 300) * 0.5 + 0.5;
    ctx.fillStyle = `rgba(255, 170, 0, ${alpha})`;
    ctx.fillRect(200, 110, 60, 60);
    ctx.fillStyle = '#e4e6eb';
    ctx.fillText('Fake', 230, 200);

    // Discriminator box
    ctx.fillStyle = discActive ? '#ff6b35' : 'rgba(255, 107, 53, 0.3)';
    ctx.fillRect(320, 100, 100, 80);
    ctx.fillStyle = '#e4e6eb';
    ctx.fillText('Discriminator', 370, 145);

    // Discriminator verdict (only the final phase shows "Real?").
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 14px Arial';
    ctx.fillText(phase === 3 ? 'Real?' : 'Fake?', 370, 220);

    // Flow arrows: generator -> fake -> discriminator.
    ctx.strokeStyle = '#e4e6eb';
    ctx.lineWidth = 2;
    for (const [x0, x1] of [[150, 200], [260, 320]]) {
        ctx.beginPath();
        ctx.moveTo(x0, 140);
        ctx.lineTo(x1, 140);
        ctx.stroke();
    }
}
/**
 * Animate the diffusion process: five squares from clean (t=0) to fully
 * noisy (t=4), lighting up progressively over a 3-second cycle.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 * @param {number} time - elapsed ms.
 */
function drawAnimatedDiffusion(ctx, canvas, time) {
    ctx.fillStyle = '#9900ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Diffusion Process Animation', canvas.width / 2, 30);
    const steps = 5;
    const stepWidth = canvas.width / (steps + 1);
    const progress = (time / 3000) % 1;               // 0..1 within each 3 s cycle
    const currentStep = Math.floor(progress * steps); // always in 0..steps-1
    for (let i = 0; i < steps; i++) {
        const x = (i + 1) * stepWidth;
        const y = 150;
        const noiseLevel = i / (steps - 1);           // 0 (clean) .. 1 (pure noise)
        const isActive = i <= currentStep;
        // Timestep square; steps reached so far are drawn at full opacity.
        ctx.fillStyle = isActive ? '#9900ff' : 'rgba(153, 0, 255, 0.3)';
        ctx.fillRect(x - 30, y - 30, 60, 60);
        // Speckle the square in proportion to its noise level.
        if (noiseLevel > 0) {
            for (let j = 0; j < noiseLevel * 20; j++) {
                const nx = x - 25 + Math.random() * 50;
                const ny = y - 25 + Math.random() * 50;
                ctx.fillStyle = 'rgba(255, 255, 255, 0.5)';
                ctx.fillRect(nx, ny, 2, 2);
            }
        }
        ctx.fillStyle = '#e4e6eb';
        ctx.font = '10px Arial';
        ctx.fillText(`t=${i}`, x, y + 50);
    }
    ctx.fillStyle = '#e4e6eb';
    ctx.font = '12px Arial';
    // BUG FIX: the arrows in this caption were mojibake ("โ", the first
    // UTF-8 byte of "→" mis-decoded); restore the intended "→".
    ctx.fillText('Clean → Noisy (Forward) | Noisy → Clean (Reverse)', canvas.width / 2, canvas.height - 20);
}
/**
 * Draw an RNN unrolled over five timesteps. The active step cycles
 * every 500 ms and a pulse travels along its recurrent edge.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 * @param {number} time - elapsed ms.
 */
function drawAnimatedRNN(ctx, canvas, time) {
    ctx.fillStyle = '#00d4ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('RNN Unrolled Through Time', canvas.width / 2, 30);

    const numSteps = 5;
    const spacing = canvas.width / (numSteps + 1);
    const phase = (time / 500) % numSteps; // fractional step position
    const active = Math.floor(phase);

    for (let t = 0; t < numSteps; t++) {
        const x = (t + 1) * spacing;
        const y = 150;
        const lit = t === active;

        // Hidden-state node h_t
        ctx.fillStyle = lit ? '#00d4ff' : 'rgba(0, 212, 255, 0.3)';
        ctx.beginPath();
        ctx.arc(x, y, 25, 0, Math.PI * 2);
        ctx.fill();
        ctx.fillStyle = '#e4e6eb';
        ctx.font = '10px Arial';
        ctx.fillText(`h${t}`, x, y + 4);

        // Input arrow x_t feeding the node from below
        ctx.strokeStyle = lit ? '#00ff88' : 'rgba(0, 255, 136, 0.3)';
        ctx.lineWidth = 2;
        ctx.beginPath();
        ctx.moveTo(x, y + 60);
        ctx.lineTo(x, y + 25);
        ctx.stroke();
        ctx.fillText(`x${t}`, x, y + 75);

        // Recurrent edge h_t -> h_{t+1}
        if (t < numSteps - 1) {
            ctx.strokeStyle = lit ? '#ff6b35' : 'rgba(255, 107, 53, 0.3)';
            ctx.beginPath();
            ctx.moveTo(x + 25, y);
            ctx.lineTo(x + spacing - 25, y);
            ctx.stroke();
            // Pulse travelling along the active edge.
            if (lit) {
                const pulseX = x + 25 + (spacing - 50) * (phase % 1);
                ctx.fillStyle = '#ff6b35';
                ctx.beginPath();
                ctx.arc(pulseX, y, 5, 0, Math.PI * 2);
                ctx.fill();
            }
        }
    }
}
/**
 * Export a module's visualization canvas as a PNG download.
 * Looks up the canvas by id `<moduleId>-canvas` and silently no-ops
 * when that element does not exist.
 * @param {string} moduleId - id prefix of the target canvas element.
 */
function downloadViz(moduleId) {
    const canvas = document.getElementById(`${moduleId}-canvas`);
    if (!canvas) {
        return; // nothing to export
    }
    const anchor = document.createElement('a');
    anchor.href = canvas.toDataURL('image/png');
    anchor.download = `${moduleId}-visualization.png`;
    anchor.click();
}
/**
 * Draw a small fixed graph and, during half of every 2-second window,
 * re-stroke alternate edges in green to suggest message passing.
 * (The flash keys off Date.now(), so it only moves when the caller
 * redraws repeatedly.)
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawGraphNetwork(ctx, canvas) {
    ctx.fillStyle = '#9900ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Graph Structure & Message Passing', canvas.width / 2, 30);

    const nodes = [
        { x: 100, y: 100 }, { x: 200, y: 50 }, { x: 300, y: 150 },
        { x: 150, y: 250 }, { x: 400, y: 100 }, { x: 500, y: 200 }
    ];
    const edges = [
        [0, 1], [0, 3], [1, 2], [1, 4], [2, 3], [2, 4], [4, 5]
    ];

    // Stroke a single edge between two node indices.
    const strokeEdge = ([a, b]) => {
        ctx.beginPath();
        ctx.moveTo(nodes[a].x, nodes[a].y);
        ctx.lineTo(nodes[b].x, nodes[b].y);
        ctx.stroke();
    };

    // Base edges
    ctx.strokeStyle = 'rgba(153, 0, 255, 0.4)';
    ctx.lineWidth = 2;
    edges.forEach(strokeEdge);

    // Numbered nodes
    nodes.forEach((node, idx) => {
        ctx.fillStyle = '#9900ff';
        ctx.beginPath();
        ctx.arc(node.x, node.y, 15, 0, Math.PI * 2);
        ctx.fill();
        ctx.fillStyle = 'white';
        ctx.font = '12px Arial';
        ctx.fillText(idx, node.x, node.y + 4);
    });

    // Message-passing flash over every other edge, active in the second
    // half of each 2-second window.
    const t = (Date.now() / 1000) % 2;
    if (t > 1) {
        ctx.strokeStyle = '#00ff88';
        ctx.lineWidth = 4;
        edges.forEach((edge, idx) => {
            if (idx % 2 === 0) {
                strokeEdge(edge);
            }
        });
    }
}
/**
 * Show the graph-convolution (GCN) layer update rule with a color-coded
 * legend for its terms.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawGNNMath(ctx, canvas) {
    ctx.fillStyle = '#9900ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    ctx.fillText('Graph Convolution Math', canvas.width / 2, 50);
    ctx.fillStyle = '#e4e6eb';
    ctx.font = '14px Courier New';
    // BUG FIX: "σ" and "½" were mojibake ("ฯ", "ยฝ"); restore the
    // standard GCN update H(l+1) = σ(D^-½ A D^-½ H(l) W(l)).
    ctx.fillText('H(l+1) = σ(D^-½ A D^-½ H(l) W(l))', canvas.width / 2, 100);
    ctx.fillStyle = '#00ff88';
    ctx.fillText('A = Neighborhood Connections', canvas.width / 2, 150);
    ctx.fillStyle = '#ff6b35';
    ctx.fillText('D = Normalization Factor', canvas.width / 2, 180);
}
/**
 * List three real-world GNN application areas in color-coded lines.
 * @param {CanvasRenderingContext2D} ctx
 * @param {HTMLCanvasElement} canvas
 */
function drawGNNApplications(ctx, canvas) {
    ctx.fillStyle = '#9900ff';
    ctx.font = 'bold 16px Arial';
    ctx.textAlign = 'center';
    // BUG FIX: the emoji prefixes were mojibake ("๐", the lead byte of a
    // 4-byte UTF-8 emoji mis-decoded); the exact original glyphs are
    // unrecoverable, so plausible replacements are used here.
    ctx.fillText('💊 Drug Discovery (Molecular Graphs)', canvas.width / 2, 60);
    ctx.fillStyle = '#00d4ff';
    ctx.fillText('🚦 Traffic Flow Prediction', canvas.width / 2, 120);
    ctx.fillStyle = '#ff6b35';
    ctx.fillText('🛒 Pinterest/Amazon Recommendations', canvas.width / 2, 180);
}
// Entry point: build the dashboard UI once the script has loaded.
initDashboard();
| </script> | |
| </body> | |
| </html> |