<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Machine Learning: Complete Educational Guide</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<link rel="stylesheet" href="style.css">
<style>
@font-face {
font-family: 'FKGroteskNeue';
src: url('https://r2cdn.perplexity.ai/fonts/FKGroteskNeue.woff2') format('woff2');
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
html {
scroll-behavior: smooth;
}
body {
font-family: 'FKGroteskNeue', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: #1a2332;
color: #a9b4c2;
line-height: 1.6;
font-size: 16px;
}
.guide-container {
display: flex;
min-height: 100vh;
}
/* Sidebar */
.toc-sidebar {
width: 280px;
background: #0b0f14;
border-right: 1px solid #2a3544;
position: fixed;
height: 100vh;
overflow-y: auto;
z-index: 100;
scroll-behavior: smooth;
}
.toc-header {
padding: 32px 24px;
border-bottom: 1px solid #2a3544;
}
.toc-header h1 {
font-size: 24px;
font-weight: 600;
color: #e8eef6;
margin-bottom: 8px;
}
.toc-subtitle {
font-size: 14px;
color: #7ef0d4;
}
.toc-nav {
padding: 16px;
display: flex;
flex-direction: column;
gap: 8px;
}
.toc-link {
display: block;
padding: 12px 16px;
color: #a9b4c2;
text-decoration: none;
border-radius: 8px;
transition: all 0.2s;
font-size: 14px;
}
.toc-link:hover {
background: #2a3544;
color: #e8eef6;
}
.toc-link.active {
background: #6aa9ff;
color: #0b0f14;
font-weight: 600;
}
/* Main Content */
.content-main {
margin-left: 280px;
flex: 1;
padding: 48px 64px;
max-width: 1400px;
}
.content-header {
margin-bottom: 48px;
}
.content-header h1 {
font-size: 42px;
font-weight: 700;
color: #e8eef6;
margin-bottom: 16px;
}
.content-header p {
font-size: 18px;
color: #7ef0d4;
}
/* Sections */
.section {
background: #111823;
border: 1px solid #2a3544;
border-radius: 12px;
margin-bottom: 24px;
overflow: hidden;
}
.section-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 24px 32px;
cursor: pointer;
background: #111823;
border-bottom: 1px solid #2a3544;
transition: background 0.2s;
}
.section-header:hover {
background: #1a2332;
}
.section-header h2 {
font-size: 28px;
font-weight: 600;
color: #e8eef6;
}
.section-toggle {
background: none;
border: none;
color: #6aa9ff;
font-size: 24px;
cursor: pointer;
transition: transform 0.3s;
padding: 8px;
}
.section-toggle.collapsed {
transform: rotate(-90deg);
}
.section-body {
padding: 32px;
display: none;
}
.section-body.expanded {
display: block;
}
.section-body p {
margin-bottom: 16px;
font-size: 17px;
line-height: 1.7;
}
.section-body h3 {
font-size: 22px;
font-weight: 600;
color: #e8eef6;
margin: 32px 0 16px 0;
}
.section-body ul {
margin: 16px 0;
padding-left: 24px;
}
.section-body li {
margin-bottom: 12px;
line-height: 1.6;
}
.section-body ol {
margin: 16px 0;
padding-left: 24px;
}
.section-body ol li {
margin-bottom: 16px;
}
/* Info Cards */
.info-card {
background: #2a3544;
border: 1px solid #3a4554;
border-radius: 10px;
padding: 24px;
margin: 24px 0;
}
.info-card-title {
font-size: 16px;
font-weight: 600;
color: #7ef0d4;
margin-bottom: 16px;
}
.info-card-list {
list-style: none;
padding: 0;
}
.info-card-list li {
padding: 8px 0;
border-bottom: 1px solid #3a4554;
color: #a9b4c2;
}
.info-card-list li:last-child {
border-bottom: none;
}
.info-card-list li:before {
content: "✓ ";
color: #7ef0d4;
font-weight: bold;
margin-right: 8px;
}
/* Formulas */
.formula {
background: #0b0f14;
border: 1px solid #2a3544;
border-left: 4px solid #6aa9ff;
border-radius: 8px;
padding: 20px;
margin: 24px 0;
font-family: 'Courier New', monospace;
font-size: 16px;
color: #e8eef6;
overflow-x: auto;
}
.formula strong {
display: block;
color: #7ef0d4;
margin-bottom: 12px;
font-size: 14px;
}
.formula small {
display: block;
color: #a9b4c2;
font-size: 14px;
margin-top: 12px;
}
/* Callouts */
.callout {
border-radius: 10px;
padding: 20px;
margin: 24px 0;
border-left: 4px solid;
}
.callout.info {
background: rgba(106, 169, 255, 0.1);
border-left-color: #6aa9ff;
}
.callout.warning {
background: rgba(255, 140, 106, 0.1);
border-left-color: #ff8c6a;
}
.callout.success {
background: rgba(126, 240, 212, 0.1);
border-left-color: #7ef0d4;
}
.callout-title {
font-size: 16px;
font-weight: 600;
color: #e8eef6;
margin-bottom: 12px;
}
.callout-content {
color: #a9b4c2;
line-height: 1.6;
}
/* Figures */
.figure {
margin: 32px 0;
}
.figure-placeholder {
background: #0b0f14;
border: 1px solid #2a3544;
border-radius: 10px;
display: flex;
align-items: center;
justify-content: center;
position: relative;
}
.figure-caption {
margin-top: 12px;
font-size: 14px;
color: #7ef0d4;
text-align: center;
}
/* Controls */
.controls {
background: #2a3544;
border-radius: 10px;
padding: 24px;
margin: 24px 0;
}
.control-group {
margin-bottom: 20px;
}
.control-group:last-child {
margin-bottom: 0;
}
.control-group label {
display: block;
font-size: 14px;
font-weight: 600;
color: #e8eef6;
margin-bottom: 12px;
}
input[type="range"] {
width: 100%;
height: 6px;
border-radius: 3px;
background: #1a2332;
outline: none;
-webkit-appearance: none;
}
input[type="range"]::-webkit-slider-thumb {
-webkit-appearance: none;
width: 18px;
height: 18px;
border-radius: 50%;
background: #6aa9ff;
cursor: pointer;
}
input[type="range"]::-moz-range-thumb {
width: 18px;
height: 18px;
border-radius: 50%;
background: #6aa9ff;
cursor: pointer;
border: none;
}
.btn {
display: inline-block;
padding: 12px 24px;
border-radius: 8px;
font-size: 14px;
font-weight: 600;
cursor: pointer;
border: none;
transition: all 0.2s;
}
.btn-primary {
background: #6aa9ff;
color: #0b0f14;
}
.btn-primary:hover {
background: #5a99ef;
}
.btn-secondary {
background: #2a3544;
color: #e8eef6;
}
.btn-secondary:hover {
background: #3a4554;
}
/* Tables */
.data-table {
width: 100%;
border-collapse: collapse;
margin: 24px 0;
}
.data-table th,
.data-table td {
padding: 12px;
text-align: left;
border-bottom: 1px solid #2a3544;
}
.data-table th {
background: #2a3544;
color: #e8eef6;
font-weight: 600;
font-size: 14px;
}
.data-table td {
color: #a9b4c2;
}
.data-table tbody tr:hover {
background: rgba(106, 169, 255, 0.05);
}
/* Canvas */
canvas {
max-width: 100%;
height: auto;
display: block;
}
/* Badge */
.badge {
display: inline-block;
padding: 4px 12px;
border-radius: 12px;
font-size: 12px;
font-weight: 600;
background: #2a3544;
color: #7ef0d4;
margin-right: 8px;
}
/* Step boxes */
.step {
background: #2a3544;
border-left: 4px solid #6aa9ff;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
}
.step-title {
font-size: 16px;
font-weight: 600;
color: #7ef0d4;
margin-bottom: 12px;
}
.step-calculation {
font-family: 'Courier New', monospace;
font-size: 14px;
line-height: 1.8;
color: #e8eef6;
white-space: pre-wrap;
}
/* Responsive */
@media (max-width: 1024px) {
.toc-sidebar {
width: 240px;
}
.content-main {
margin-left: 240px;
padding: 32px;
}
}
@media (max-width: 768px) {
.toc-sidebar {
width: 100%;
position: relative;
height: auto;
}
.content-main {
margin-left: 0;
padding: 24px 16px;
}
}
</style>
</head>
<body>
<div class="guide-container">
<!-- Left Sidebar - Table of Contents -->
<aside class="toc-sidebar">
<div class="toc-header">
<h1>Machine Learning</h1>
<p class="toc-subtitle">Complete Learning Guide</p>
</div>
<nav class="toc-nav">
<a href="#intro" class="toc-link">📚 Introduction</a>
<div class="toc-category">
<div class="toc-category-header" data-category="supervised">
<span class="category-icon">📊</span>
<span class="category-title">SUPERVISED LEARNING</span>
<span class="category-toggle">▼</span>
</div>
<div class="toc-category-content" id="supervised-content">
<div class="toc-subcategory">
<div class="toc-subcategory-title">Regression</div>
<a href="#linear-regression" class="toc-link toc-sub">Linear Regression</a>
<a href="#polynomial-regression" class="toc-link toc-sub">Polynomial Regression</a>
<a href="#gradient-descent" class="toc-link toc-sub">Gradient Descent</a>
</div>
<div class="toc-subcategory">
| <div class="toc-subcategory-title">Classification</div> | |
| <a href="#logistic-regression" class="toc-link toc-sub">Logistic Regression</a> | |
| <a href="#svm" class="toc-link toc-sub">Support Vector Machines</a> | |
| <a href="#knn" class="toc-link toc-sub">K-Nearest Neighbors</a> | |
| <a href="#naive-bayes" class="toc-link toc-sub">Naive Bayes</a> | |
| <a href="#decision-tree-regression" class="toc-link toc-sub">Decision Tree Regression</a> | |
| <a href="#decision-trees" class="toc-link toc-sub">Decision Trees (Classification)</a> | |
| <a href="#bagging" class="toc-link toc-sub">Bagging</a> | |
| <a href="#boosting-adaboost" class="toc-link toc-sub">Boosting (AdaBoost)</a> | |
| <a href="#gradient-boosting" class="toc-link toc-sub">Gradient Boosting (Regression)</a> | |
| <a href="#gradient-boosting-classification" class="toc-link toc-sub">Gradient Boosting | |
| (Classification)</a> | |
| <a href="#xgboost" class="toc-link toc-sub">XGBoost (Regression)</a> | |
| <a href="#xgboost-classification" class="toc-link toc-sub">XGBoost (Classification)</a> | |
| <a href="#random-forest" class="toc-link toc-sub">Random Forest</a> | |
| <a href="#ensemble-methods" class="toc-link toc-sub">Ensemble Methods Overview</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Evaluation & Tuning</div> | |
| <a href="#model-evaluation" class="toc-link toc-sub">Model Evaluation</a> | |
| <a href="#cross-validation" class="toc-link toc-sub">Cross-Validation</a> | |
| <a href="#optimal-k" class="toc-link toc-sub">Finding Optimal K</a> | |
| <a href="#hyperparameter-tuning" class="toc-link toc-sub">Hyperparameter Tuning</a> | |
| <a href="#regularization" class="toc-link toc-sub">Regularization</a> | |
| <a href="#bias-variance" class="toc-link toc-sub">Bias-Variance Tradeoff</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Neural Networks</div> | |
| <a href="#perceptron" class="toc-link toc-sub">Perceptron</a> | |
| <a href="#neural-networks" class="toc-link toc-sub">Multi-Layer Perceptron (MLP)</a> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="toc-category"> | |
| <div class="toc-category-header" data-category="unsupervised"> | |
| <span class="category-icon">🔍</span> | |
| <span class="category-title">UNSUPERVISED LEARNING</span> | |
| <span class="category-toggle">▼</span> | |
| </div> | |
| <div class="toc-category-content" id="unsupervised-content"> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Clustering</div> | |
| <a href="#kmeans" class="toc-link toc-sub">K-means Clustering</a> | |
| <a href="#hierarchical-clustering" class="toc-link toc-sub">Hierarchical Clustering</a> | |
| <a href="#dbscan" class="toc-link toc-sub">DBSCAN Clustering</a> | |
| <a href="#clustering-evaluation" class="toc-link toc-sub">Clustering Evaluation</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Preprocessing</div> | |
| <a href="#preprocessing" class="toc-link toc-sub">Data Preprocessing</a> | |
| <a href="#loss-functions" class="toc-link toc-sub">Loss Functions</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Dimensionality Reduction</div> | |
| <a href="#pca" class="toc-link toc-sub">Principal Component Analysis (PCA)</a> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="toc-category"> | |
| <div class="toc-category-header" data-category="reinforcement"> | |
| <span class="category-icon">🎮</span> | |
| <span class="category-title">REINFORCEMENT LEARNING</span> | |
| <span class="category-toggle">▼</span> | |
| </div> | |
| <div class="toc-category-content" id="reinforcement-content"> | |
| <a href="#rl-intro" class="toc-link toc-sub">RL Introduction</a> | |
| <a href="#q-learning" class="toc-link toc-sub">Q-Learning</a> | |
| <a href="#policy-gradient" class="toc-link toc-sub">Policy Gradient</a> | |
| </div> | |
| </div> | |
| <div class="toc-category"> | |
| <div class="toc-category-header" data-category="nlp"> | |
| <span class="category-icon">🗣️</span> | |
| <span class="category-title">NLP & GENAI</span> | |
| <span class="category-toggle">▼</span> | |
| </div> | |
| <div class="toc-category-content" id="nlp-content"> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Basic NLP</div> | |
| <a href="#nlp-preprocessing" class="toc-link toc-sub">Text Preprocessing</a> | |
| <a href="#word-embeddings" class="toc-link toc-sub">Word Embeddings (Word2Vec)</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Advanced NLP</div> | |
| <a href="#rnn-lstm" class="toc-link toc-sub">RNN & LSTM</a> | |
| <a href="#transformers" class="toc-link toc-sub">Transformers</a> | |
| </div> | |
| <div class="toc-subcategory"> | |
| <div class="toc-subcategory-title">Generative AI</div> | |
| <a href="#genai-intro" class="toc-link toc-sub">GenAI & LLMs</a> | |
| <a href="#vectordb-rag" class="toc-link toc-sub">VectorDB & RAG</a> | |
| </div> | |
| </div> | |
| </div> | |
| <a href="#algorithm-comparison" class="toc-link">📊 Algorithm Comparison</a> | |
| </nav> | |
| </aside> | |
| <!-- Main Content Area --> | |
| <main class="content-main"> | |
| <div class="content-header"> | |
<h1>Machine Learning: The Ultimate Learning Platform</h1>
<p style="font-size: 18px; margin-bottom: 16px;">Master ML through <strong
style="color: #6aa9ff;">Supervised</strong>, <strong
style="color: #7ef0d4;">Unsupervised</strong> & <strong
style="color: #ff8c6a;">Reinforcement Learning</strong></p>
<p style="font-size: 16px; color: #a9b4c2;">Complete with step-by-step mathematical solutions,
interactive visualizations, and real-world examples</p>
</div>
<!-- ========================================
INTRODUCTION SECTION
======================================== -->
<!-- Section 1: Introduction to Machine Learning -->
<div class="section" id="intro">
<div class="section-header">
<h2>1. Introduction to Machine Learning</h2>
<button class="section-toggle">▼</button>
</div>
<div class="section-body">
<p>Machine Learning is teaching computers to learn from experience, just like humans do. Instead of
programming every rule, we let the computer discover patterns in data and make decisions on its
own.</p>
<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 20px; margin: 32px 0;">
<div
style="background: rgba(106, 169, 255, 0.1); border: 2px solid #6aa9ff; border-radius: 12px; padding: 24px; text-align: center;">
<div style="font-size: 48px; margin-bottom: 12px;">📊</div>
<h4 style="color: #6aa9ff; margin-bottom: 8px;">Supervised Learning</h4>
<p style="font-size: 14px; color: #a9b4c2; margin: 0;">Learning with labeled data - like a
teacher providing answers</p>
<div style="margin-top: 12px; font-size: 12px; color: #7ef0d4;">
<div>✓ Regression</div>
<div>✓ Classification</div>
<div>✓ Evaluation</div>
</div>
</div>
<div
style="background: rgba(126, 240, 212, 0.1); border: 2px solid #7ef0d4; border-radius: 12px; padding: 24px; text-align: center;">
<div style="font-size: 48px; margin-bottom: 12px;">🔍</div>
<h4 style="color: #7ef0d4; margin-bottom: 8px;">Unsupervised Learning</h4>
<p style="font-size: 14px; color: #a9b4c2; margin: 0;">Finding patterns without labels -
discovering hidden structure</p>
<div style="margin-top: 12px; font-size: 12px; color: #7ef0d4;">
<div>✓ Clustering</div>
<div>✓ Dimensionality Reduction</div>
<div>✓ Preprocessing</div>
</div>
</div>
<div
style="background: rgba(255, 140, 106, 0.1); border: 2px solid #ff8c6a; border-radius: 12px; padding: 24px; text-align: center;">
<div style="font-size: 48px; margin-bottom: 12px;">🎮</div>
<h4 style="color: #ff8c6a; margin-bottom: 8px;">Reinforcement Learning</h4>
<p style="font-size: 14px; color: #a9b4c2; margin: 0;">Learning through trial & error -
maximizing rewards</p>
<div style="margin-top: 12px; font-size: 12px; color: #7ef0d4;">
<div>✓ Q-Learning</div>
<div>✓ Policy Gradient</div>
<div>✓ Applications</div>
</div>
</div>
</div>
<div class="info-card">
<div class="info-card-title">Key Concepts</div>
<ul class="info-card-list">
<li>Learning from data instead of explicit programming</li>
<li>Three types: Supervised, Unsupervised, Reinforcement</li>
<li>Powers Netflix recommendations, Face ID, and more</li>
<li>Requires: Data, Algorithm, and Computing Power</li>
</ul>
</div>
<h3>Understanding Machine Learning</h3>
<p>Imagine teaching a child to recognize animals. You show them pictures of cats and dogs, telling
them which is which. After seeing many examples, the child learns to identify new animals
they've never seen before. Machine Learning works the same way!</p>
<p><strong>The Three Types of Learning:</strong></p>
<ol>
<li><strong>Supervised Learning:</strong> Learning with a teacher. You provide labeled examples
(like "this is a cat", "this is a dog"), and the model learns to predict labels for new
data.</li>
<li><strong>Unsupervised Learning:</strong> Learning without labels. The model finds hidden
patterns on its own, like grouping similar customers together.</li>
<li><strong>Reinforcement Learning:</strong> Learning by trial and error. The model tries
actions and learns from rewards/punishments, like teaching a robot to walk.</li>
</ol>
<div class="callout info">
<div class="callout-title">💡 Key Insight</div>
<div class="callout-content">
ML is not magic! It's mathematics + statistics + computer science working together to find
patterns in data.
</div>
</div>
<h3>Real-World Applications</h3>
<ul>
<li><strong>Netflix:</strong> Recommends shows based on what you've watched</li>
<li><strong>Face ID:</strong> Recognizes your face to unlock your phone</li>
<li><strong>Gmail:</strong> Filters spam emails automatically</li>
<li><strong>Google Maps:</strong> Predicts traffic and suggests fastest routes</li>
<li><strong>Voice Assistants:</strong> Understands and responds to your speech</li>
</ul>
<div class="callout success">
<div class="callout-title">✓ Why ML Matters Today</div>
<div class="callout-content">
We generate 2.5 quintillion bytes of data every day! ML helps make sense of this massive
data to solve problems that were impossible before.
</div>
</div>
</div>
</div>
<!-- Section 2: Linear Regression -->
<div class="section" id="linear-regression">
<div class="section-header">
<h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised
- Regression</span> Linear Regression</h2>
<button class="section-toggle">▼</button>
</div>
<div class="section-body">
<p>Linear Regression is one of the simplest and most powerful techniques for predicting continuous
values. It finds the "best fit line" through data points.</p>
<div class="info-card">
<div class="info-card-title">Key Concepts</div>
<ul class="info-card-list">
<li>Predicts continuous values (prices, temperatures, etc.)</li>
<li>Finds the straight line that best fits the data</li>
<li>Uses equation: y = mx + c</li>
<li>Minimizes prediction errors</li>
</ul>
</div>
<h3>Understanding Linear Regression</h3>
<p>Think of it like this: You want to predict house prices based on size. If you plot size vs. price
on a graph, you'll see points scattered around. Linear regression draws the "best" line through
these points that you can use to predict prices for houses of any size.</p>
<div class="formula">
<strong>The Linear Equation:</strong>
y = mx + c
<br><small>where:<br>y = predicted value (output)<br>x = input feature<br>m = slope (how steep
the line is)<br>c = intercept (where line crosses y-axis)</small>
</div>
<h3>Example: Predicting Salary from Experience</h3>
<p>Let's say we have data about employees' years of experience and their salaries:</p>
<table class="data-table">
<thead>
<tr>
<th>Experience (years)</th>
<th>Salary ($k)</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>39.8</td>
</tr>
<tr>
<td>2</td>
<td>48.9</td>
</tr>
<tr>
<td>3</td>
<td>57.0</td>
</tr>
<tr>
<td>4</td>
<td>68.3</td>
</tr>
<tr>
<td>5</td>
<td>77.9</td>
</tr>
<tr>
<td>6</td>
<td>85.0</td>
</tr>
</tbody>
</table>
<p>The best fit line for this data (derived step by step below) is ŷ = 9.27x + 30.37, which
predicts that someone with 7 years of experience will earn approximately $95.3k.</p>
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px; position: relative;"> | |
| <canvas id="lr-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Scatter plot showing experience vs. salary | |
| with the best fit line</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Adjust Slope (m): <span id="slope-val">7.5</span></label> | |
| <input type="range" id="slope-slider" min="0" max="15" step="0.5" value="7.5"> | |
| </div> | |
| <div class="control-group"> | |
| <label>Adjust Intercept (c): <span id="intercept-val">32</span></label> | |
| <input type="range" id="intercept-slider" min="0" max="60" step="1" value="32"> | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Cost Function (Mean Squared Error):</strong> | |
| MSE = Σ(y_actual - y_predicted)² / n | |
| <br><small>This measures how wrong our predictions are. Lower MSE = better fit!</small> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Key Insight</div> | |
| <div class="callout-content"> | |
| The "best fit line" is the one that minimizes the total error between actual points and | |
| predicted points. We square the errors so positive and negative errors don't cancel out. | |
| </div> | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Common Mistake</div> | |
| <div class="callout-content"> | |
| Linear regression assumes a straight-line relationship. If your data curves, you need | |
| polynomial regression or other techniques! | |
| </div> | |
| </div> | |
| <h3>Step-by-Step Process</h3> | |
| <ol> | |
| <li>Collect data with input (x) and output (y) pairs</li> | |
| <li>Plot the points on a graph</li> | |
| <li>Find values of m and c that minimize prediction errors</li> | |
| <li>Use the equation y = mx + c to predict new values</li> | |
| </ol> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(106, 169, 255, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #6aa9ff; margin-top: 32px;"> | |
| <h3 style="color: #6aa9ff; margin-bottom: 20px;">📐 Complete Mathematical Derivation</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's solve this step-by-step with actual numbers | |
| using our salary data!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Organize Our Data</strong><br><br> | |
| Our data points (Experience x, Salary y):<br> | |
| (1, 39.8), (2, 48.9), (3, 57.0), (4, 68.3), (5, 77.9), (6, 85.0)<br><br> | |
| Number of data points: <strong style="color: #7ef0d4;">n = 6</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Calculate Means (x̄ and ȳ)</strong><br><br> | |
| <strong>Mean of x (x̄):</strong><br> | |
| x̄ = (x₁ + x₂ + x₃ + x₄ + x₅ + x₆) / n<br> | |
| x̄ = (1 + 2 + 3 + 4 + 5 + 6) / 6<br> | |
| x̄ = 21 / 6<br> | |
| <strong style="color: #7ef0d4;">x̄ = 3.5</strong><br><br> | |
| <strong>Mean of y (ȳ):</strong><br> | |
| ȳ = (y₁ + y₂ + y₃ + y₄ + y₅ + y₆) / n<br> | |
| ȳ = (39.8 + 48.9 + 57.0 + 68.3 + 77.9 + 85.0) / 6<br> | |
| ȳ = 376.9 / 6<br> | |
| <strong style="color: #7ef0d4;">ȳ = 62.82</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Calculate Slope (m) Using the | |
| Formula</strong><br><br> | |
| <strong>Formula for slope:</strong><br> | |
| m = Σ[(xᵢ - x̄)(yᵢ - ȳ)] / Σ[(xᵢ - x̄)²]<br><br> | |
| <strong>Calculate numerator (sum of products of deviations):</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 1px solid #3a4556;"> | |
| <th style="padding: 8px; text-align: left;">xᵢ</th> | |
| <th style="padding: 8px; text-align: left;">yᵢ</th> | |
| <th style="padding: 8px; text-align: left;">xᵢ - x̄</th> | |
| <th style="padding: 8px; text-align: left;">yᵢ - ȳ</th> | |
| <th style="padding: 8px; text-align: left;">(xᵢ - x̄)(yᵢ - ȳ)</th> | |
| <th style="padding: 8px; text-align: left;">(xᵢ - x̄)²</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">1</td> | |
| <td>39.8</td> | |
| <td>-2.5</td> | |
| <td>-23.02</td> | |
| <td style="color: #7ef0d4;">57.54</td> | |
| <td>6.25</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">2</td> | |
| <td>48.9</td> | |
| <td>-1.5</td> | |
| <td>-13.92</td> | |
| <td style="color: #7ef0d4;">20.88</td> | |
| <td>2.25</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">3</td> | |
| <td>57.0</td> | |
| <td>-0.5</td> | |
| <td>-5.82</td> | |
| <td style="color: #7ef0d4;">2.91</td> | |
| <td>0.25</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">4</td> | |
| <td>68.3</td> | |
| <td>0.5</td> | |
| <td>5.48</td> | |
| <td style="color: #7ef0d4;">2.74</td> | |
| <td>0.25</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">5</td> | |
| <td>77.9</td> | |
| <td>1.5</td> | |
| <td>15.08</td> | |
| <td style="color: #7ef0d4;">22.62</td> | |
| <td>2.25</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">6</td> | |
| <td>85.0</td> | |
| <td>2.5</td> | |
| <td>22.18</td> | |
| <td style="color: #7ef0d4;">55.46</td> | |
| <td>6.25</td> | |
| </tr> | |
| <tr style="border-top: 2px solid #6aa9ff; font-weight: bold;"> | |
| <td colspan="4" style="padding: 8px;">Sum:</td> | |
| <td style="color: #ff8c6a;">162.15</td> | |
| <td style="color: #ff8c6a;">17.50</td> | |
| </tr> | |
| </table> | |
| <strong>Calculate m:</strong><br> | |
| m = 162.15 / 17.50<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">m = 9.27 (salary increases by $9.27k per | |
| year of experience)</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Calculate Intercept (c)</strong><br><br> | |
| <strong>Formula:</strong> c = ȳ - m × x̄<br><br> | |
| c = 62.82 - (9.27 × 3.5)<br> | |
| c = 62.82 - 32.45<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">c = 30.37 (base salary with 0 years | |
| experience)</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 5: Our Final Equation!</strong><br><br> | |
| <strong style="color: #7ef0d4; font-size: 20px;">ŷ = 9.27x + 30.37</strong><br><br> | |
| <strong>Make a Prediction:</strong> What salary for 7 years of experience?<br> | |
| ŷ = 9.27 × 7 + 30.37<br> | |
| ŷ = 64.89 + 30.37<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">ŷ = $95.26k predicted salary</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 6: Calculate MSE (How Good is Our | |
| Model?)</strong><br><br> | |
| <strong>For each point, calculate (actual - predicted)²:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 1px solid #3a4556;"> | |
| <th style="padding: 8px;">x</th> | |
| <th style="padding: 8px;">Actual y</th> | |
| <th style="padding: 8px;">Predicted ŷ</th> | |
| <th style="padding: 8px;">Error (y - ŷ)</th> | |
| <th style="padding: 8px;">Error²</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">1</td> | |
| <td>39.8</td> | |
| <td>39.64</td> | |
| <td>0.16</td> | |
| <td style="color: #7ef0d4;">0.03</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">2</td> | |
| <td>48.9</td> | |
| <td>48.91</td> | |
| <td>-0.01</td> | |
| <td style="color: #7ef0d4;">0.00</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">3</td> | |
| <td>57.0</td> | |
| <td>58.18</td> | |
| <td>-1.18</td> | |
| <td style="color: #7ef0d4;">1.39</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">4</td> | |
| <td>68.3</td> | |
| <td>67.45</td> | |
| <td>0.85</td> | |
| <td style="color: #7ef0d4;">0.72</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">5</td> | |
| <td>77.9</td> | |
| <td>76.72</td> | |
| <td>1.18</td> | |
| <td style="color: #7ef0d4;">1.39</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">6</td> | |
| <td>85.0</td> | |
| <td>85.99</td> | |
| <td>-0.99</td> | |
| <td style="color: #7ef0d4;">0.98</td> | |
| </tr> | |
| <tr style="border-top: 2px solid #6aa9ff; font-weight: bold;"> | |
| <td colspan="4" style="padding: 8px;">Sum of Squared Errors:</td> | |
| <td style="color: #ff8c6a;">4.51</td> | |
| </tr> | |
| </table> | |
| MSE = Sum of Squared Errors / n<br> | |
| MSE = 4.51 / 6<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">MSE = 0.75 (Very low - great fit!)</strong> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ What We Learned</div> | |
| <div class="callout-content"> | |
| <strong>The Math Summary:</strong><br> | |
| 1. m (slope) = Σ[(x-x̄)(y-ȳ)] / Σ[(x-x̄)²] = <strong>9.27</strong><br> | |
| 2. c (intercept) = ȳ - m×x̄ = <strong>30.37</strong><br> | |
| 3. Final equation: <strong>ŷ = 9.27x + 30.37</strong><br> | |
| 4. MSE = 0.75 (low error = good model!) | |
| </div> | |
| </div> | |
| </div> | |
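<h3>Python Code</h3>
<p>As a quick sanity check, here is a minimal NumPy sketch of the same calculation (an illustrative addition, not a prescribed implementation; the array names are arbitrary):</p>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

# Salary data from the worked example above
x = np.array([1, 2, 3, 4, 5, 6], dtype=float)
y = np.array([39.8, 48.9, 57.0, 68.3, 77.9, 85.0])

# Slope and intercept via the deviation formulas
m = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
c = y.mean() - m * x.mean()

# Mean squared error of the fitted line
mse = np.mean((y - (m * x + c)) ** 2)

# ≈ 9.27, 30.39, 0.75 (c differs from the card's 30.37 only
# because the card rounds m to 9.27 before computing c)
print(round(m, 2), round(c, 2), round(mse, 2))</pre>
</div>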
</div>
</div>
<!-- Section: Polynomial Regression (NEW) -->
<div class="section" id="polynomial-regression">
<div class="section-header">
<h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised
- Regression</span> Polynomial Regression</h2>
<button class="section-toggle">▼</button>
</div>
<div class="section-body">
<p>When your data curves and a straight line won't fit, Polynomial Regression extends linear
regression by adding polynomial terms (x², x³, etc.) to capture non-linear relationships.</p>
<div class="info-card">
<div class="info-card-title">Key Concepts</div>
<ul class="info-card-list">
<li>Extends linear regression to fit curves</li>
<li>Uses polynomial features: x, x², x³, etc.</li>
<li>Higher degree = more flexible (but beware overfitting!)</li>
<li>Still linear in parameters (coefficients)</li>
</ul>
</div>
<h3>When Linear Fails</h3>
<p>Consider predicting car stopping distance based on speed. The relationship isn't linear -
doubling speed quadruples stopping distance (physics: kinetic energy = ½mv²)!</p>
<div class="formula">
<strong>Linear:</strong> y = β₀ + β₁x (straight line)<br><br>
<strong>Polynomial Degree 2:</strong> y = β₀ + β₁x + β₂x²<br>
<strong>Polynomial Degree 3:</strong> y = β₀ + β₁x + β₂x² + β₃x³<br>
<strong>Polynomial Degree n:</strong> y = β₀ + β₁x + β₂x² + ... + βₙxⁿ
</div>
<div class="callout warning">
<div class="callout-title">⚠️ Overfitting Warning!</div>
<div class="callout-content">
<strong>Degree 2-3:</strong> Usually safe, captures curves<br>
<strong>Degree 4-5:</strong> Can start overfitting<br>
<strong>Degree > 5:</strong> High risk of overfitting - the model memorizes noise!
</div>
</div>
<!-- COMPREHENSIVE MATH SECTION -->
<div class="info-card"
style="background: linear-gradient(135deg, rgba(106, 169, 255, 0.1), rgba(255, 140, 106, 0.1)); border: 2px solid #6aa9ff; margin-top: 32px;">
<h3 style="color: #6aa9ff; margin-bottom: 20px;">📐 Complete Mathematical Derivation</h3>
<p style="color: #7ef0d4; font-weight: bold;">Let's fit a quadratic curve to data step-by-step!
</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;">
<strong style="color: #ff8c6a;">Problem: Predict stopping distance from
speed</strong><br><br>
<table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;">
<tr style="border-bottom: 2px solid #6aa9ff;">
<th style="padding: 8px;">Speed x (mph)</th>
<th style="padding: 8px;">Stopping Distance y (ft)</th>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">10</td>
<td style="text-align: center;">15</td>
</tr>
<tr style="background: rgba(106, 169, 255, 0.05);">
<td style="padding: 6px; text-align: center;">20</td>
<td style="text-align: center;">40</td>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">30</td>
<td style="text-align: center;">80</td>
</tr>
<tr style="background: rgba(106, 169, 255, 0.05);">
<td style="padding: 6px; text-align: center;">40</td>
<td style="text-align: center;">130</td>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">50</td>
<td style="text-align: center;">200</td>
</tr>
</table>
</div>
<div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;">
<strong style="color: #ff8c6a;">Step 1: Create Polynomial Features</strong><br><br>
For degree 2, we add x² as a new feature:<br><br>
<table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;">
<tr style="border-bottom: 2px solid #6aa9ff;">
<th style="padding: 8px;">x (speed)</th>
<th style="padding: 8px;">x² (speed squared)</th>
<th style="padding: 8px;">y (distance)</th>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">10</td>
<td style="text-align: center; color: #7ef0d4;">100</td>
<td style="text-align: center;">15</td>
</tr>
<tr style="background: rgba(106, 169, 255, 0.05);">
<td style="padding: 6px; text-align: center;">20</td>
<td style="text-align: center; color: #7ef0d4;">400</td>
<td style="text-align: center;">40</td>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">30</td>
<td style="text-align: center; color: #7ef0d4;">900</td>
<td style="text-align: center;">80</td>
</tr>
<tr style="background: rgba(106, 169, 255, 0.05);">
<td style="padding: 6px; text-align: center;">40</td>
<td style="text-align: center; color: #7ef0d4;">1600</td>
<td style="text-align: center;">130</td>
</tr>
<tr>
<td style="padding: 6px; text-align: center;">50</td>
<td style="text-align: center; color: #7ef0d4;">2500</td>
<td style="text-align: center;">200</td>
</tr>
</table>
</div>
<div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;">
<strong style="color: #ff8c6a;">Step 2: Matrix Form (Design Matrix)</strong><br><br>
<strong>Model:</strong> y = β₀ + β₁x + β₂x²<br><br>
<strong>Design Matrix X:</strong><br>
<pre style="color: #e8eef6; background: none; border: none; padding: 0;">
    [1  10  100 ]       [15 ]       [β₀]
    [1  20  400 ]       [40 ]       [β₁]
X = [1  30  900 ]   y = [80 ]   β = [β₂]
    [1  40  1600]       [130]
    [1  50  2500]       [200]</pre>
</div>
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Solve Using Normal Equation</strong><br><br> | |
| <strong>Normal Equation:</strong> β = (XᵀX)⁻¹ Xᵀy<br><br> | |
| After matrix multiplication (done by computer):<br><br> | |
| <strong style="color: #7ef0d4;">β₀ = 2.5</strong> (base distance)<br> | |
| <strong style="color: #7ef0d4;">β₁ = 0.5</strong> (linear component)<br> | |
| <strong style="color: #7ef0d4;">β₂ = 0.07</strong> (quadratic component)<br> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Final Equation</strong><br><br> | |
| <strong style="color: #7ef0d4; font-size: 20px;">ŷ = 2.5 + 0.5x + 0.07x²</strong><br><br> | |
| <strong>Make Predictions:</strong><br> | |
| Speed = 25 mph: ŷ = 2.5 + 0.5(25) + 0.07(625) = 2.5 + 12.5 + 43.75 = <strong | |
| style="color: #7ef0d4;">58.75 ft</strong><br> | |
| Speed = 60 mph: ŷ = 2.5 + 0.5(60) + 0.07(3600) = 2.5 + 30 + 252 = <strong | |
| style="color: #7ef0d4;">284.5 ft</strong> | |
| </div> | |
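<p>To verify these coefficients yourself, here is a minimal NumPy sketch (an illustrative addition; it solves the same normal equation numerically):</p>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

# Stopping-distance data and design matrix with columns [1, x, x²]
x = np.array([10, 20, 30, 40, 50], dtype=float)
y = np.array([15, 40, 80, 130, 200], dtype=float)
X = np.column_stack([np.ones_like(x), x, x ** 2])

# Normal equation: β = (XᵀX)⁻¹ Xᵀy, solved as a linear system
beta = np.linalg.solve(X.T @ X, X.T @ y)
print(beta)  # ≈ [5.0, 0.314, 0.0714]</pre>
</div>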
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Key Points</div> | |
| <div class="callout-content"> | |
| <strong>Polynomial Regression Summary:</strong><br> | |
| 1. Create polynomial features: x → [x, x², x³, ...]<br> | |
| 2. Apply standard linear regression on expanded features<br> | |
| 3. The model is still "linear" in parameters, just non-linear in input<br> | |
| 4. Use cross-validation to choose optimal degree! | |
| </div> | |
| </div> | |
| </div> | |
<h3>Python Code</h3>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
<span style="color: #ff8c6a;">import</span> numpy <span style="color: #ff8c6a;">as</span> np
<span style="color: #ff8c6a;">from</span> sklearn.preprocessing <span style="color: #ff8c6a;">import</span> PolynomialFeatures
<span style="color: #ff8c6a;">from</span> sklearn.linear_model <span style="color: #ff8c6a;">import</span> LinearRegression
<span style="color: #6aa9ff;"># Stopping-distance data from the tables above</span>
X = np.array([[10], [20], [30], [40], [50]])
y = np.array([15, 40, 80, 130, 200])
X_new = np.array([[25], [60]])
<span style="color: #6aa9ff;"># Create polynomial features (degree 2)</span>
poly = PolynomialFeatures(degree=<span style="color: #7ef0d4;">2</span>)
X_poly = poly.fit_transform(X)
<span style="color: #6aa9ff;"># Fit linear regression on polynomial features</span>
model = LinearRegression()
model.fit(X_poly, y)
<span style="color: #6aa9ff;"># Predict</span>
y_pred = model.predict(poly.transform(X_new))</pre>
</div>
</div>
</div>
<!-- Section 3: Gradient Descent -->
<div class="section" id="gradient-descent">
<div class="section-header">
<h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised
- Optimization</span> Gradient Descent</h2>
<button class="section-toggle">▼</button>
</div>
<div class="section-body">
<p>Gradient Descent is the optimization algorithm that helps us find the best values for our model
parameters (like m and c in linear regression). Think of it as rolling a ball downhill to find
the lowest point.</p>
<div class="info-card">
<div class="info-card-title">Key Concepts</div>
<ul class="info-card-list">
<li>Optimization algorithm to minimize loss function</li>
<li>Takes small steps in the direction of steepest descent</li>
<li>Learning rate controls step size</li>
<li>Stops when it reaches the minimum (convergence)</li>
</ul>
</div>
<h3>Understanding Gradient Descent</h3>
<p>Imagine you're hiking down a mountain in thick fog. You can't see the bottom, but you can feel
the slope under your feet. The smart strategy? Always step in the steepest downward direction.
That's exactly what gradient descent does with mathematical functions!</p>
<div class="callout info">
<div class="callout-title">💡 The Mountain Analogy</div>
<div class="callout-content">
Your position on the mountain = current parameter values (m, c)<br>
Your altitude = loss/error<br>
Goal = reach the valley (minimum loss)<br>
Gradient = tells you which direction is steepest
</div>
</div>
<div class="formula">
<strong>Gradient Descent Update Rule:</strong>
θ_new = θ_old - α × ∇J(θ)
<br><small>where:<br>θ = parameters (m, c)<br>α = learning rate (step size)<br>∇J(θ) = gradient
(direction and steepness)</small>
</div>
<h3>The Learning Rate (α)</h3>
<p>The learning rate is like your step size when walking down the mountain:</p>
<ul>
<li><strong>Too small:</strong> You take tiny steps and it takes forever to reach the bottom
</li>
<li><strong>Too large:</strong> You take huge leaps and might jump over the valley or even go
uphill!</li>
<li><strong>Just right:</strong> You make steady progress toward the minimum</li>
</ul>
<div class="figure">
<div class="figure-placeholder" style="height: 400px; position: relative;">
<canvas id="gd-canvas" style="width: 100%; height: 100%;"></canvas>
</div>
<p class="figure-caption"><strong>Figure 2:</strong> Loss surface showing gradient descent path
to minimum</p>
</div>
<div class="controls">
<div class="control-group">
<label>Learning Rate: <span id="lr-val">0.1</span></label>
<input type="range" id="lr-slider" min="0.01" max="1" step="0.01" value="0.1">
</div>
<div class="control-group">
<button class="btn btn-primary" id="run-gd">Run Gradient Descent</button>
<button class="btn btn-secondary" id="reset-gd">Reset</button>
</div>
</div>
<div class="formula">
<strong>Gradients for Linear Regression:</strong>
∂MSE/∂m = (2/n) × Σ(ŷ - y) × x<br>
∂MSE/∂c = (2/n) × Σ(ŷ - y)
<br><small>These tell us how much to adjust m and c</small>
</div>
<h3>Types of Gradient Descent</h3>
<ol>
<li><strong>Batch Gradient Descent:</strong> Uses all data points for each update. Accurate but
slow for large datasets.</li>
<li><strong>Stochastic Gradient Descent (SGD):</strong> Uses one random data point per update.
Fast but noisy.</li>
<li><strong>Mini-batch Gradient Descent:</strong> Uses small batches (e.g., 32 points). Best of
both worlds - see the sketch below!</li>
</ol>
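<p>The three variants differ only in how many samples feed each update. A minimal sketch (an illustrative addition with made-up toy data; the batch count is arbitrary):</p>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

# Toy data generated from y = 3x + 2, so we know the right answer
rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 3 * x + 2
w, b, alpha = 0.0, 0.0, 0.1

for epoch in range(50):
    idx = rng.permutation(len(x))
    # 4 mini-batches of 25; batches of size 1 would be SGD,
    # one batch of all 100 points would be batch gradient descent
    for batch in np.array_split(idx, 4):
        err = (w * x[batch] + b) - y[batch]
        w -= alpha * 2 * np.mean(err * x[batch])  # ∂MSE/∂w
        b -= alpha * 2 * np.mean(err)             # ∂MSE/∂b

print(w, b)  # ≈ 3, 2</pre>
</div>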
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Watch Out!</div> | |
| <div class="callout-content"> | |
| Gradient descent can get stuck in local minima (small valleys) instead of finding the global | |
| minimum (deepest valley). This is more common with complex, non-convex loss functions. | |
| </div> | |
| </div> | |
| <h3>Convergence Criteria</h3> | |
| <p>How do we know when to stop? We stop when:</p> | |
| <ul> | |
| <li>Loss stops decreasing significantly (e.g., change < 0.0001)</li> | |
| <li>Gradients become very small (near zero)</li> | |
| <li>We reach maximum iterations (e.g., 1000 steps)</li> | |
| </ul> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(255, 140, 106, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #ff8c6a; margin-top: 32px;"> | |
| <h3 style="color: #ff8c6a; margin-bottom: 20px;">📐 Complete Mathematical Derivation: Gradient | |
| Descent in Action</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's watch gradient descent optimize a simple | |
| example step-by-step!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem Setup: Finding the Minimum of f(x) = | |
| x²</strong><br><br> | |
| We want to find the value of x that minimizes f(x) = x²<br><br> | |
| <strong>Settings:</strong><br> | |
| • Starting point: x₀ = <strong style="color: #7ef0d4;">4</strong><br> | |
| • Learning rate: α = <strong style="color: #7ef0d4;">0.3</strong><br> | |
| • Goal: Find x that minimizes x² (answer should be x = 0) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Calculate the Gradient (Derivative)</strong><br><br> | |
| The gradient tells us which direction increases the function.<br><br> | |
| f(x) = x²<br> | |
| f'(x) = d/dx (x²) = <strong style="color: #7ef0d4;">2x</strong><br><br> | |
| <strong>Why 2x?</strong><br> | |
| Using the power rule: d/dx (xⁿ) = n × xⁿ⁻¹<br> | |
| So: d/dx (x²) = 2 × x²⁻¹ = 2 × x¹ = 2x | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Apply the Update Rule Iteratively</strong><br><br> | |
| <strong>Update Formula:</strong> x_new = x_old - α × f'(x_old)<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px; text-align: center;">Iteration</th> | |
| <th style="padding: 10px; text-align: center;">x_old</th> | |
| <th style="padding: 10px; text-align: center;">f'(x) = 2x</th> | |
| <th style="padding: 10px; text-align: center;">α × f'(x)</th> | |
| <th style="padding: 10px; text-align: center;">x_new = x_old - α×f'(x)</th> | |
| <th style="padding: 10px; text-align: center;">f(x) = x²</th> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.1);"> | |
| <td style="padding: 8px; text-align: center;"><strong>0 (Start)</strong></td> | |
| <td style="text-align: center;">4.000</td> | |
| <td style="text-align: center;">—</td> | |
| <td style="text-align: center;">—</td> | |
| <td style="text-align: center;">—</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>16.00</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;"><strong>1</strong></td> | |
| <td style="text-align: center;">4.000</td> | |
| <td style="text-align: center;">2×4 = 8</td> | |
| <td style="text-align: center;">0.3×8 = 2.4</td> | |
| <td style="text-align: center; color: #7ef0d4;">4 - 2.4 = <strong>1.600</strong> | |
| </td> | |
| <td style="text-align: center;">2.56</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;"><strong>2</strong></td> | |
| <td style="text-align: center;">1.600</td> | |
| <td style="text-align: center;">2×1.6 = 3.2</td> | |
| <td style="text-align: center;">0.3×3.2 = 0.96</td> | |
| <td style="text-align: center; color: #7ef0d4;">1.6 - 0.96 = <strong>0.640</strong> | |
| </td> | |
| <td style="text-align: center;">0.41</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;"><strong>3</strong></td> | |
| <td style="text-align: center;">0.640</td> | |
| <td style="text-align: center;">2×0.64 = 1.28</td> | |
| <td style="text-align: center;">0.3×1.28 = 0.384</td> | |
| <td style="text-align: center; color: #7ef0d4;">0.64 - 0.384 = | |
| <strong>0.256</strong> | |
| </td> | |
| <td style="text-align: center;">0.066</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;"><strong>4</strong></td> | |
| <td style="text-align: center;">0.256</td> | |
| <td style="text-align: center;">2×0.256 = 0.512</td> | |
| <td style="text-align: center;">0.3×0.512 = 0.154</td> | |
| <td style="text-align: center; color: #7ef0d4;">0.256 - 0.154 = | |
| <strong>0.102</strong> | |
| </td> | |
| <td style="text-align: center;">0.010</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;"><strong>5</strong></td> | |
| <td style="text-align: center;">0.102</td> | |
| <td style="text-align: center;">2×0.102 = 0.205</td> | |
| <td style="text-align: center;">0.3×0.205 = 0.061</td> | |
| <td style="text-align: center; color: #7ef0d4;">0.102 - 0.061 = | |
| <strong>0.041</strong> | |
| </td> | |
| <td style="text-align: center;">0.002</td> | |
| </tr> | |
| <tr style="border-top: 2px solid #7ef0d4; background: rgba(126, 240, 212, 0.1);"> | |
| <td style="padding: 8px; text-align: center;"><strong>...</strong></td> | |
| <td style="text-align: center;">→</td> | |
| <td style="text-align: center;">→</td> | |
| <td style="text-align: center;">→</td> | |
| <td style="text-align: center; color: #7ef0d4; font-size: 16px;"><strong>≈ | |
| 0</strong></td> | |
| <td style="text-align: center; color: #7ef0d4; font-size: 16px;"><strong>≈ | |
| 0</strong></td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Applying to Linear Regression</strong><br><br> | |
| For linear regression y = mx + c, we minimize MSE:<br> | |
| <strong>MSE = (1/n) × Σ(yᵢ - (mxᵢ + c))²</strong><br><br> | |
| <strong>Partial derivatives (gradients):</strong><br> | |
| ∂MSE/∂m = (-2/n) × Σ xᵢ(yᵢ - ŷᵢ)<br> | |
| ∂MSE/∂c = (-2/n) × Σ (yᵢ - ŷᵢ)<br><br> | |
| <strong>Update rules:</strong><br> | |
| m_new = m_old - α × ∂MSE/∂m<br> | |
| c_new = c_old - α × ∂MSE/∂c<br><br> | |
| <em style="color: #a9b4c2;">Each iteration brings m and c closer to optimal values!</em> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Key Insight</div> | |
| <div class="callout-content"> | |
| <strong>Watch what happens:</strong><br> | |
| • Started at x = 4, loss = 16<br> | |
| • After 5 iterations: x ≈ 0.041, loss ≈ 0.002<br> | |
| • <strong style="color: #7ef0d4;">The loss dropped from 16 to 0.002 in just 5 | |
| steps!</strong><br><br> | |
| This is the power of gradient descent - it automatically finds the minimum by following | |
| the steepest path downhill! | |
| </div> | |
| </div> | |
| </div> | |
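<h3>Python Code</h3>
<p>A minimal sketch that reproduces the iteration table above (an illustrative addition; five steps of gradient descent on f(x) = x² with the same starting point and learning rate):</p>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
x = 4.0        # starting point x₀
alpha = 0.3    # learning rate α

for step in range(1, 6):
    grad = 2 * x            # f'(x) = 2x
    x = x - alpha * grad    # update rule: x_new = x_old - α·f'(x_old)
    print(step, round(x, 3), round(x ** 2, 3))

# x shrinks 4.0 → 1.6 → 0.64 → 0.256 → 0.102 → 0.041,
# matching the table row by row</pre>
</div>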
</div>
</div>
<!-- Section 4: Logistic Regression -->
<div class="section" id="logistic-regression">
<div class="section-header">
<h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised
- Classification</span> Logistic Regression</h2>
<button class="section-toggle">▼</button>
</div>
<div class="section-body">
<p>Logistic Regression is used for binary classification - when you want to predict categories
(yes/no, spam/not spam, disease/healthy), not numbers. Despite its name, it's a classification
algorithm!</p>
<div class="info-card">
<div class="info-card-title">Key Concepts</div>
<ul class="info-card-list">
<li>Binary classification (2 classes: 0 or 1)</li>
<li>Uses sigmoid function to output probabilities</li>
<li>Output is always between 0 and 1</li>
<li>Uses log loss (cross-entropy) instead of MSE</li>
</ul>
</div>
<h3>Why Not Linear Regression?</h3>
<p>Imagine using linear regression (y = mx + c) for classification. The problems:</p>
<ul>
<li>Can predict values < 0 or > 1 (not valid probabilities!)</li>
<li>Sensitive to outliers pulling the line</li>
<li>No natural threshold for decision making</li>
</ul>
<div class="callout warning">
<div class="callout-title">⚠️ The Problem</div>
<div class="callout-content">
Linear regression: ŷ = mx + c can give ANY value (-∞ to +∞)<br>
Classification needs: probability between 0 and 1
</div>
</div>
<h3>Enter the Sigmoid Function</h3>
<p>The sigmoid function σ(z) squashes any input into the range [0, 1], making it perfect for
probabilities!</p>
<div class="formula">
<strong>Sigmoid Function:</strong>
σ(z) = 1 / (1 + e^(-z))
<br><small>where:<br>z = w·x + b (linear combination)<br>σ(z) = probability (always between 0
and 1)<br>e ≈ 2.718 (Euler's number)</small>
</div>
<h4>Sigmoid Properties:</h4>
<ul>
<li><strong>Input:</strong> Any real number (-∞ to +∞)</li>
<li><strong>Output:</strong> Always between 0 and 1</li>
<li><strong>Shape:</strong> S-shaped curve</li>
<li><strong>At z=0:</strong> σ(0) = 0.5 (middle point)</li>
<li><strong>As z→∞:</strong> σ(z) → 1</li>
<li><strong>As z→-∞:</strong> σ(z) → 0</li>
</ul>
<div class="figure">
<div class="figure-placeholder" style="height: 350px">
<canvas id="sigmoid-canvas" style="width: 100%; height: 100%;"></canvas>
</div>
<p class="figure-caption"><strong>Figure:</strong> Sigmoid function transforms linear input to
probability</p>
</div>
<h3>Logistic Regression Formula</h3>
<div class="formula">
<strong>Complete Process:</strong>
1. Linear combination: z = w·x + b<br>
2. Sigmoid transformation: p = σ(z) = 1/(1 + e^(-z))<br>
3. Decision: if p ≥ 0.5 → Class 1, else → Class 0
</div>
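<p>In code, the three steps map onto a few lines. A minimal sketch (an illustrative addition; the weights are toy values chosen so the decision boundary sits at 175 cm, not fitted values):</p>
<div class="formula"
style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

def sigmoid(z):
    # Squashes any real z into (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

def predict(x, w, b, threshold=0.5):
    p = sigmoid(w * x + b)           # steps 1-2: linear score, then sigmoid
    return int(p >= threshold), p    # step 3: threshold at 0.5

print(sigmoid(0))                    # 0.5, the midpoint
print(predict(180, w=0.1, b=-17.5))  # (1, 0.62...) - classified as "Tall"</pre>
</div>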
| <h3>Example: Height Classification</h3> | |
| <p>Let's classify people as "Tall" (1) or "Not Tall" (0) based on height:</p> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Height (cm)</th> | |
| <th>Label</th> | |
| <th>Probability</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>150</td> | |
| <td>0 (Not Tall)</td> | |
| <td>0.2</td> | |
| </tr> | |
| <tr> | |
| <td>160</td> | |
| <td>0</td> | |
| <td>0.35</td> | |
| </tr> | |
| <tr> | |
| <td>170</td> | |
| <td>0</td> | |
| <td>0.5</td> | |
| </tr> | |
| <tr> | |
| <td>180</td> | |
| <td>1 (Tall)</td> | |
| <td>0.65</td> | |
| </tr> | |
| <tr> | |
| <td>190</td> | |
| <td>1</td> | |
| <td>0.8</td> | |
| </tr> | |
| <tr> | |
| <td>200</td> | |
| <td>1</td> | |
| <td>0.9</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="logistic-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Logistic regression with decision boundary at | |
| 0.5</p> | |
| </div> | |
| <h3>Log Loss (Cross-Entropy)</h3> | |
| <p>We can't use MSE for logistic regression because it creates a non-convex optimization surface | |
| (multiple local minima). Instead, we use log loss:</p> | |
| <div class="formula"> | |
| <strong>Log Loss for Single Sample:</strong> | |
| L(y, p) = -[y·log(p) + (1-y)·log(1-p)] | |
| <br><small>where:<br>y = actual label (0 or 1)<br>p = predicted probability</small> | |
| </div> | |
| <h4>Understanding Log Loss:</h4> | |
| <p><strong>Case 1:</strong> Actual y=1, Predicted p=0.9</p> | |
| <p>Loss = -[1·log(0.9) + 0·log(0.1)] = -log(0.9) = 0.105 <span style="color: #7ef0d4;">✓ Low loss | |
| (good!)</span></p> | |
| <p><strong>Case 2:</strong> Actual y=1, Predicted p=0.1</p> | |
| <p>Loss = -[1·log(0.1) + 0·log(0.9)] = -log(0.1) = 2.303 <span style="color: #ff8c6a;">✗ High loss | |
| (bad!)</span></p> | |
| <p><strong>Case 3:</strong> Actual y=0, Predicted p=0.1</p> | |
| <p>Loss = -[0·log(0.1) + 1·log(0.9)] = -log(0.9) = 0.105 <span style="color: #7ef0d4;">✓ Low loss | |
| (good!)</span></p> | |
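| <p>A tiny Python sketch reproducing these three cases (using the natural log, as above):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import math | |
| def log_loss(y, p): | |
|     # L(y, p) = -[y·log(p) + (1-y)·log(1-p)], natural log | |
|     return -(y * math.log(p) + (1 - y) * math.log(1 - p)) | |
| print(round(log_loss(1, 0.9), 3))  # 0.105 - confident and correct | |
| print(round(log_loss(1, 0.1), 3))  # 2.303 - confident and wrong | |
| print(round(log_loss(0, 0.1), 3))  # 0.105 - confident and correct</pre> | |
| </div> | |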
| <div class="callout info"> | |
| <div class="callout-title">💡 Why Log Loss Works</div> | |
| <div class="callout-content"> | |
| Log loss heavily penalizes confident wrong predictions! If you predict 0.99 but the answer | |
| is 0, you get a huge penalty. This encourages the model to be accurate AND calibrated. | |
| </div> | |
| </div> | |
| <h3>Training with Gradient Descent</h3> | |
| <p>Just like linear regression, we use gradient descent to optimize weights:</p> | |
| <div class="formula"> | |
| <strong>Gradient for Logistic Regression:</strong> | |
| ∂Loss/∂w = (p - y)·x<br> | |
| ∂Loss/∂b = (p - y) | |
| <br><small>Update: w = w - α·∂Loss/∂w</small> | |
| </div> | |
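| <p>A minimal, illustrative training loop using these gradients; the data matches the walkthrough below and the learning rate is an assumed toy value:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import math | |
| heights, labels = [155, 165, 175, 185], [0, 0, 1, 1] | |
| w, b, alpha = 0.0, 0.0, 0.0001        # alpha = learning rate (assumed) | |
| for epoch in range(10000): | |
|     for x, y in zip(heights, labels): | |
|         p = 1 / (1 + math.exp(-(w * x + b)))  # forward pass | |
|         w -= alpha * (p - y) * x              # dLoss/dw = (p - y)·x | |
|         b -= alpha * (p - y)                  # dLoss/db = (p - y) | |
| print(round(w, 3), round(b, 3))  # w ends positive, b negative: a boundary between the classes</pre> | |
| </div> | |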
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaway</div> | |
| <div class="callout-content"> | |
| Logistic regression = Linear regression + Sigmoid function + Log loss. It's called | |
| "regression" for historical reasons, but it's actually for classification! | |
| </div> | |
| </div> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(126, 240, 212, 0.1), rgba(106, 169, 255, 0.1)); border: 2px solid #7ef0d4; margin-top: 32px;"> | |
| <h3 style="color: #7ef0d4; margin-bottom: 20px;">📐 Complete Mathematical Derivation: Logistic | |
| Regression</h3> | |
| <p style="color: #ff8c6a; font-weight: bold;">Let's walk through the entire process with real | |
| numbers!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Predict if a person is "Tall" based on | |
| height</strong><br><br> | |
| <strong>Training Data:</strong><br> | |
| Person 1: Height = 155 cm → Not Tall (y = 0)<br> | |
| Person 2: Height = 165 cm → Not Tall (y = 0)<br> | |
| Person 3: Height = 175 cm → Tall (y = 1)<br> | |
| Person 4: Height = 185 cm → Tall (y = 1)<br><br> | |
| <strong>Given trained weights:</strong> w = 0.05, b = -8.5 | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Calculate Linear Combination (z)</strong><br><br> | |
| <strong>Formula:</strong> z = w × height + b<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">Height (cm)</th> | |
| <th style="padding: 10px;">z = 0.05 × height - 8.5</th> | |
| <th style="padding: 10px;">z value</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">155</td> | |
| <td style="text-align: center;">0.05 × 155 - 8.5</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-0.75</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">165</td> | |
| <td style="text-align: center;">0.05 × 165 - 8.5</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-0.25</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">175</td> | |
| <td style="text-align: center;">0.05 × 175 - 8.5</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>+0.25</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">185</td> | |
| <td style="text-align: center;">0.05 × 185 - 8.5</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>+0.75</strong></td> | |
| </tr> | |
| </table> | |
| <em style="color: #a9b4c2;">Negative z → likely class 0, Positive z → likely class 1</em> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Apply Sigmoid Function σ(z)</strong><br><br> | |
| <strong>Sigmoid Formula:</strong> σ(z) = 1 / (1 + e⁻ᶻ)<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">z</th> | |
| <th style="padding: 10px;">e⁻ᶻ</th> | |
| <th style="padding: 10px;">1 + e⁻ᶻ</th> | |
| <th style="padding: 10px;">σ(z) = 1/(1+e⁻ᶻ)</th> | |
| <th style="padding: 10px;">Interpretation</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">-0.75</td> | |
| <td style="text-align: center;">e⁰·⁷⁵ = 2.117</td> | |
| <td style="text-align: center;">3.117</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>0.32</strong></td> | |
| <td style="text-align: center;">32% chance tall</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">-0.25</td> | |
| <td style="text-align: center;">e⁰·²⁵ = 1.284</td> | |
| <td style="text-align: center;">2.284</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>0.44</strong></td> | |
| <td style="text-align: center;">44% chance tall</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">+0.25</td> | |
| <td style="text-align: center;">e⁻⁰·²⁵ = 0.779</td> | |
| <td style="text-align: center;">1.779</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.56</strong></td> | |
| <td style="text-align: center;">56% chance tall</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">+0.75</td> | |
| <td style="text-align: center;">e⁻⁰·⁷⁵ = 0.472</td> | |
| <td style="text-align: center;">1.472</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.68</strong></td> | |
| <td style="text-align: center;">68% chance tall</td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Make Predictions (threshold = 0.5)</strong><br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">Height</th> | |
| <th style="padding: 10px;">p = σ(z)</th> | |
| <th style="padding: 10px;">p ≥ 0.5?</th> | |
| <th style="padding: 10px;">Prediction</th> | |
| <th style="padding: 10px;">Actual</th> | |
| <th style="padding: 10px;">Correct?</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">155</td> | |
| <td style="text-align: center;">0.32</td> | |
| <td style="text-align: center;">No</td> | |
| <td style="text-align: center;">0 (Not Tall)</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">165</td> | |
| <td style="text-align: center;">0.44</td> | |
| <td style="text-align: center;">No</td> | |
| <td style="text-align: center;">0 (Not Tall)</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">175</td> | |
| <td style="text-align: center;">0.56</td> | |
| <td style="text-align: center;">Yes</td> | |
| <td style="text-align: center;">1 (Tall)</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">185</td> | |
| <td style="text-align: center;">0.68</td> | |
| <td style="text-align: center;">Yes</td> | |
| <td style="text-align: center;">1 (Tall)</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| </table> | |
| <strong style="color: #7ef0d4; font-size: 16px;">100% accuracy on training data!</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Calculate Log Loss (Cross-Entropy)</strong><br><br> | |
| <strong>Formula:</strong> L = -[y × log(p) + (1-y) × log(1-p)]<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">y (actual)</th> | |
| <th style="padding: 10px;">p (predicted)</th> | |
| <th style="padding: 10px;">Calculation</th> | |
| <th style="padding: 10px;">Loss</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">0</td> | |
| <td style="text-align: center;">0.32</td> | |
| <td style="text-align: center;">-[0×log(0.32) + 1×log(0.68)]</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.39</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">0</td> | |
| <td style="text-align: center;">0.44</td> | |
| <td style="text-align: center;">-[0×log(0.44) + 1×log(0.56)]</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.58</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">1</td> | |
| <td style="text-align: center;">0.56</td> | |
| <td style="text-align: center;">-[1×log(0.56) + 0×log(0.44)]</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.58</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">1</td> | |
| <td style="text-align: center;">0.68</td> | |
| <td style="text-align: center;">-[1×log(0.68) + 0×log(0.32)]</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.39</strong></td> | |
| </tr> | |
| <tr style="border-top: 2px solid #7ef0d4;"> | |
| <td colspan="3" style="padding: 8px;"><strong>Average Log Loss:</strong></td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>(0.39+0.58+0.58+0.39)/4 = | |
| 0.485</strong></td> | |
| </tr> | |
| </table> | |
| </div> | |
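| <p>A short Python sketch that reproduces all four steps above:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import math | |
| # w = 0.05, b = -8.5, exactly as in the four steps above | |
| heights, labels = [155, 165, 175, 185], [0, 0, 1, 1] | |
| w, b, total = 0.05, -8.5, 0.0 | |
| for x, y in zip(heights, labels): | |
|     z = w * x + b                                          # Step 1 | |
|     p = 1 / (1 + math.exp(-z))                             # Step 2 | |
|     pred = 1 if p >= 0.5 else 0                            # Step 3 | |
|     loss = -(y * math.log(p) + (1 - y) * math.log(1 - p))  # Step 4 | |
|     total += loss | |
|     print(f"x={x}  z={z:+.2f}  p={p:.2f}  pred={pred}  y={y}  loss={loss:.2f}") | |
| print(f"Average log loss: {total / 4:.3f}")  # ~0.481 (the table's 0.485 uses rounded per-row losses)</pre> | |
| </div> | |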
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Summary of Logistic Regression Math</div> | |
| <div class="callout-content"> | |
| <strong>The Complete Pipeline:</strong><br> | |
| 1. <strong>Linear:</strong> z = w×x + b (compute a score)<br> | |
| 2. <strong>Sigmoid:</strong> p = 1/(1+e⁻ᶻ) (convert score to probability 0-1)<br> | |
| 3. <strong>Threshold:</strong> if p ≥ 0.5, predict class 1; else predict class 0<br> | |
| 4. <strong>Loss:</strong> Log Loss = -[y×log(p) + (1-y)×log(1-p)]<br> | |
| 5. <strong>Train:</strong> Use gradient descent to minimize total log loss! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 5: Support Vector Machines (COMPREHENSIVE UPDATE) --> | |
| <div class="section" id="svm"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Classification</span> Support Vector Machines (SVM)</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <!-- 1. Introduction --> | |
| <h3>What is SVM?</h3> | |
| <p>Support Vector Machine (SVM) is a powerful supervised machine learning algorithm used for both | |
| classification and regression tasks. Unlike logistic regression, which accepts any boundary that | |
| fits the training data well, SVM finds the BEST decision boundary: the one with the maximum margin | |
| between classes.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Finds the best decision boundary with maximum margin</li> | |
| <li>Support vectors are critical points that define the margin</li> | |
| <li>Score is proportional to distance from boundary</li> | |
| <li>Only support vectors matter - other points don't affect boundary</li> | |
| </ul> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Key Insight</div> | |
| <div class="callout-content"> | |
| SVM doesn't just want w·x + b > 0, it wants every point to be confidently far from the | |
| boundary. The score is directly proportional to the distance from the decision boundary! | |
| </div> | |
| </div> | |
| <!-- 2. Dataset and Example --> | |
| <h3>Dataset and Example</h3> | |
| <p>Let's work with a simple 2D dataset to understand SVM:</p> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Point</th> | |
| <th>X₁</th> | |
| <th>X₂</th> | |
| <th>Class</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>A</strong></td> | |
| <td>2</td> | |
| <td>7</td> | |
| <td>+1</td> | |
| </tr> | |
| <tr> | |
| <td><strong>B</strong></td> | |
| <td>3</td> | |
| <td>8</td> | |
| <td>+1</td> | |
| </tr> | |
| <tr> | |
| <td><strong>C</strong></td> | |
| <td>4</td> | |
| <td>7</td> | |
| <td>+1</td> | |
| </tr> | |
| <tr> | |
| <td><strong>D</strong></td> | |
| <td>6</td> | |
| <td>2</td> | |
| <td>-1</td> | |
| </tr> | |
| <tr> | |
| <td><strong>E</strong></td> | |
| <td>7</td> | |
| <td>3</td> | |
| <td>-1</td> | |
| </tr> | |
| <tr> | |
| <td><strong>F</strong></td> | |
| <td>8</td> | |
| <td>2</td> | |
| <td>-1</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Initial parameters:</strong> w₁ = 1, w₂ = 1, b = -10</p> | |
| <!-- 3. Decision Boundary --> | |
| <h3>Decision Boundary</h3> | |
| <p>The decision boundary is a line (or hyperplane in higher dimensions) that separates the two | |
| classes. It's defined by the equation:</p> | |
| <div class="formula"> | |
| <strong>Decision Boundary Equation:</strong> | |
| w·x + b = 0 | |
| <br><small>where:<br>w = [w₁, w₂] is the weight vector<br>x = [x₁, x₂] is the data point<br>b is | |
| the bias term</small> | |
| </div> | |
| <div class="info-card"> | |
| <div class="info-card-title">Interpretation</div> | |
| <ul class="info-card-list"> | |
| <li><strong>w·x + b > 0</strong> → point above line → class +1</li> | |
| <li><strong>w·x + b < 0</strong> → point below line → class -1</li> | |
| <li><strong>w·x + b = 0</strong> → exactly on boundary</li> | |
| </ul> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="svm-basic-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 3:</strong> SVM decision boundary with 6 data points. | |
| Hover to see scores.</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Adjust w₁: <span id="svm-w1-val">1.0</span></label> | |
| <input type="range" id="svm-w1-slider" min="-2" max="2" step="0.1" value="1"> | |
| </div> | |
| <div class="control-group"> | |
| <label>Adjust w₂: <span id="svm-w2-val">1.0</span></label> | |
| <input type="range" id="svm-w2-slider" min="-2" max="2" step="0.1" value="1"> | |
| </div> | |
| <div class="control-group"> | |
| <label>Adjust b: <span id="svm-b-val">-10</span></label> | |
| <input type="range" id="svm-b-slider" min="-15" max="5" step="0.5" value="-10"> | |
| </div> | |
| </div> | |
| <!-- 4. Margin and Support Vectors --> | |
| <h3>Margin and Support Vectors</h3> | |
| <div class="callout success"> | |
| <div class="callout-title">📏 Understanding Margin</div> | |
| <div class="callout-content"> | |
| The <strong>margin</strong> is the distance between the decision boundary and the closest | |
| points from each class. <strong>Support vectors</strong> are the points exactly at the | |
| margin (with score = ±1). These are the points with "lowest acceptable confidence" and | |
| they're the only ones that matter for defining the boundary! | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Margin Constraints:</strong> | |
| For positive points (yᵢ = +1): w·xᵢ + b ≥ +1<br> | |
| For negative points (yᵢ = -1): w·xᵢ + b ≤ -1<br> | |
| <br> | |
| <strong>Combined:</strong> yᵢ(w·xᵢ + b) ≥ 1<br> | |
| <br> | |
| <strong>Margin Width:</strong> 2/||w|| | |
| <br><small>To maximize margin → minimize ||w||</small> | |
| </div> | |
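| <p>As a sketch, we can check this constraint for the six points above under the demo's initial parameters (w₁ = w₂ = 1, b = -10). Several points still violate it, which is exactly why training is needed:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| # Margin check y·(w·x + b) >= 1 with the demo's initial (untrained) parameters | |
| points = {"A": (2, 7, +1), "B": (3, 8, +1), "C": (4, 7, +1), | |
|           "D": (6, 2, -1), "E": (7, 3, -1), "F": (8, 2, -1)} | |
| w1, w2, b = 1.0, 1.0, -10.0 | |
| for name, (x1, x2, y) in points.items(): | |
|     score = w1 * x1 + w2 * x2 + b | |
|     print(f"{name}: score={score:+.0f}  y*score={y * score:+.0f}  margin ok: {y * score >= 1}") | |
| # A, E and F still violate the constraint - training must adjust w and b</pre> | |
| </div> | |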
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="svm-margin-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 4:</strong> Decision boundary with margin lines and | |
| support vectors highlighted in cyan</p> | |
| </div> | |
| <!-- 5. Hard Margin vs Soft Margin --> | |
| <h3>Hard Margin vs Soft Margin</h3> | |
| <h4>Hard Margin SVM</h4> | |
| <p>Hard margin SVM requires perfect separation - no points can violate the margin. It works only | |
| when data is linearly separable.</p> | |
| <div class="formula"> | |
| <strong>Hard Margin Optimization:</strong> | |
| minimize (1/2)||w||²<br> | |
| subject to: yᵢ(w·xᵢ + b) ≥ 1 for all i | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Hard Margin Limitation</div> | |
| <div class="callout-content"> | |
| Hard margin can lead to overfitting if we force perfect separation on noisy data! Real-world | |
| data often has outliers and noise. | |
| </div> | |
| </div> | |
| <h4>Soft Margin SVM</h4> | |
| <p>Soft margin SVM allows some margin violations, making it more practical for real-world data. It | |
| balances margin maximization with allowing some misclassifications.</p> | |
| <div class="formula"> | |
| <strong>Soft Margin Cost Function:</strong> | |
| Cost = (1/2)||w||² + C·Σ max(0, 1 - yᵢ(w·xᵢ + b)) | |
| <br><small>The first term maximizes the margin (by keeping ||w|| small); the second term is the | |
| hinge loss, which penalizes margin violations.</small> | |
| </div> | |
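| <p>The soft-margin cost is easy to compute directly. A sketch using the six-point dataset above:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| def soft_margin_cost(points, w1, w2, b, C): | |
|     reg = 0.5 * (w1 ** 2 + w2 ** 2)                    # margin term (1/2)||w||² | |
|     hinge = sum(max(0.0, 1 - y * (w1 * x1 + w2 * x2 + b)) | |
|                 for x1, x2, y in points)               # hinge loss term | |
|     return reg + C * hinge | |
| data = [(2, 7, +1), (3, 8, +1), (4, 7, +1), (6, 2, -1), (7, 3, -1), (8, 2, -1)] | |
| print(soft_margin_cost(data, 1.0, 1.0, -10.0, C=1.0))  # 5.0  (reg = 1, hinge = 2+0+0+0+1+1)</pre> | |
| </div> | |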
| <!-- 6. The C Parameter --> | |
| <h3>The C Parameter</h3> | |
| <p>The C parameter controls the trade-off between maximizing the margin and minimizing | |
| classification errors. It acts like regularization in other ML algorithms.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Effects of C Parameter</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Small C (0.1 or 1):</strong> Wider margin, more violations allowed, better | |
| generalization; use when data is noisy</li> | |
| <li><strong>Large C (1000):</strong> Narrower margin, fewer violations, tries to classify every | |
| point correctly, risk of overfitting; use when data is clean</li> | |
| </ul> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="svm-c-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 5:</strong> Effect of C parameter on margin and | |
| violations</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>C Parameter: <span id="svm-c-val">1</span></label> | |
| <input type="range" id="svm-c-slider" min="-1" max="3" step="0.1" value="0"> | |
| <p style="font-size: 12px; color: #7ef0d4; margin-top: 8px;">Slide to see: 0.1 → 1 → 10 → | |
| 1000</p> | |
| </div> | |
| <div style="display: flex; gap: 16px; margin-top: 12px;"> | |
| <div | |
| style="flex: 1; padding: 12px; background: rgba(106, 169, 255, 0.1); border-radius: 8px;"> | |
| <div style="font-size: 12px; color: #a9b4c2;">Margin Width</div> | |
| <div style="font-size: 20px; color: #6aa9ff; font-weight: 600;" id="margin-width">2.00 | |
| </div> | |
| </div> | |
| <div | |
| style="flex: 1; padding: 12px; background: rgba(255, 140, 106, 0.1); border-radius: 8px;"> | |
| <div style="font-size: 12px; color: #a9b4c2;">Violations</div> | |
| <div style="font-size: 20px; color: #ff8c6a; font-weight: 600;" id="violations-count">0 | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- 7. Training Algorithm --> | |
| <h3>Training Algorithm</h3> | |
| <p>SVM can be trained using gradient descent. For each training sample (xᵢ, yᵢ), we check if it | |
| violates the margin and update weights accordingly.</p> | |
| <div class="formula"> | |
| <strong>Update Rules:</strong><br> | |
| <br> | |
| <strong>Case 1: No violation</strong> (yᵢ(w·xᵢ + b) ≥ 1)<br> | |
| w = w - η·w (just regularization)<br> | |
| b = b<br> | |
| <br> | |
| <strong>Case 2: Violation</strong> (yᵢ(w·xᵢ + b) < 1)<br> | |
| w = w - η(w - C·yᵢ·xᵢ)<br> | |
| b = b + η·C·yᵢ<br> | |
| <br> | |
| <small>where η = learning rate (e.g., 0.01)</small> | |
| </div> | |
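| <p>A minimal sketch of these update rules (sub-gradient descent with assumed C = 1 and η = 0.01); the very first update on point A matches the worked calculation shown below:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| data = [(2, 7, +1), (3, 8, +1), (4, 7, +1), (6, 2, -1), (7, 3, -1), (8, 2, -1)] | |
| w1 = w2 = b = 0.0 | |
| eta, C = 0.01, 1.0 | |
| for epoch in range(100): | |
|     for x1, x2, y in data: | |
|         if y * (w1 * x1 + w2 * x2 + b) >= 1:       # Case 1: no violation | |
|             w1 -= eta * w1 | |
|             w2 -= eta * w2 | |
|         else:                                      # Case 2: violation | |
|             w1 -= eta * (w1 - C * y * x1) | |
|             w2 -= eta * (w2 - C * y * x2) | |
|             b += eta * C * y | |
| print(round(w1, 2), round(w2, 2), round(b, 2)) | |
| # The first update (point A, w = [0, 0], b = 0) gives w = [0.02, 0.07], b = 0.01</pre> | |
| </div> | |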
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="svm-train-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 6:</strong> SVM training visualization - step through | |
| each point</p> | |
| </div> | |
| <div class="controls"> | |
| <div style="display: flex; gap: 12px; margin-bottom: 16px;"> | |
| <button class="btn btn-primary" id="svm-train-btn">Start Training</button> | |
| <button class="btn btn-secondary" id="svm-step-btn">Next Step</button> | |
| <button class="btn btn-secondary" id="svm-reset-btn">Reset</button> | |
| </div> | |
| <div id="svm-train-info" | |
| style="padding: 16px; background: #2a3544; border-radius: 8px; font-family: monospace; font-size: 14px;"> | |
| <div>Step: <span id="train-step">0</span> / 6</div> | |
| <div>Current Point: <span id="train-point">-</span></div> | |
| <div>w = [<span id="train-w">0.00, 0.00</span>]</div> | |
| <div>b = <span id="train-b">0.00</span></div> | |
| <div>Violation: <span id="train-violation" style="color: #7ef0d4;">-</span></div> | |
| </div> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">📝 Example Calculation (Point A)</div> | |
| <div class="callout-content"> | |
| <strong>A = (2, 7), y = +1</strong><br><br> | |
| Check: y(w·x + b) = 1(0 + 0 + 0) = 0 < 1 ❌ Violation!<br><br> | |
| Update:<br> | |
| w<sub>new</sub> = [0, 0] - 0.01(0 - 1·1·[2, 7])<br> | |
| = [0.02, 0.07]<br><br> | |
| b<sub>new</sub> = 0 + 0.01·1·1 = 0.01 | |
| </div> | |
| </div> | |
| <!-- 8. SVM Kernels --> | |
| <h3>SVM Kernels (Advanced)</h3> | |
| <p>Real-world data is often not linearly separable. Kernels implicitly map data into a | |
| higher-dimensional space where a linear boundary exists; viewed in the original space, that | |
| boundary appears non-linear!</p> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 The Kernel Trick</div> | |
| <div class="callout-content"> | |
| Kernels let us solve non-linear problems without explicitly computing high-dimensional | |
| features! They compute similarity between points in transformed space efficiently. | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Three Main Kernels:</strong><br> | |
| <br> | |
| <strong>1. Linear Kernel</strong><br> | |
| K(x₁, x₂) = x₁·x₂<br> | |
| Use case: Linearly separable data<br> | |
| <br> | |
| <strong>2. Polynomial Kernel (degree 2)</strong><br> | |
| K(x₁, x₂) = (x₁·x₂ + 1)²<br> | |
| Use case: Curved boundaries, circular patterns<br> | |
| <br> | |
| <strong>3. RBF / Gaussian Kernel</strong><br> | |
| K(x₁, x₂) = e^(-γ||x₁-x₂||²)<br> | |
| Use case: Complex non-linear patterns<br> | |
| Most popular in practice! | |
| </div> | |
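| <p>These three kernels are one-liners in Python. A sketch for 2-D points:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import math | |
| def linear(p, q): | |
|     return p[0] * q[0] + p[1] * q[1] | |
| def polynomial(p, q, degree=2): | |
|     return (linear(p, q) + 1) ** degree | |
| def rbf(p, q, gamma=1.0): | |
|     return math.exp(-gamma * ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)) | |
| a, b = (2, 7), (3, 8) | |
| print(linear(a, b), polynomial(a, b), round(rbf(a, b, gamma=0.1), 3))  # 62 3969 0.819</pre> | |
| </div> | |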
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 500px"> | |
| <canvas id="svm-kernel-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 7:</strong> Kernel comparison on non-linear data</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Select Kernel:</label> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="kernel" value="linear" checked> Linear</label> | |
| <label><input type="radio" name="kernel" value="polynomial"> Polynomial</label> | |
| <label><input type="radio" name="kernel" value="rbf"> RBF</label> | |
| </div> | |
| </div> | |
| <div class="control-group" id="kernel-param-group" style="display: none;"> | |
| <label>Kernel Parameter (γ or degree): <span id="kernel-param-val">1</span></label> | |
| <input type="range" id="kernel-param-slider" min="0.1" max="5" step="0.1" value="1"> | |
| </div> | |
| </div> | |
| <!-- 9. Key Formulas Summary --> | |
| <h3>Key Formulas Summary</h3> | |
| <div class="formula"> | |
| <strong>Essential SVM Formulas:</strong><br> | |
| <br> | |
| 1. Decision Boundary: w·x + b = 0<br> | |
| <br> | |
| 2. Classification Rule: sign(w·x + b)<br> | |
| <br> | |
| 3. Margin Width: 2/||w||<br> | |
| <br> | |
| 4. Hard Margin Optimization:<br> | |
| minimize (1/2)||w||²<br> | |
| subject to yᵢ(w·xᵢ + b) ≥ 1<br> | |
| <br> | |
| 5. Soft Margin Cost:<br> | |
| (1/2)||w||² + C·Σ max(0, 1 - yᵢ(w·xᵢ + b))<br> | |
| <br> | |
| 6. Hinge Loss: max(0, 1 - yᵢ(w·xᵢ + b))<br> | |
| <br> | |
| 7. Update Rules (if violation):<br> | |
| w = w - η(w - C·yᵢ·xᵢ)<br> | |
| b = b + η·C·yᵢ<br> | |
| <br> | |
| 8. Kernel Functions:<br> | |
| Linear: K(x₁, x₂) = x₁·x₂<br> | |
| Polynomial: K(x₁, x₂) = (x₁·x₂ + 1)^d<br> | |
| RBF: K(x₁, x₂) = e^(-γ||x₁-x₂||²) | |
| </div> | |
| <!-- 10. Practical Insights --> | |
| <h3>Practical Insights</h3> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Why SVM is Powerful</div> | |
| <div class="callout-content"> | |
| SVM only cares about support vectors - the points closest to the boundary. Other points | |
| don't affect the decision boundary at all! This makes it memory efficient and robust. | |
| </div> | |
| </div> | |
| <div class="info-card"> | |
| <div class="info-card-title">When to Use SVM</div> | |
| <ul class="info-card-list"> | |
| <li>Small to medium datasets (works great up to ~10,000 samples)</li> | |
| <li>High-dimensional data (even more features than samples!)</li> | |
| <li>Clear margin of separation exists between classes</li> | |
| <li>Need interpretable decision boundary</li> | |
| </ul> | |
| </div> | |
| <h4>Advantages</h4> | |
| <ul> | |
| <li><strong>Effective in high dimensions:</strong> Works well even when features > samples | |
| </li> | |
| <li><strong>Memory efficient:</strong> Only stores support vectors, not entire dataset</li> | |
| <li><strong>Versatile:</strong> Different kernels for different data patterns</li> | |
| <li><strong>Robust:</strong> Works well with clear margin of separation</li> | |
| </ul> | |
| <h4>Disadvantages</h4> | |
| <ul> | |
| <li><strong>Slow on large datasets:</strong> Training time grows quickly with >10k samples | |
| </li> | |
| <li><strong>No probability estimates:</strong> Doesn't directly provide confidence scores</li> | |
| <li><strong>Kernel choice:</strong> Requires expertise to select right kernel</li> | |
| <li><strong>Feature scaling:</strong> Very sensitive to feature scales</li> | |
| </ul> | |
| <!-- 11. Real-World Example --> | |
| <h3>Real-World Example: Email Spam Classification</h3> | |
| <div class="info-card"> | |
| <div class="info-card-title">📧 Email Spam Detection</div> | |
| <p style="margin: 12px 0; line-height: 1.6;">Imagine we have emails with two features:</p> | |
| <ul class="info-card-list"> | |
| <li>x₁ = number of promotional words ("free", "buy", "limited")</li> | |
| <li>x₂ = number of capital letters</li> | |
| </ul> | |
| <p style="margin: 12px 0; line-height: 1.6;"> | |
| SVM finds the widest "road" between spam and non-spam emails. Support vectors are the emails | |
| closest to this road - they're the trickiest cases that define our boundary! An email far | |
| from the boundary is clearly spam or clearly legitimate. | |
| </p> | |
| </div> | |
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.svm <span style="color: #ff8c6a;">import</span> SVC | |
| <span style="color: #ff8c6a;">from</span> sklearn.preprocessing <span style="color: #ff8c6a;">import</span> StandardScaler | |
| <span style="color: #ff8c6a;">from</span> sklearn.model_selection <span style="color: #ff8c6a;">import</span> train_test_split | |
| <span style="color: #6aa9ff;"># Assuming X (features) and y (labels) are already loaded</span> | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=<span style="color: #7ef0d4;">0.2</span>, random_state=<span style="color: #7ef0d4;">42</span>) | |
| <span style="color: #6aa9ff;"># Scale features (very important for SVM!)</span> | |
| scaler = StandardScaler() | |
| X_train_scaled = scaler.fit_transform(X_train) | |
| X_test_scaled = scaler.transform(X_test) | |
| <span style="color: #6aa9ff;"># Create SVM with RBF kernel</span> | |
| svm = SVC( | |
| kernel=<span style="color: #7ef0d4;">'rbf'</span>, <span style="color: #6aa9ff;"># Options: 'linear', 'poly', 'rbf'</span> | |
| C=<span style="color: #7ef0d4;">1.0</span>, <span style="color: #6aa9ff;"># Regularization parameter</span> | |
| gamma=<span style="color: #7ef0d4;">'scale'</span> <span style="color: #6aa9ff;"># Kernel coefficient</span> | |
| ) | |
| <span style="color: #6aa9ff;"># Train</span> | |
| svm.fit(X_train_scaled, y_train) | |
| <span style="color: #6aa9ff;"># Predict</span> | |
| predictions = svm.predict(X_test_scaled) | |
| <span style="color: #6aa9ff;"># Get support vectors</span> | |
| <span style="color: #ff8c6a;">print</span>(f<span style="color: #7ef0d4;">"Number of support vectors: {len(svm.support_vectors_)}"</span>)</pre> | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">🎯 Key Takeaway</div> | |
| <div class="callout-content"> | |
| Unlike other algorithms that try to classify all points correctly, SVM focuses on the | |
| decision boundary. It asks: "What's the safest road I can build between these two groups?" | |
| The answer: Make it as wide as possible! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 6: K-Nearest Neighbors --> | |
| <div class="section" id="knn"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Classification</span> K-Nearest Neighbors (KNN)</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>K-Nearest Neighbors is the simplest machine learning algorithm! To classify a new point, just | |
| look at its K nearest neighbors and take a majority vote. No training required!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Lazy learning: No training phase, just memorize data</li> | |
| <li>K = number of neighbors to consider</li> | |
| <li>Uses distance metrics (Euclidean, Manhattan)</li> | |
| <li>Classification: majority vote | Regression: average</li> | |
| </ul> | |
| </div> | |
| <h3>How KNN Works</h3> | |
| <ol> | |
| <li><strong>Choose K:</strong> Decide how many neighbors (e.g., K=3)</li> | |
| <li><strong>Calculate distance:</strong> Find distance from new point to all training points | |
| </li> | |
| <li><strong>Find K nearest:</strong> Select K points with smallest distances</li> | |
| <li><strong>Vote:</strong> Majority class wins (or take average for regression)</li> | |
| </ol> | |
| <h3>Distance Metrics</h3> | |
| <div class="formula"> | |
| <strong>Euclidean Distance (straight line):</strong> | |
| d = √[(x₁-x₂)² + (y₁-y₂)²] | |
| <br><small>Like measuring with a ruler - shortest path</small> | |
| </div> | |
| <div class="formula"> | |
| <strong>Manhattan Distance (city blocks):</strong> | |
| d = |x₁-x₂| + |y₁-y₂| | |
| <br><small>Like walking on city grid - only horizontal/vertical</small> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="knn-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> KNN classification - drag the test point to | |
| see predictions</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>K Value: <span id="knn-k-val">3</span></label> | |
| <input type="range" id="knn-k-slider" min="1" max="7" step="2" value="3"> | |
| </div> | |
| <div class="control-group"> | |
| <label>Distance Metric:</label> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="knn-distance" value="euclidean" checked> | |
| Euclidean</label> | |
| <label><input type="radio" name="knn-distance" value="manhattan"> Manhattan</label> | |
| </div> | |
| </div> | |
| </div> | |
| <h3>Worked Example</h3> | |
| <p><strong>Test point at (2.5, 2.5), K=3:</strong></p> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Point</th> | |
| <th>Position</th> | |
| <th>Class</th> | |
| <th>Distance</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>A</td> | |
| <td>(1.0, 2.0)</td> | |
| <td>Orange</td> | |
| <td>1.58</td> | |
| </tr> | |
| <tr> | |
| <td>B</td> | |
| <td>(0.9, 1.7)</td> | |
| <td>Orange</td> | |
| <td>1.79</td> | |
| </tr> | |
| <tr style="background: rgba(126, 240, 212, 0.1);"> | |
| <td><strong>C</strong></td> | |
| <td>(1.5, 2.5)</td> | |
| <td>Orange</td> | |
| <td><strong>1.00 ← nearest!</strong></td> | |
| </tr> | |
| <tr> | |
| <td>D</td> | |
| <td>(4.0, 5.0)</td> | |
| <td>Yellow</td> | |
| <td>2.92</td> | |
| </tr> | |
| <tr> | |
| <td>E</td> | |
| <td>(4.2, 4.8)</td> | |
| <td>Yellow</td> | |
| <td>2.86</td> | |
| </tr> | |
| <tr> | |
| <td>F</td> | |
| <td>(3.8, 5.2)</td> | |
| <td>Yellow</td> | |
| <td>3.00</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>3-Nearest Neighbors:</strong> C (orange), A (orange), B (orange)</p> | |
| <p><strong>Vote:</strong> 3 orange, 0 yellow → <strong>Prediction: Orange</strong> 🟠</p> | |
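| <p>A from-scratch sketch that reproduces this worked example (and its distances):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import math | |
| # Same data, test point and K as the worked example above | |
| train = [("A", 1.0, 2.0, "Orange"), ("B", 0.9, 1.7, "Orange"), | |
|          ("C", 1.5, 2.5, "Orange"), ("D", 4.0, 5.0, "Yellow"), | |
|          ("E", 4.2, 4.8, "Yellow"), ("F", 3.8, 5.2, "Yellow")] | |
| tx, ty, k = 2.5, 2.5, 3 | |
| dists = sorted((math.hypot(x - tx, y - ty), name, cls) | |
|                for name, x, y, cls in train) | |
| neighbors = dists[:k]                 # C (1.00), A (1.58), B (1.79) | |
| votes = {} | |
| for _, _, cls in neighbors: | |
|     votes[cls] = votes.get(cls, 0) + 1 | |
| print(max(votes, key=votes.get))      # Orange</pre> | |
| </div> | |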
| <h3>Choosing K</h3> | |
| <ul> | |
| <li><strong>K=1:</strong> Very sensitive to noise, overfits</li> | |
| <li><strong>Small K (3,5):</strong> Flexible boundaries, can capture local patterns</li> | |
| <li><strong>Large K (>10):</strong> Smoother boundaries, more stable but might underfit</li> | |
| <li><strong>Odd K:</strong> Avoids ties in binary classification</li> | |
| <li><strong>Rule of thumb:</strong> K = √n (where n = number of training samples)</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Critical: Feature Scaling!</div> | |
| <div class="callout-content"> | |
| Always scale features before using KNN! If one feature has range [0, 1000] and another [0, | |
| 1], the large feature dominates distance calculations. Use StandardScaler or MinMaxScaler. | |
| </div> | |
| </div> | |
| <h3>Advantages</h3> | |
| <ul> | |
| <li>✓ Simple to understand and implement</li> | |
| <li>✓ No training time (just stores data)</li> | |
| <li>✓ Works with any number of classes</li> | |
| <li>✓ Can learn complex decision boundaries</li> | |
| <li>✓ Naturally handles multi-class problems</li> | |
| </ul> | |
| <h3>Disadvantages</h3> | |
| <ul> | |
| <li>✗ Slow prediction (compares to ALL training points)</li> | |
| <li>✗ High memory usage (stores entire dataset)</li> | |
| <li>✗ Sensitive to feature scaling</li> | |
| <li>✗ Curse of dimensionality (struggles with many features)</li> | |
| <li>✗ Sensitive to irrelevant features</li> | |
| </ul> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 When to Use KNN</div> | |
| <div class="callout-content"> | |
| KNN works best with small to medium datasets (<10,000 samples) with few features | |
| (<20). Great for recommendation systems, pattern recognition, and as a baseline to | |
| compare other models! | |
| </div> | |
| </div> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(255, 180, 144, 0.1), rgba(106, 169, 255, 0.1)); border: 2px solid #ffb490; margin-top: 32px;"> | |
| <h3 style="color: #ffb490; margin-bottom: 20px;">📐 Complete Mathematical Derivation: KNN | |
| Classification</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's classify a new point step-by-step with | |
| actual calculations!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Classify a new fruit</strong><br><br> | |
| <strong>Training Data:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Fruit</th> | |
| <th style="padding: 8px;">Weight (g)</th> | |
| <th style="padding: 8px;">Size (cm)</th> | |
| <th style="padding: 8px;">Class</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">A</td> | |
| <td>140</td> | |
| <td>7</td> | |
| <td style="color: #ff8c6a;">Apple</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px;">B</td> | |
| <td>150</td> | |
| <td>7.5</td> | |
| <td style="color: #ff8c6a;">Apple</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">C</td> | |
| <td>180</td> | |
| <td>9</td> | |
| <td style="color: #7ef0d4;">Orange</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px;">D</td> | |
| <td>200</td> | |
| <td>10</td> | |
| <td style="color: #7ef0d4;">Orange</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">E</td> | |
| <td>160</td> | |
| <td>8</td> | |
| <td style="color: #7ef0d4;">Orange</td> | |
| </tr> | |
| </table> | |
| <strong>New point to classify:</strong> Weight = 165g, Size = 8.5cm<br> | |
| <strong>Using K = 3</strong> (3 nearest neighbors) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Calculate Euclidean Distance to ALL | |
| Points</strong><br><br> | |
| <strong>Distance Formula:</strong> d = √[(x₂-x₁)² + (y₂-y₁)²]<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Point</th> | |
| <th style="padding: 8px;">Calculation</th> | |
| <th style="padding: 8px;">Distance</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;"><strong>A</strong></td> | |
| <td>√[(165-140)² + (8.5-7)²] = √[625 + 2.25]</td> | |
| <td style="color: #ff8c6a;"><strong>25.04</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px;"><strong>B</strong></td> | |
| <td>√[(165-150)² + (8.5-7.5)²] = √[225 + 1]</td> | |
| <td style="color: #ff8c6a;"><strong>15.03</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;"><strong>C</strong></td> | |
| <td>√[(165-180)² + (8.5-9)²] = √[225 + 0.25]</td> | |
| <td style="color: #7ef0d4;"><strong>15.01</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px;"><strong>D</strong></td> | |
| <td>√[(165-200)² + (8.5-10)²] = √[1225 + 2.25]</td> | |
| <td style="color: #ff8c6a;"><strong>35.03</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;"><strong>E</strong></td> | |
| <td>√[(165-160)² + (8.5-8)²] = √[25 + 0.25]</td> | |
| <td style="color: #7ef0d4;"><strong>5.02</strong></td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Find K=3 Nearest Neighbors</strong><br><br> | |
| <strong>Sort by distance:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Rank</th> | |
| <th style="padding: 8px;">Point</th> | |
| <th style="padding: 8px;">Distance</th> | |
| <th style="padding: 8px;">Class</th> | |
| <th style="padding: 8px;">Include?</th> | |
| </tr> | |
| <tr style="background: rgba(126, 240, 212, 0.1);"> | |
| <td style="padding: 6px;">1st</td> | |
| <td><strong>E</strong></td> | |
| <td>5.02</td> | |
| <td style="color: #7ef0d4;">Orange</td> | |
| <td style="color: #7ef0d4;"><strong>✓ Yes</strong></td> | |
| </tr> | |
| <tr style="background: rgba(126, 240, 212, 0.1);"> | |
| <td style="padding: 6px;">2nd</td> | |
| <td><strong>C</strong></td> | |
| <td>15.01</td> | |
| <td style="color: #7ef0d4;">Orange</td> | |
| <td style="color: #7ef0d4;"><strong>✓ Yes</strong></td> | |
| </tr> | |
| <tr style="background: rgba(126, 240, 212, 0.1);"> | |
| <td style="padding: 6px;">3rd</td> | |
| <td><strong>B</strong></td> | |
| <td>15.03</td> | |
| <td style="color: #ff8c6a;">Apple</td> | |
| <td style="color: #7ef0d4;"><strong>✓ Yes</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px;">4th</td> | |
| <td>A</td> | |
| <td>25.04</td> | |
| <td>Apple</td> | |
| <td style="color: #ff8c6a;">✗ No</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px;">5th</td> | |
| <td>D</td> | |
| <td>35.03</td> | |
| <td>Orange</td> | |
| <td style="color: #ff8c6a;">✗ No</td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Vote Among K Neighbors</strong><br><br> | |
| <strong>K=3 Neighbors:</strong><br> | |
| • E: <span style="color: #7ef0d4;">Orange</span> (1 vote)<br> | |
| • C: <span style="color: #7ef0d4;">Orange</span> (1 vote)<br> | |
| • B: <span style="color: #ff8c6a;">Apple</span> (1 vote)<br><br> | |
| <strong>Final Vote Count:</strong><br> | |
| • Orange: <span style="color: #7ef0d4;"><strong>2 votes</strong></span><br> | |
| • Apple: <span style="color: #ff8c6a;">1 vote</span><br><br> | |
| <strong style="color: #7ef0d4; font-size: 20px;">🍊 Prediction: ORANGE (majority | |
| wins!)</strong> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ KNN Math Summary</div> | |
| <div class="callout-content"> | |
| <strong>The KNN Algorithm:</strong><br> | |
| 1. <strong>Calculate distance</strong> from new point to ALL training points<br> | |
| 2. <strong>Sort</strong> distances from smallest to largest<br> | |
| 3. <strong>Pick K</strong> nearest neighbors<br> | |
| 4. <strong>Vote:</strong> Classification = majority class, Regression = average | |
| value<br><br> | |
| <em>Note: Always normalize features first! Weight (100s) would dominate Size (10s) | |
| otherwise!</em> | |
| </div> | |
| </div> | |
| </div> | |
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.neighbors <span style="color: #ff8c6a;">import</span> KNeighborsClassifier | |
| <span style="color: #ff8c6a;">from</span> sklearn.preprocessing <span style="color: #ff8c6a;">import</span> StandardScaler | |
| <span style="color: #ff8c6a;">from</span> sklearn.model_selection <span style="color: #ff8c6a;">import</span> train_test_split | |
| <span style="color: #6aa9ff;"># Assuming X (features) and y (labels) are already loaded</span> | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=<span style="color: #7ef0d4;">0.2</span>, random_state=<span style="color: #7ef0d4;">42</span>) | |
| <span style="color: #6aa9ff;"># Scale features (essential for KNN!)</span> | |
| scaler = StandardScaler() | |
| X_train_scaled = scaler.fit_transform(X_train) | |
| X_test_scaled = scaler.transform(X_test) | |
| <span style="color: #6aa9ff;"># Create KNN classifier</span> | |
| knn = KNeighborsClassifier( | |
| n_neighbors=<span style="color: #7ef0d4;">5</span>, <span style="color: #6aa9ff;"># Number of neighbors (K)</span> | |
| metric=<span style="color: #7ef0d4;">'euclidean'</span>, <span style="color: #6aa9ff;"># Distance metric</span> | |
| weights=<span style="color: #7ef0d4;">'uniform'</span> <span style="color: #6aa9ff;"># 'uniform' or 'distance'</span> | |
| ) | |
| <span style="color: #6aa9ff;"># Train (just stores the data!)</span> | |
| knn.fit(X_train_scaled, y_train) | |
| <span style="color: #6aa9ff;"># Predict</span> | |
| predictions = knn.predict(X_test_scaled) | |
| <span style="color: #6aa9ff;"># Get probabilities</span> | |
| probas = knn.predict_proba(X_test_scaled)</pre> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="model-evaluation"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Evaluation</span> Model Evaluation</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>How do we know if our model is good? Model evaluation provides metrics to measure performance and | |
| identify problems!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Metrics</div> | |
| <ul class="info-card-list"> | |
| <li>Confusion Matrix: Shows all prediction outcomes</li> | |
| <li>Accuracy, Precision, Recall, F1-Score</li> | |
| <li>ROC Curve & AUC: Performance across thresholds</li> | |
| <li>R² Score: For regression problems</li> | |
| </ul> | |
| </div> | |
| <h3>Confusion Matrix</h3> | |
| <p>The confusion matrix shows all possible outcomes of binary classification:</p> | |
| <div class="formula"> | |
| <strong>Confusion Matrix Structure:</strong> | |
| <pre style="background: none; border: none; padding: 0;"> | |
|                 Predicted | |
|                 Pos    Neg | |
| Actual   Pos    TP     FN | |
|          Neg    FP     TN</pre> | |
| </div> | |
| <h4>Definitions:</h4> | |
| <ul> | |
| <li><strong>True Positive (TP):</strong> Correctly predicted positive</li> | |
| <li><strong>True Negative (TN):</strong> Correctly predicted negative</li> | |
| <li><strong>False Positive (FP):</strong> Wrongly predicted positive (Type I error)</li> | |
| <li><strong>False Negative (FN):</strong> Wrongly predicted negative (Type II error)</li> | |
| </ul> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 300px"> | |
| <canvas id="confusion-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Confusion matrix for spam detection (TP=600, | |
| FP=100, FN=300, TN=900)</p> | |
| </div> | |
| <h3>Classification Metrics</h3> | |
| <div class="formula"> | |
| <strong>Accuracy:</strong> | |
| Accuracy = (TP + TN) / (TP + TN + FP + FN) | |
| <br><small>Percentage of correct predictions overall</small> | |
| </div> | |
| <p><strong>Example:</strong> (600 + 900) / (600 + 900 + 100 + 300) = 1500/1900 = <strong>0.789 | |
| (78.9%)</strong></p> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Accuracy Paradox</div> | |
| <div class="callout-content"> | |
| Accuracy is misleading on imbalanced data! If 99% of emails are not spam, a model that always | |
| predicts "not spam" gets 99% accuracy but is useless! | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Precision:</strong> | |
| Precision = TP / (TP + FP) | |
| <br><small>"Of all predicted positives, how many are actually positive?"</small> | |
| </div> | |
| <p><strong>Example:</strong> 600 / (600 + 100) = 600/700 = <strong>0.857 (85.7%)</strong></p> | |
| <p><strong>Use when:</strong> False positives are costly (e.g., spam filter - don't want to block | |
| legitimate emails)</p> | |
| <div class="formula"> | |
| <strong>Recall (Sensitivity, TPR):</strong> | |
| Recall = TP / (TP + FN) | |
| <br><small>"Of all actual positives, how many did we catch?"</small> | |
| </div> | |
| <p><strong>Example:</strong> 600 / (600 + 300) = 600/900 = <strong>0.667 (66.7%)</strong></p> | |
| <p><strong>Use when:</strong> False negatives are costly (e.g., disease detection - can't miss sick | |
| patients)</p> | |
| <div class="formula"> | |
| <strong>F1-Score:</strong> | |
| F1 = 2 × (Precision × Recall) / (Precision + Recall) | |
| <br><small>Harmonic mean - balances precision and recall</small> | |
| </div> | |
| <p><strong>Example:</strong> 2 × (0.857 × 0.667) / (0.857 + 0.667) = <strong>0.750 (75.0%)</strong> | |
| </p> | |
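| <p>All four metrics follow in a few lines from the confusion-matrix counts above:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| TP, FP, FN, TN = 600, 100, 300, 900   # spam-detection example above | |
| accuracy = (TP + TN) / (TP + TN + FP + FN) | |
| precision = TP / (TP + FP) | |
| recall = TP / (TP + FN) | |
| f1 = 2 * precision * recall / (precision + recall) | |
| print(f"Accuracy:  {accuracy:.3f}")   # 0.789 | |
| print(f"Precision: {precision:.3f}")  # 0.857 | |
| print(f"Recall:    {recall:.3f}")     # 0.667 | |
| print(f"F1-score:  {f1:.3f}")         # 0.750</pre> | |
| </div> | |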
| <h3>ROC Curve & AUC</h3> | |
| <p>The ROC (Receiver Operating Characteristic) curve shows model performance across ALL possible | |
| thresholds!</p> | |
| <div class="formula"> | |
| <strong>ROC Components:</strong> | |
| TPR (True Positive Rate) = TP / (TP + FN) = Recall<br> | |
| FPR (False Positive Rate) = FP / (FP + TN) | |
| <br><small>Plot: FPR (x-axis) vs TPR (y-axis)</small> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="roc-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> ROC curve - slide threshold to see trade-off | |
| </p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Classification Threshold: <span id="roc-threshold-val">0.5</span></label> | |
| <input type="range" id="roc-threshold-slider" min="0" max="1" step="0.1" value="0.5"> | |
| </div> | |
| </div> | |
| <h4>Understanding ROC:</h4> | |
| <ul> | |
| <li><strong>Top-left corner (0, 1):</strong> Perfect classifier</li> | |
| <li><strong>Diagonal line:</strong> Random guessing</li> | |
| <li><strong>Above diagonal:</strong> Better than random</li> | |
| <li><strong>Below diagonal:</strong> Worse than random (invert predictions!)</li> | |
| </ul> | |
| <div class="formula"> | |
| <strong>AUC (Area Under Curve):</strong> | |
| AUC = Area under ROC curve | |
| <br><small>AUC = 1.0: Perfect | AUC = 0.5: Random | AUC > 0.8: Good</small> | |
| </div> | |
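| <p>In practice the curve and AUC usually come from a library. A small sketch using scikit-learn's metrics; the four labels and scores here are just toy values:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| from sklearn.metrics import roc_curve, roc_auc_score | |
| y_true = [0, 0, 1, 1]                 # actual labels (toy values) | |
| y_scores = [0.1, 0.4, 0.35, 0.8]      # predicted probabilities (toy values) | |
| fpr, tpr, thresholds = roc_curve(y_true, y_scores) | |
| print(fpr, tpr)                       # one (FPR, TPR) point per threshold | |
| print(roc_auc_score(y_true, y_scores))  # 0.75 for this tiny example</pre> | |
| </div> | |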
| <h3>Regression Metrics: R² Score</h3> | |
| <p>For regression problems, R² (coefficient of determination) measures how well the model explains | |
| variance:</p> | |
| <div class="formula"> | |
| <strong>R² Formula:</strong> | |
| R² = 1 - (SS_res / SS_tot)<br> | |
| <br> | |
| SS_res = Σ(y - ŷ)² (sum of squared residuals)<br> | |
| SS_tot = Σ(y - ȳ)² (total sum of squares)<br> | |
| <br><small>ȳ = mean of actual values</small> | |
| </div> | |
| <h4>Interpreting R²:</h4> | |
| <ul> | |
| <li><strong>R² = 1.0:</strong> Perfect fit (model explains 100% of variance)</li> | |
| <li><strong>R² = 0.7:</strong> Model explains 70% of variance (pretty good!)</li> | |
| <li><strong>R² = 0.0:</strong> Model no better than just using the mean</li> | |
| <li><strong>R² < 0:</strong> Model worse than mean (something's very wrong!)</li> | |
| </ul> | |
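| <p>R² is straightforward to compute from its definition. A sketch with assumed toy values:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| y_actual = [50, 60, 70, 80]           # assumed toy values | |
| y_pred = [52, 58, 71, 79] | |
| mean_y = sum(y_actual) / len(y_actual) | |
| ss_res = sum((y - yh) ** 2 for y, yh in zip(y_actual, y_pred))   # = 10 | |
| ss_tot = sum((y - mean_y) ** 2 for y in y_actual)                # = 500 | |
| print(1 - ss_res / ss_tot)            # 0.98: model explains ~98% of the variance</pre> | |
| </div> | |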
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 350px"> | |
| <canvas id="r2-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> R² calculation on height-weight regression | |
| </p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Choosing the Right Metric</div> | |
| <div class="callout-content"> | |
| <strong>Balanced data:</strong> Use accuracy<br> | |
| <strong>Imbalanced data:</strong> Use F1-score, precision, or recall<br> | |
| <strong>Medical diagnosis:</strong> Prioritize recall (catch all diseases)<br> | |
| <strong>Spam filter:</strong> Prioritize precision (don't block legitimate emails)<br> | |
| <strong>Regression:</strong> Use R², RMSE, or MAE | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="regularization"> | |
| <div class="section-header"> | |
| <h2>Regularization</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Regularization prevents overfitting by penalizing complex models. It adds a "simplicity | |
| constraint" to force the model to generalize better!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Prevents overfitting by penalizing large coefficients</li> | |
| <li>L1 (Lasso): Drives coefficients to zero, feature selection</li> | |
| <li>L2 (Ridge): Shrinks coefficients proportionally</li> | |
| <li>λ controls penalty strength</li> | |
| </ul> | |
| </div> | |
| <h3>The Overfitting Problem</h3> | |
| <p>Without regularization, models can learn training data TOO well:</p> | |
| <ul> | |
| <li>Captures noise instead of patterns</li> | |
| <li>High training accuracy, poor test accuracy</li> | |
| <li>Large coefficient values</li> | |
| <li>Model too complex for the problem</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Overfitting Example</div> | |
| <div class="callout-content"> | |
| Imagine fitting a 10th-degree polynomial to 12 data points. It perfectly fits training data | |
| (even noise) but fails on new data. Regularization prevents this! | |
| </div> | |
| </div> | |
| <h3>The Regularization Solution</h3> | |
| <p>Instead of minimizing just the loss, we minimize: <strong>Loss + Penalty</strong></p> | |
| <div class="formula"> | |
| <strong>Regularized Cost Function:</strong> | |
| Cost = Loss + λ × Penalty(θ) | |
| <br><small>where:<br>θ = model parameters (weights)<br>λ = regularization strength<br>Penalty = | |
| function of parameter magnitudes</small> | |
| </div> | |
| <h3>L1 Regularization (Lasso)</h3> | |
| <div class="formula"> | |
| <strong>L1 Penalty:</strong> | |
| Cost = MSE + λ × Σ|θᵢ| | |
| <br><small>Sum of absolute values of coefficients</small> | |
| </div> | |
| <h4>L1 Effects:</h4> | |
| <ul> | |
| <li><strong>Feature selection:</strong> Drives coefficients to exactly 0</li> | |
| <li><strong>Sparse models:</strong> Only important features remain</li> | |
| <li><strong>Interpretable:</strong> Easy to see which features matter</li> | |
| <li><strong>Use when:</strong> Many features, few are important</li> | |
| </ul> | |
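| <p>These effects are easy to see with scikit-learn's Lasso. A sketch on synthetic (assumed) data, where sklearn's alpha plays the role of λ:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.linear_model import Lasso | |
| rng = np.random.default_rng(0) | |
| X = rng.normal(size=(100, 4))         # 4 features; only the first two matter | |
| y = 3.0 * X[:, 0] + 1.5 * X[:, 1] + rng.normal(scale=0.1, size=100) | |
| lasso = Lasso(alpha=0.5)              # alpha plays the role of lambda | |
| lasso.fit(X, y) | |
| print(lasso.coef_)                    # last two coefficients driven to exactly 0.0</pre> | |
| </div> | |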
| <!-- COMPREHENSIVE L1 MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(106, 169, 255, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #6aa9ff; margin-top: 32px;"> | |
| <h3 style="color: #6aa9ff; margin-bottom: 20px;">📐 L1 Regularization: Complete Mathematical | |
| Walkthrough</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's see how L1 drives coefficients to ZERO!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Predicting House Price with 4 | |
| Features</strong><br><br> | |
| <strong>Dataset:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Size (x₁)</th> | |
| <th style="padding: 8px;">Bedrooms (x₂)</th> | |
| <th style="padding: 8px;">Pool (x₃)</th> | |
| <th style="padding: 8px;">Age (x₄)</th> | |
| <th style="padding: 8px;">Price (y)</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">1000</td> | |
| <td style="text-align: center;">2</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">15</td> | |
| <td style="text-align: center;">₹50L</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">1500</td> | |
| <td style="text-align: center;">3</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">5</td> | |
| <td style="text-align: center;">₹75L</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">800</td> | |
| <td style="text-align: center;">2</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">20</td> | |
| <td style="text-align: center;">₹40L</td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Linear Regression WITHOUT | |
| Regularization</strong><br><br> | |
| <strong>Model:</strong> ŷ = θ₀ + θ₁·Size + θ₂·Bedrooms + θ₃·Pool + θ₄·Age<br><br> | |
| <strong>Training Result (overfitted):</strong><br> | |
| θ₀ = 5.0<br> | |
| θ₁ = <strong style="color: #7ef0d4;">0.035</strong> (Size - IMPORTANT!)<br> | |
| θ₂ = <strong style="color: #ff8c6a;">8.2</strong> (Bedrooms - inflated)<br> | |
| θ₃ = <strong style="color: #ff8c6a;">0.3</strong> (Pool - likely noise)<br> | |
| θ₄ = <strong style="color: #ff8c6a;">-0.1</strong> (Age - weak signal)<br><br> | |
| <strong>Cost (MSE) = 2.5</strong> (good fit but overfitted!) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Add L1 Penalty (λ = 1.0)</strong><br><br> | |
| <strong>New Cost Function:</strong><br> | |
| Cost = MSE + λ × (|θ₁| + |θ₂| + |θ₃| + |θ₄|)<br> | |
| Cost = MSE + 1.0 × (|θ₁| + |θ₂| + |θ₃| + |θ₄|)<br><br> | |
| <strong>Before regularization:</strong><br> | |
| MSE = 2.5<br> | |
| L1 Penalty = 1.0 × (|0.035| + |8.2| + |0.3| + |-0.1|)<br> | |
| L1 Penalty = 1.0 × (0.035 + 8.2 + 0.3 + 0.1) = <strong | |
| style="color: #ff8c6a;">8.635</strong><br> | |
| <strong style="color: #ff8c6a;">Total Cost = 2.5 + 8.635 = 11.135</strong> ❌ Too high! | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Optimization - Shrinking | |
| Coefficients</strong><br><br> | |
| <strong>Gradient Descent Update (simplified):</strong><br> | |
| θⱼ = θⱼ - α × (∂MSE/∂θⱼ + λ × sign(θⱼ))<br><br> | |
| <strong>Key insight:</strong> L1 penalty adds constant <strong style="color: #7ef0d4;">λ × | |
| sign(θⱼ)</strong><br> | |
| → Pushes small coefficients ALL THE WAY to zero!<br><br> | |
| <strong>After L1 optimization (λ = 1.0):</strong><br> | |
| θ₁ = <strong style="color: #7ef0d4;">0.034</strong> (Size - kept, slightly reduced)<br> | |
| θ₂ = <strong style="color: #7ef0d4;">6.5</strong> (Bedrooms - reduced significantly)<br> | |
| θ₃ = <strong style="color: #7ef0d4;">0.0</strong> ← ELIMINATED! (Pool was noise)<br> | |
| θ₄ = <strong style="color: #7ef0d4;">0.0</strong> ← ELIMINATED! (Age was weak)<br><br> | |
| <strong>New costs:</strong><br> | |
| MSE = 2.8 (slightly worse fit)<br> | |
| L1 Penalty = 1.0 × (0.034 + 6.5 + 0 + 0) = 6.534<br> | |
| <strong style="color: #7ef0d4;">Total Cost = 2.8 + 6.534 = 9.334</strong> ✓ BETTER! | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Why L1 Creates Exactly Zero?</strong><br><br> | |
| <strong>Geometric Interpretation:</strong><br> | |
| • L1 constraint: |θ₁| + |θ₂| ≤ budget<br> | |
| • This forms a DIAMOND shape in 2D (sharp corners!)<br> | |
| • MSE contours are ellipses<br> | |
| • Solution touches diamond at CORNERS (where θ₁ or θ₂ = 0)<br><br> | |
| <strong>Numerical example for θ₃ (Pool coefficient):</strong><br> | |
| Original: θ₃ = 0.3<br> | |
| L1 gradient contribution: λ × sign(0.3) = 1.0 × (+1) = 1.0<br> | |
| MSE gradient contribution: ≈ 0.2 (weak)<br><br> | |
| L1 force (1.0) > MSE force (0.2)<br> | |
| → θ₃ gets pushed to 0 and STAYS there! ✓ | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Prediction Comparison</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">New House: 1200 sq ft, 3 bed, Pool, 10 years | |
| old</strong><br><br> | |
| <strong>Without Regularization:</strong><br> | |
| ŷ = 5.0 + 0.035(1200) + 8.2(3) + 0.3(1) + (-0.1)(10)<br> | |
| ŷ = 5.0 + 42 + 24.6 + 0.3 - 1.0<br> | |
| ŷ = <strong style="color: #ff8c6a;">₹70.9L</strong> (uses all features, may be | |
| overfitted)<br><br> | |
| <strong>With L1 Regularization (λ=1.0):</strong><br> | |
| ŷ = 5.0 + 0.034(1200) + 6.5(3) + <strong style="color: #7ef0d4;">0(1)</strong> + <strong | |
| style="color: #7ef0d4;">0(10)</strong><br> | |
| ŷ = 5.0 + 40.8 + 19.5 + 0 + 0<br> | |
| ŷ = <strong style="color: #7ef0d4;">₹65.3L</strong> ✓ (simpler, more generalizable!) | |
| </div> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ L1 Regularization Summary</div> | |
| <div class="callout-content"> | |
| <strong>The Magic of L1:</strong><br> | |
| 1. Adds <strong>|θ₁| + |θ₂| + ...</strong> to cost function<br> | |
| 2. Creates constant gradient: <strong>λ × sign(θⱼ)</strong><br> | |
| 3. Small coefficients get pushed ALL THE WAY to zero<br> | |
| 4. Result: <strong>Automatic feature selection!</strong><br> | |
| 5. Only important features survive<br><br> | |
| <strong style="color: #7ef0d4;">Perfect for high-dimensional data with irrelevant | |
| features!</strong> | |
| </div> | |
| </div> | |
| </div> | |
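| <p>Here's a minimal scikit-learn sketch of the same idea. The tiny dataset mirrors the table above, | |
| alpha plays the role of λ, and the fitted coefficients you get will differ from the hypothetical | |
| numbers in the walkthrough:</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.linear_model import Lasso | |
| from sklearn.preprocessing import StandardScaler | |
| # Toy data in the spirit of the walkthrough: Size, Bedrooms, Pool, Age -> Price (lakhs) | |
| X = np.array([[1000, 2, 0, 15], | |
|               [1500, 3, 1, 5], | |
|               [800, 2, 0, 20], | |
|               [1200, 3, 1, 10]]) | |
| y = np.array([50, 75, 40, 62]) | |
| # Scale features so the penalty treats them comparably | |
| X_scaled = StandardScaler().fit_transform(X) | |
| # alpha is scikit-learn's name for the regularization strength λ | |
| lasso = Lasso(alpha=1.0).fit(X_scaled, y) | |
| print(lasso.coef_)  # weak features land at exactly 0.0 (tune alpha to control sparsity)</pre> | |
| </div> | |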
| <h3>L2 Regularization (Ridge)</h3> | |
| <div class="formula"> | |
| <strong>L2 Penalty:</strong> | |
| Cost = MSE + λ × Σθᵢ² | |
| <br><small>Sum of squared coefficients</small> | |
| </div> | |
| <h4>L2 Effects:</h4> | |
| <ul> | |
| <li><strong>Shrinks coefficients:</strong> Makes them smaller, not zero</li> | |
| <li><strong>Keeps all features:</strong> No automatic selection</li> | |
| <li><strong>Smooth predictions:</strong> Less sensitive to individual features</li> | |
| <li><strong>Use when:</strong> Many correlated features (multicollinearity)</li> | |
| </ul> | |
| <!-- COMPREHENSIVE L2 MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(106, 169, 255, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #6aa9ff; margin-top: 32px;"> | |
| <h3 style="color: #6aa9ff; margin-bottom: 20px;">📐 L2 Regularization: Complete Mathematical | |
| Walkthrough</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's see how L2 shrinks ALL coefficients | |
| smoothly!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Same House Price Dataset (with | |
| multicollinearity)</strong><br><br> | |
| <strong>Dataset:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Size (x₁)</th> | |
| <th style="padding: 8px;">Rooms (x₂)</th> | |
| <th style="padding: 8px;">Sqft/Room (x₃)</th> | |
| <th style="padding: 8px;">Location (x₄)</th> | |
| <th style="padding: 8px;">Price (y)</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">1000</td> | |
| <td style="text-align: center;">5</td> | |
| <td style="text-align: center;">200</td> | |
| <td style="text-align: center;">8</td> | |
| <td style="text-align: center;">₹50L</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">1500</td> | |
| <td style="text-align: center;">6</td> | |
| <td style="text-align: center;">250</td> | |
| <td style="text-align: center;">9</td> | |
| <td style="text-align: center;">₹75L</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">800</td> | |
| <td style="text-align: center;">4</td> | |
| <td style="text-align: center;">200</td> | |
| <td style="text-align: center;">6</td> | |
| <td style="text-align: center;">₹40L</td> | |
| </tr> | |
| </table> | |
| <em style="color: #ff8c6a;">Note: x₁ and x₃ are highly correlated! (Size ≈ Rooms × | |
| Sqft/Room)</em> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Linear Regression WITHOUT | |
| Regularization</strong><br><br> | |
| <strong>Model:</strong> ŷ = θ₀ + θ₁·Size + θ₂·Rooms + θ₃·Sqft/Room + θ₄·Location<br><br> | |
| <strong>Training Result (unstable due to multicollinearity):</strong><br> | |
| θ₀ = 2.0<br> | |
| θ₁ = <strong style="color: #ff8c6a;">0.055</strong> (Size - inflated)<br> | |
| θ₂ = <strong style="color: #ff8c6a;">12.5</strong> (Rooms - VERY inflated)<br> | |
| θ₃ = <strong style="color: #ff8c6a;">-0.048</strong> (Sqft/Room - wrong sign!)<br> | |
| θ₄ = <strong style="color: #7ef0d4;">2.8</strong> (Location - reasonable)<br><br> | |
| <strong>Problem:</strong> Coefficients compensate for each other<br> | |
| → Unstable, sensitive to small data changes<br> | |
| <strong>Cost (MSE) = 1.8</strong> (low training error but poor generalization) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Add L2 Penalty (λ = 1.0)</strong><br><br> | |
| <strong>New Cost Function:</strong><br> | |
| Cost = MSE + λ × (θ₁² + θ₂² + θ₃² + θ₄²)<br> | |
| Cost = MSE + 1.0 × (θ₁² + θ₂² + θ₃² + θ₄²)<br><br> | |
| <strong>Before regularization:</strong><br> | |
| MSE = 1.8<br> | |
| L2 Penalty = 1.0 × (0.055² + 12.5² + (-0.048)² + 2.8²)<br> | |
| L2 Penalty = 1.0 × (0.003 + 156.25 + 0.0023 + 7.84)<br> | |
| L2 Penalty = <strong style="color: #ff8c6a;">164.095</strong><br> | |
| <strong style="color: #ff8c6a;">Total Cost = 1.8 + 164.095 = 165.895</strong> ❌ Huge | |
| penalty! | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Optimization - Proportional | |
| Shrinkage</strong><br><br> | |
| <strong>Gradient Descent Update:</strong><br> | |
| θⱼ = θⱼ - α × (∂MSE/∂θⱼ + <strong style="color: #7ef0d4;">2λθⱼ</strong>)<br><br> | |
| <strong>Key insight:</strong> L2 penalty adds <strong style="color: #7ef0d4;">2λθⱼ</strong> | |
| (proportional to θⱼ!)<br> | |
| → Large coefficients shrink MORE<br> | |
| → Small coefficients shrink LESS<br> | |
| → None go exactly to zero!<br><br> | |
| <strong>After L2 optimization (λ = 1.0):</strong><br> | |
| θ₁ = <strong style="color: #7ef0d4;">0.042</strong> (Size - reduced 24%)<br> | |
| θ₂ = <strong style="color: #7ef0d4;">7.8</strong> (Rooms - reduced 38%! was largest)<br> | |
| θ₃ = <strong style="color: #7ef0d4;">-0.035</strong> (Sqft/Room - reduced 27%)<br> | |
| θ₄ = <strong style="color: #7ef0d4;">2.3</strong> (Location - reduced 18%)<br><br> | |
| <strong>New costs:</strong><br> | |
| MSE = 2.1 (slightly worse fit, acceptable)<br> | |
| L2 Penalty = 1.0 × (0.042² + 7.8² + 0.035² + 2.3²)<br> | |
| L2 Penalty = 1.0 × (0.0018 + 60.84 + 0.0012 + 5.29) = 66.13<br> | |
| <strong style="color: #7ef0d4;">Total Cost = 2.1 + 66.13 = 68.23</strong> ✓ MUCH BETTER! | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Why L2 NEVER Creates Exact Zero?</strong><br><br> | |
| <strong>Mathematical Proof:</strong><br> | |
| Gradient contribution from L2: 2λθⱼ<br><br> | |
| As θⱼ → 0, the L2 gradient → 0 too!<br> | |
| → Shrinkage force weakens near zero<br> | |
| → Coefficient asymptotically approaches zero but never reaches it<br><br> | |
| <strong>Numerical example for θ₂ (Rooms coefficient):</strong><br> | |
| Iteration 1: θ₂ = 12.5 → L2 gradient = 2(1)(12.5) = <strong>25.0</strong> (huge!)<br> | |
| Iteration 50: θ₂ = 8.2 → L2 gradient = 2(1)(8.2) = <strong>16.4</strong> (large)<br> | |
| Iteration 100: θ₂ = 7.8 → L2 gradient = 2(1)(7.8) = <strong>15.6</strong> (moderate)<br> | |
| Iteration 1000: θ₂ = 7.8 → L2 gradient = 2(1)(7.8) = <strong>15.6</strong> ✓ | |
| Converged (the MSE gradient now balances the L2 pull)!<br><br> | |
| <strong style="color: #7ef0d4;">θ₂ never reaches 0, just gets smaller!</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 5: Geometric Interpretation</strong><br><br> | |
| <strong>L2 Constraint:</strong> θ₁² + θ₂² ≤ budget<br> | |
| • This forms a CIRCLE (smooth, no corners!)<br> | |
| • MSE contours are ellipses<br> | |
| • Solution touches circle tangentially<br> | |
| • Circle has NO sharp corners → unlikely to hit axes (θ = 0)<br><br> | |
| <strong>vs L1 (diamond with corners):</strong><br> | |
| L1: Diamond corners → solution hits axes → zeros<br> | |
| L2: Smooth circle → solution anywhere on circle → no zeros | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Handling Multicollinearity</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Why L2 is Perfect for Correlated | |
| Features</strong><br><br> | |
| <strong>Without L2 (multicollinearity problem):</strong><br> | |
| Size and Sqft/Room are correlated:<br> | |
| θ₁ = 0.055, θ₃ = -0.048 (compensating!)<br> | |
| Model equation: 0.055·Size - 0.048·Sqft/Room<br> | |
| → Unstable! Small data change → huge coefficient change<br><br> | |
| <strong>With L2 (λ=1.0):</strong><br> | |
| θ₁ = 0.042, θ₃ = -0.035<br> | |
| Both shrunk proportionally → more stable!<br> | |
| Model: 0.042·Size - 0.035·Sqft/Room<br> | |
| → Even if data changes slightly, coefficients stay reasonable ✓ | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Prediction Comparison</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">New House: 1200 sq ft, 6 rooms, 200 sqft/room, | |
| location=8</strong><br><br> | |
| <strong>Without Regularization:</strong><br> | |
| ŷ = 2.0 + 0.055(1200) + 12.5(6) - 0.048(200) + 2.8(8)<br> | |
| ŷ = 2.0 + 66 + 75 - 9.6 + 22.4<br> | |
| ŷ = <strong style="color: #ff8c6a;">₹155.8L</strong> (wildly inflated! unstable | |
| coefficients)<br><br> | |
| <strong>With L2 Regularization (λ=1.0):</strong><br> | |
| ŷ = 2.0 + 0.042(1200) + 7.8(6) - 0.035(200) + 2.3(8)<br> | |
| ŷ = 2.0 + 50.4 + 46.8 - 7.0 + 18.4<br> | |
| ŷ = <strong style="color: #7ef0d4;">₹110.6L</strong> ✓ (more realistic, stable!) | |
| </div> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 6: Closed-Form Solution (Ridge Regression | |
| Formula)</strong><br><br> | |
| <strong>Amazing fact:</strong> ridge regression has an exact closed-form solution!<br><br> | |
| <strong>Normal Equation (no regularization):</strong><br> | |
| θ = (X<sup>T</sup>X)<sup>-1</sup>X<sup>T</sup>y<br><br> | |
| <strong>Ridge Regression (with L2):</strong><br> | |
| θ = (X<sup>T</sup>X + <strong | |
| style="color: #7ef0d4;">λI</strong>)<sup>-1</sup>X<sup>T</sup>y<br><br> | |
| where I is the identity matrix<br> | |
| <strong style="color: #7ef0d4;">The λI term stabilizes X<sup>T</sup>X!</strong><br><br> | |
| <strong>Benefit:</strong> Even if X<sup>T</sup>X is singular (non-invertible),<br> | |
| X<sup>T</sup>X + λI becomes invertible! ✓ | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ L2 Regularization Summary</div> | |
| <div class="callout-content"> | |
| <strong>The Magic of L2:</strong><br> | |
| 1. Adds <strong>θ₁² + θ₂² + ...</strong> to cost function<br> | |
| 2. Creates proportional gradient: <strong>2λθⱼ</strong><br> | |
| 3. Large coefficients shrink MORE, small shrink LESS<br> | |
| 4. NO coefficients go exactly to zero<br> | |
| 5. <strong>Handles multicollinearity beautifully!</strong><br> | |
| 6. Has closed-form solution!<br><br> | |
| <strong style="color: #7ef0d4;">Perfect when all features are potentially useful and | |
| correlated!</strong> | |
| </div> | |
| </div> | |
| </div> | |
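| <p>The same comparison in code: a plain least-squares fit versus ridge on correlated features. The | |
| toy data follows the table above; the coefficients you get will differ from the hypothetical ones in | |
| the walkthrough:</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.linear_model import LinearRegression, Ridge | |
| from sklearn.preprocessing import StandardScaler | |
| # Toy data with correlated features (Size ≈ Rooms × Sqft/Room) | |
| X = np.array([[1000, 5, 200, 8], | |
|               [1500, 6, 250, 9], | |
|               [800, 4, 200, 6], | |
|               [1200, 6, 200, 8]]) | |
| y = np.array([50, 75, 40, 62]) | |
| X_scaled = StandardScaler().fit_transform(X) | |
| ols = LinearRegression().fit(X_scaled, y) | |
| ridge = Ridge(alpha=1.0).fit(X_scaled, y)  # alpha = λ | |
| print(ols.coef_)    # can be large and unstable under multicollinearity | |
| print(ridge.coef_)  # shrunk toward zero, but none exactly zero</pre> | |
| </div> | |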
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="regularization-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Comparing vanilla, L1, and L2 regularization | |
| effects</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Lambda (λ): <span id="reg-lambda-val">0.1</span></label> | |
| <input type="range" id="reg-lambda-slider" min="0" max="2" step="0.1" value="0.1"> | |
| </div> | |
| </div> | |
| <h3>The Lambda (λ) Parameter</h3> | |
| <ul> | |
| <li><strong>λ = 0:</strong> No regularization (original model, risk of overfitting)</li> | |
| <li><strong>Small λ (0.01):</strong> Weak penalty, slight regularization</li> | |
| <li><strong>Medium λ (1):</strong> Balanced, good generalization</li> | |
| <li><strong>Large λ (100):</strong> Strong penalty, risk of underfitting</li> | |
| </ul> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 L1 vs L2: Quick Guide</div> | |
| <div class="callout-content"> | |
| <strong>Use L1 when:</strong><br> | |
| • You suspect many features are irrelevant<br> | |
| • You want automatic feature selection<br> | |
| • You need interpretability<br> | |
| <br> | |
| <strong>Use L2 when:</strong><br> | |
| • All features might be useful<br> | |
| • Features are highly correlated<br> | |
| • You want smooth, stable predictions<br> | |
| <br> | |
| <strong>Elastic Net:</strong> Combines both L1 and L2 (see the sketch below)! | |
| </div> | |
| </div> | |
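| <p>A quick Elastic Net sketch (the synthetic data and the alpha/l1_ratio values are arbitrary | |
| illustrative choices). The l1_ratio parameter slides between the two penalties:</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.linear_model import ElasticNet | |
| rng = np.random.RandomState(0) | |
| X = rng.randn(50, 10)                    # 10 synthetic features | |
| y = 3.0 * X[:, 0] + 0.5 * rng.randn(50)  # only feature 0 truly matters | |
| # l1_ratio=1.0 is pure L1 (Lasso), l1_ratio=0.0 is pure L2 (Ridge) | |
| enet = ElasticNet(alpha=0.5, l1_ratio=0.5).fit(X, y) | |
| print(enet.coef_)  # irrelevant features shrink, many to exactly 0</pre> | |
| </div> | |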
| <h3>Practical Example</h3> | |
| <p>Predicting house prices with 10 features (size, bedrooms, age, etc.):</p> | |
| <p><strong>Without regularization:</strong> All features have large, varying coefficients. Model | |
| overfits noise.</p> | |
| <p><strong>With L1:</strong> Only 4 features remain (size, location, bedrooms, age). Others set to | |
| 0. Simpler, more interpretable!</p> | |
| <p><strong>With L2:</strong> All features kept but coefficients shrunk. More stable predictions, | |
| handles correlated features well.</p> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaway</div> | |
| <div class="callout-content"> | |
| Regularization is like adding a "simplicity tax" to your model. Complex models pay more tax, | |
| encouraging simpler solutions that generalize better! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="bias-variance"> | |
| <div class="section-header"> | |
| <h2>9. Bias-Variance Tradeoff</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Every model makes two types of errors: bias and variance. The bias-variance tradeoff is the | |
| fundamental challenge in machine learning - we must balance them!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Bias = systematic error (underfitting)</li> | |
| <li>Variance = sensitivity to training data (overfitting)</li> | |
| <li>Can't minimize both simultaneously</li> | |
| <li>Goal: Find the sweet spot</li> | |
| </ul> | |
| </div> | |
| <h3>Understanding Bias</h3> | |
| <p><strong>Bias</strong> is the error from overly simplistic assumptions. High bias causes | |
| <strong>underfitting</strong>. | |
| </p> | |
| <h4>Characteristics of High Bias:</h4> | |
| <ul> | |
| <li>Model too simple for the problem</li> | |
| <li>High error on training data</li> | |
| <li>High error on test data</li> | |
| <li>Can't capture underlying patterns</li> | |
| <li>Example: Using a straight line for curved data</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">🎯 High Bias Example</div> | |
| <div class="callout-content"> | |
| Trying to fit a parabola with a straight line. No matter how much training data you have, a | |
| line can't capture the curve. That's bias! | |
| </div> | |
| </div> | |
| <h3>Understanding Variance</h3> | |
| <p><strong>Variance</strong> is the error from sensitivity to small fluctuations in training data. | |
| High variance causes <strong>overfitting</strong>.</p> | |
| <h4>Characteristics of High Variance:</h4> | |
| <ul> | |
| <li>Model too complex for the problem</li> | |
| <li>Very low error on training data</li> | |
| <li>High error on test data</li> | |
| <li>Captures noise as if it were pattern</li> | |
| <li>Example: Using 10th-degree polynomial for simple data</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">📊 High Variance Example</div> | |
| <div class="callout-content"> | |
| A wiggly curve that passes through every training point perfectly, including outliers. | |
| Change one data point and the entire curve changes dramatically. That's variance! | |
| </div> | |
| </div> | |
| <h3>The Tradeoff</h3> | |
| <div class="formula"> | |
| <strong>Total Error Decomposition:</strong> | |
| Total Error = Bias² + Variance + Irreducible Error | |
| <br><small>Irreducible error = noise in data (can't be eliminated)</small> | |
| </div> | |
| <p><strong>The tradeoff:</strong></p> | |
| <ul> | |
| <li>Decrease bias → Increase variance (more complex model)</li> | |
| <li>Decrease variance → Increase bias (simpler model)</li> | |
| <li>Goal: Minimize total error by balancing both</li> | |
| </ul> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="bias-variance-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Three models showing underfitting, good fit, | |
| and overfitting</p> | |
| </div> | |
| <h3>The Driving Test Analogy</h3> | |
| <p>Think of learning to drive:</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Driving Test Analogy</div> | |
| <ul style="list-style: none; padding: 0;"> | |
| <li | |
| style="padding: 12px; border: none; margin-bottom: 8px; background: rgba(255, 140, 106, 0.1); border-radius: 6px;"> | |
| <strong style="color: #ff8c6a;">High Bias (Underfitting):</strong><br> | |
| Failed practice tests, failed real test<br> | |
| → Can't learn to drive at all | |
| </li> | |
| <li | |
| style="padding: 12px; border: none; margin-bottom: 8px; background: rgba(126, 240, 212, 0.1); border-radius: 6px;"> | |
| <strong style="color: #7ef0d4;">Good Balance:</strong><br> | |
| Passed practice tests, passed real test<br> | |
| → Actually learned to drive! | |
| </li> | |
| <li | |
| style="padding: 12px; border: none; margin-bottom: 8px; background: rgba(255, 140, 106, 0.1); border-radius: 6px;"> | |
| <strong style="color: #ff8c6a;">High Variance (Overfitting):</strong><br> | |
| Perfect on practice tests, failed real test<br> | |
| → Memorized practice, didn't truly learn | |
| </li> | |
| </ul> | |
| </div> | |
| <h3>How to Find the Balance</h3> | |
| <h4>Reduce Bias (if underfitting):</h4> | |
| <ul> | |
| <li>Use more complex model (more features, higher degree polynomial)</li> | |
| <li>Add more features</li> | |
| <li>Reduce regularization</li> | |
| <li>Train longer (more iterations)</li> | |
| </ul> | |
| <h4>Reduce Variance (if overfitting):</h4> | |
| <ul> | |
| <li>Use simpler model (fewer features, lower degree)</li> | |
| <li>Get more training data</li> | |
| <li>Add regularization (L1, L2)</li> | |
| <li>Use cross-validation</li> | |
| <li>Feature selection or dimensionality reduction</li> | |
| </ul> | |
| <h3>Model Complexity Curve</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 350px"> | |
| <canvas id="complexity-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Error vs model complexity - find the sweet | |
| spot</p> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Detecting Bias vs Variance</div> | |
| <div class="callout-content"> | |
| <strong>High Bias:</strong><br> | |
| Training error: High 🔴<br> | |
| Test error: High 🔴<br> | |
| Gap: Small<br> | |
| <br> | |
| <strong>High Variance:</strong><br> | |
| Training error: Low 🟢<br> | |
| Test error: High 🔴<br> | |
| Gap: Large ⚠️<br> | |
| <br> | |
| <strong>Good Model:</strong><br> | |
| Training error: Low 🟢<br> | |
| Test error: Low 🟢<br> | |
| Gap: Small ✓ | |
| </div> | |
| </div> | |
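| <p>You can see this diagnostic in code by fitting polynomials of increasing degree to noisy data | |
| and comparing train vs test scores. Everything below (the sine-shaped data, the chosen degrees) is | |
| a made-up illustration:</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.preprocessing import PolynomialFeatures | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.model_selection import train_test_split | |
| rng = np.random.RandomState(42) | |
| X = np.sort(rng.uniform(-3, 3, 60)).reshape(-1, 1) | |
| y = np.sin(X).ravel() + rng.normal(0, 0.2, 60)  # noisy sine curve | |
| X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0) | |
| for degree in [1, 4, 15]:  # underfit, reasonable, overfit | |
|     model = make_pipeline(PolynomialFeatures(degree), LinearRegression()) | |
|     model.fit(X_tr, y_tr) | |
|     print(degree, round(model.score(X_tr, y_tr), 2), round(model.score(X_te, y_te), 2)) | |
| # degree 1: both scores low -> high bias | |
| # degree 15: train score high, test score far lower -> high variance</pre> | |
| </div> | |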
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaway</div> | |
| <div class="callout-content"> | |
| The bias-variance tradeoff is unavoidable. You can't have zero bias AND zero variance. The | |
| art of machine learning is finding the sweet spot where total error is minimized! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section: Perceptron (NEW) --> | |
| <div class="section" id="perceptron"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🧠 Neural | |
| Networks</span> The Perceptron</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>The Perceptron is the simplest neural network - just one neuron! It's the building block of all | |
| deep learning and was invented by Frank Rosenblatt in 1958. Understanding it is key to understanding | |
| neural networks.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Single artificial neuron</li> | |
| <li>Takes multiple inputs, produces one output</li> | |
| <li>Uses weights to determine importance of inputs</li> | |
| <li>Applies activation function to make decision</li> | |
| </ul> | |
| </div> | |
| <h3>How a Perceptron Works</h3> | |
| <div class="formula"> | |
| <strong>1. Weighted Sum:</strong> z = w₁x₁ + w₂x₂ + ... + wₙxₙ + b<br><br> | |
| <strong>2. Activation:</strong> output = activation(z)<br><br> | |
| <strong>Step Function (Original):</strong> output = 1 if z > 0, else 0<br> | |
| <strong>Sigmoid (Modern):</strong> output = 1/(1 + e⁻ᶻ) | |
| </div> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(255, 140, 106, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #ff8c6a; margin-top: 32px;"> | |
| <h3 style="color: #ff8c6a; margin-bottom: 20px;">📐 Complete Mathematical Derivation: Perceptron | |
| </h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's build a simple AND gate with a perceptron! | |
| </p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Learn the AND logic gate</strong><br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">x₁</th> | |
| <th style="padding: 8px;">x₂</th> | |
| <th style="padding: 8px;">AND Output</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #ff8c6a;">0</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center; color: #ff8c6a;">0</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #ff8c6a;">0</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center; color: #7ef0d4;">1</td> | |
| </tr> | |
| </table> | |
| <strong>Given weights:</strong> w₁ = 0.5, w₂ = 0.5, b = -0.7 | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Compute Weighted Sum for Each Input</strong><br><br> | |
| <strong>Formula:</strong> z = w₁x₁ + w₂x₂ + b<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">x₁</th> | |
| <th style="padding: 8px;">x₂</th> | |
| <th style="padding: 8px;">z = 0.5x₁ + 0.5x₂ - 0.7</th> | |
| <th style="padding: 8px;">z value</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">0.5(0) + 0.5(0) - 0.7</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-0.7</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">0.5(0) + 0.5(1) - 0.7</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-0.2</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">0.5(1) + 0.5(0) - 0.7</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-0.2</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">0.5(1) + 0.5(1) - 0.7</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>+0.3</strong></td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Apply Step Activation Function</strong><br><br> | |
| <strong>Step Function:</strong> output = 1 if z > 0, else 0<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">x₁</th> | |
| <th style="padding: 8px;">x₂</th> | |
| <th style="padding: 8px;">z</th> | |
| <th style="padding: 8px;">z > 0?</th> | |
| <th style="padding: 8px;">Output</th> | |
| <th style="padding: 8px;">Expected</th> | |
| <th style="padding: 8px;">Match?</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">-0.7</td> | |
| <td style="text-align: center;">No</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">0</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">-0.2</td> | |
| <td style="text-align: center;">No</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">-0.2</td> | |
| <td style="text-align: center;">No</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center;">0</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">1</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">+0.3</td> | |
| <td style="text-align: center;">Yes</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center;">1</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓</strong></td> | |
| </tr> | |
| </table> | |
| <strong style="color: #7ef0d4; font-size: 18px;">🎉 The perceptron perfectly learns the AND | |
| gate!</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Perceptron Learning Rule (How to Find | |
| Weights)</strong><br><br> | |
| <strong>Update Rule:</strong> w_new = w_old + α × (target - output) × input<br><br> | |
| Where α = learning rate (e.g., 0.1)<br><br> | |
| <strong>Example update:</strong><br> | |
| If prediction was 0 but target was 1 (error = 1):<br> | |
| w₁_new = 0.5 + 0.1 × (1 - 0) × 1 = 0.5 + 0.1 = 0.6<br><br> | |
| <em style="color: #a9b4c2;">Weights increase for inputs that should have been positive!</em> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Perceptron Summary</div> | |
| <div class="callout-content"> | |
| <strong>The Perceptron Algorithm:</strong><br> | |
| 1. Initialize weights randomly<br> | |
| 2. For each training example: compute z = Σ(wᵢxᵢ) + b<br> | |
| 3. Apply activation: output = step(z)<br> | |
| 4. Update weights if wrong: w += α × error × input<br> | |
| 5. Repeat until all examples correct (or max iterations) | |
| </div> | |
| </div> | |
| </div> | |
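| <p>The whole algorithm fits in a few lines of Python. This sketch trains the AND gate from scratch | |
| with the learning rule above (zero initial weights, α = 0.1, and 20 epochs are arbitrary | |
| choices):</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # AND gate inputs | |
| y = np.array([0, 0, 0, 1])                      # AND gate targets | |
| w, b, alpha = np.zeros(2), 0.0, 0.1 | |
| for epoch in range(20): | |
|     for xi, target in zip(X, y): | |
|         output = 1 if np.dot(w, xi) + b > 0 else 0  # step activation | |
|         error = target - output | |
|         w += alpha * error * xi  # perceptron learning rule | |
|         b += alpha * error | |
| print(w, b)  # one separating solution, e.g. w=[0.2, 0.1], b=-0.2 | |
| for xi in X: | |
|     print(xi, int(np.dot(w, xi) + b > 0))  # reproduces the AND column</pre> | |
| </div> | |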
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Perceptron Limitation</div> | |
| <div class="callout-content"> | |
| A single perceptron can only learn <strong>linearly separable</strong> patterns. It CANNOT | |
| learn XOR! | |
| This is why we need multi-layer networks (next section). | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section: Neural Networks MLP (NEW) --> | |
| <div class="section" id="neural-networks"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🧠 Neural | |
| Networks</span> Multi-Layer Perceptron (MLP)</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>A Multi-Layer Perceptron (MLP) stacks multiple layers of neurons to learn complex, non-linear | |
| patterns. This is the foundation of deep learning!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Network Architecture</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Input Layer:</strong> Receives features (one neuron per feature)</li> | |
| <li><strong>Hidden Layer(s):</strong> Learn abstract representations</li> | |
| <li><strong>Output Layer:</strong> Produces final prediction</li> | |
| <li><strong>Weights:</strong> Connect neurons between layers</li> | |
| </ul> | |
| </div> | |
| <h3>Activation Functions</h3> | |
| <div class="formula"> | |
| <strong>Sigmoid:</strong> σ(z) = 1/(1 + e⁻ᶻ) → output (0, 1)<br> | |
| <strong>ReLU:</strong> f(z) = max(0, z) → output [0, ∞)<br> | |
| <strong>Tanh:</strong> tanh(z) = (eᶻ - e⁻ᶻ)/(eᶻ + e⁻ᶻ) → output (-1, 1)<br> | |
| <strong>Softmax:</strong> For multi-class classification | |
| </div> | |
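| <p>All four are one-liners in NumPy. A quick sketch (softmax shown with the usual max-subtraction | |
| trick for numerical stability):</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| def sigmoid(z): return 1.0 / (1.0 + np.exp(-z)) | |
| def relu(z):    return np.maximum(0.0, z) | |
| def softmax(z): | |
|     e = np.exp(z - np.max(z))  # subtract max for numerical stability | |
|     return e / e.sum() | |
| z = np.array([-2.0, 0.0, 2.0]) | |
| print(sigmoid(z))  # squashed into (0, 1) | |
| print(relu(z))     # [0. 0. 2.] | |
| print(np.tanh(z))  # squashed into (-1, 1) | |
| print(softmax(z))  # probabilities summing to 1</pre> | |
| </div> | |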
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(126, 240, 212, 0.1), rgba(255, 140, 106, 0.1)); border: 2px solid #7ef0d4; margin-top: 32px;"> | |
| <h3 style="color: #7ef0d4; margin-bottom: 20px;">📐 Complete Mathematical Derivation: Forward | |
| Propagation</h3> | |
| <p style="color: #ff8c6a; font-weight: bold;">Let's trace through a small neural network | |
| step-by-step!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Network Architecture: 2 → 2 → 1</strong><br><br> | |
| • Input layer: 2 neurons (x₁, x₂)<br> | |
| • Hidden layer: 2 neurons (h₁, h₂)<br> | |
| • Output layer: 1 neuron (ŷ)<br><br> | |
| <strong>Given Weights:</strong><br> | |
| W₁ (input→hidden): [[0.1, 0.3], [0.2, 0.4]]<br> | |
| b₁ (hidden bias): [0.1, 0.1]<br> | |
| W₂ (hidden→output): [[0.5], [0.6]]<br> | |
| b₂ (output bias): [0.2] | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Forward Pass - Input to Hidden | |
| Layer</strong><br><br> | |
| <strong>Input:</strong> x = [1.0, 2.0]<br><br> | |
| <strong>Hidden neuron h₁:</strong><br> | |
| z₁ = w₁₁×x₁ + w₁₂×x₂ + b₁<br> | |
| z₁ = 0.1×1.0 + 0.2×2.0 + 0.1<br> | |
| z₁ = 0.1 + 0.4 + 0.1 = <strong style="color: #7ef0d4;">0.6</strong><br> | |
| h₁ = sigmoid(0.6) = 1/(1 + e⁻⁰·⁶) = <strong style="color: #7ef0d4;">0.646</strong><br><br> | |
| <strong>Hidden neuron h₂:</strong><br> | |
| z₂ = w₂₁×x₁ + w₂₂×x₂ + 0.1 (the second element of the bias vector b₁)<br> | |
| z₂ = 0.3×1.0 + 0.4×2.0 + 0.1<br> | |
| z₂ = 0.3 + 0.8 + 0.1 = <strong style="color: #7ef0d4;">1.2</strong><br> | |
| h₂ = sigmoid(1.2) = 1/(1 + e⁻¹·²) = <strong style="color: #7ef0d4;">0.769</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Forward Pass - Hidden to Output | |
| Layer</strong><br><br> | |
| <strong>Hidden layer output:</strong> h = [0.646, 0.769]<br><br> | |
| <strong>Output neuron:</strong><br> | |
| z_out = w₁×h₁ + w₂×h₂ + b<br> | |
| z_out = 0.5×0.646 + 0.6×0.769 + 0.2<br> | |
| z_out = 0.323 + 0.461 + 0.2 = <strong style="color: #7ef0d4;">0.984</strong><br><br> | |
| ŷ = sigmoid(0.984) = 1/(1 + e⁻⁰·⁹⁸⁴)<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">ŷ = 0.728 (Final Prediction!)</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Calculate Loss</strong><br><br> | |
| <strong>Binary Cross-Entropy Loss:</strong><br> | |
| L = -[y×log(ŷ) + (1-y)×log(1-ŷ)]<br><br> | |
| If true label y = 1:<br> | |
| L = -[1×log(0.728) + 0×log(0.272)]<br> | |
| L = -log(0.728)<br> | |
| <strong style="color: #7ef0d4;">L = 0.317 (Loss value)</strong><br><br> | |
| <em style="color: #a9b4c2;">Lower loss = better prediction!</em> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Backpropagation (Gradient | |
| Calculation)</strong><br><br> | |
| <strong>Chain Rule:</strong> ∂L/∂w = ∂L/∂ŷ × ∂ŷ/∂z × ∂z/∂w<br><br> | |
| <strong>Output layer gradient:</strong><br> | |
| ∂L/∂ŷ = -(y/ŷ) + (1-y)/(1-ŷ); multiplied by ∂ŷ/∂z = ŷ(1-ŷ), this simplifies to ∂L/∂z = ŷ - y<br> | |
| δ_output = 0.728 - 1 = <strong style="color: #ff8c6a;">-0.272</strong><br><br> | |
| <strong>Hidden layer gradient:</strong><br> | |
| δ_hidden = δ_output × W₂ × h × (1-h)<br><br> | |
| <em style="color: #a9b4c2;">Gradients flow backward to update all weights!</em> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Neural Network Training Summary</div> | |
| <div class="callout-content"> | |
| <strong>The Full Training Loop:</strong><br> | |
| 1. <strong>Forward Pass:</strong> Input → Hidden → Output (calculate prediction)<br> | |
| 2. <strong>Loss Calculation:</strong> Compare prediction to true value<br> | |
| 3. <strong>Backward Pass:</strong> Calculate gradients using chain rule<br> | |
| 4. <strong>Update Weights:</strong> w = w - α × gradient<br> | |
| 5. <strong>Repeat</strong> for many epochs until loss minimizes! | |
| </div> | |
| </div> | |
| </div> | |
| <!-- DETAILED BACKPROPAGATION SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(255, 140, 106, 0.15), rgba(106, 169, 255, 0.15)); border: 2px solid #ff8c6a; margin-top: 32px;"> | |
| <h3 style="color: #ff8c6a; margin-bottom: 20px;">📐 Complete Backpropagation Derivation | |
| (Line-by-Line)</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's derive backpropagation step-by-step using | |
| the network from the forward pass example!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Recap: Network Architecture & Forward Pass | |
| Results</strong><br><br> | |
| <strong>Network:</strong> 2 inputs → 2 hidden → 1 output<br> | |
| <strong>Input:</strong> x = [1.0, 2.0], True label: y = 1<br><br> | |
| <strong>Forward Pass Results:</strong><br> | |
| • Hidden layer: h₁ = 0.646, h₂ = 0.769<br> | |
| • Output: ŷ = 0.728<br> | |
| • Loss: L = 0.317 | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Output Layer Error (δ_output)</strong><br><br> | |
| <strong>Goal:</strong> Calculate ∂L/∂z_out (gradient of loss w.r.t. output before | |
| activation)<br><br> | |
| <strong>Using Chain Rule:</strong><br> | |
| δ_output = ∂L/∂z_out = ∂L/∂ŷ × ∂ŷ/∂z_out<br><br> | |
| <strong>For Binary Cross-Entropy + Sigmoid, this simplifies to:</strong><br> | |
| δ_output = ŷ - y<br> | |
| δ_output = 0.728 - 1<br> | |
| <strong style="color: #7ef0d4;">δ_output = -0.272</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Gradients for Hidden→Output Weights | |
| (W₂)</strong><br><br> | |
| <strong>Formula:</strong> ∂L/∂W₂ = δ_output × h (hidden layer output)<br><br> | |
| <strong>Calculation:</strong><br> | |
| ∂L/∂w₁(h→o) = δ_output × h₁ = -0.272 × 0.646 = <strong | |
| style="color: #7ef0d4;">-0.176</strong><br> | |
| ∂L/∂w₂(h→o) = δ_output × h₂ = -0.272 × 0.769 = <strong | |
| style="color: #7ef0d4;">-0.209</strong><br><br> | |
| <strong>Bias gradient:</strong><br> | |
| ∂L/∂b₂ = δ_output = <strong style="color: #7ef0d4;">-0.272</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Backpropagate Error to Hidden Layer | |
| (δ_hidden)</strong><br><br> | |
| <strong>The Key Insight:</strong> Hidden neurons contributed to output error based on their | |
| weights!<br><br> | |
| <strong>Formula:</strong> δ_hidden = (W₂ᵀ × δ_output) ⊙ σ'(z_hidden)<br><br> | |
| <strong>Sigmoid derivative:</strong> σ'(z) = σ(z) × (1 - σ(z)) = h × (1 - h)<br><br> | |
| <strong>For hidden neuron h₁:</strong><br> | |
| σ'(z₁) = h₁ × (1 - h₁) = 0.646 × (1 - 0.646) = 0.646 × 0.354 = <strong>0.229</strong><br> | |
| δ₁ = w₁(h→o) × δ_output × σ'(z₁)<br> | |
| δ₁ = 0.5 × (-0.272) × 0.229 = <strong style="color: #7ef0d4;">-0.031</strong><br><br> | |
| <strong>For hidden neuron h₂:</strong><br> | |
| σ'(z₂) = h₂ × (1 - h₂) = 0.769 × (1 - 0.769) = 0.769 × 0.231 = <strong>0.178</strong><br> | |
| δ₂ = w₂(h→o) × δ_output × σ'(z₂)<br> | |
| δ₂ = 0.6 × (-0.272) × 0.178 = <strong style="color: #7ef0d4;">-0.029</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Gradients for Input→Hidden Weights | |
| (W₁)</strong><br><br> | |
| <strong>Formula:</strong> ∂L/∂W₁ = δ_hidden × x (input)<br><br> | |
| <strong>Input:</strong> x = [1.0, 2.0]<br><br> | |
| <strong>Gradients for weights to h₁:</strong><br> | |
| ∂L/∂w₁₁ = δ₁ × x₁ = -0.031 × 1.0 = <strong style="color: #7ef0d4;">-0.031</strong><br> | |
| ∂L/∂w₁₂ = δ₁ × x₂ = -0.031 × 2.0 = <strong style="color: #7ef0d4;">-0.062</strong><br><br> | |
| <strong>Gradients for weights to h₂:</strong><br> | |
| ∂L/∂w₂₁ = δ₂ × x₁ = -0.029 × 1.0 = <strong style="color: #7ef0d4;">-0.029</strong><br> | |
| ∂L/∂w₂₂ = δ₂ × x₂ = -0.029 × 2.0 = <strong style="color: #7ef0d4;">-0.058</strong><br><br> | |
| <strong>Bias gradients (hidden layer):</strong><br> | |
| ∂L/∂b₁ = [δ₁, δ₂] = [-0.031, -0.029] | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 5: Update All Weights</strong><br><br> | |
| <strong>Learning rate:</strong> α = 0.1<br><br> | |
| <strong>Update Rule:</strong> w_new = w_old - α × ∂L/∂w<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Weight</th> | |
| <th style="padding: 8px;">Old Value</th> | |
| <th style="padding: 8px;">Gradient</th> | |
| <th style="padding: 8px;">Update</th> | |
| <th style="padding: 8px;">New Value</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">w₁₁</td> | |
| <td style="text-align: center;">0.1</td> | |
| <td style="text-align: center;">-0.031</td> | |
| <td style="text-align: center;">0.1 - 0.1×(-0.031)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.103</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">w₁₂</td> | |
| <td style="text-align: center;">0.2</td> | |
| <td style="text-align: center;">-0.062</td> | |
| <td style="text-align: center;">0.2 - 0.1×(-0.062)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.206</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">w₂₁</td> | |
| <td style="text-align: center;">0.3</td> | |
| <td style="text-align: center;">-0.029</td> | |
| <td style="text-align: center;">0.3 - 0.1×(-0.029)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.303</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">w₂₂</td> | |
| <td style="text-align: center;">0.4</td> | |
| <td style="text-align: center;">-0.058</td> | |
| <td style="text-align: center;">0.4 - 0.1×(-0.058)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.406</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">w₁(h→o)</td> | |
| <td style="text-align: center;">0.5</td> | |
| <td style="text-align: center;">-0.176</td> | |
| <td style="text-align: center;">0.5 - 0.1×(-0.176)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.518</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">w₂(h→o)</td> | |
| <td style="text-align: center;">0.6</td> | |
| <td style="text-align: center;">-0.209</td> | |
| <td style="text-align: center;">0.6 - 0.1×(-0.209)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.621</strong></td> | |
| </tr> | |
| </table> | |
| <em style="color: #a9b4c2;">Weights increased because gradient was negative (we want to | |
| increase output toward 1)</em> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Backpropagation Summary</div> | |
| <div class="callout-content"> | |
| <strong>The Algorithm:</strong><br> | |
| 1. <strong>Forward pass:</strong> Calculate all activations from input → output<br> | |
| 2. <strong>Calculate output error:</strong> δ_output = ŷ - y (for sigmoid + BCE)<br> | |
| 3. <strong>Backpropagate error:</strong> δ_hidden = (Wᵀ × δ_next) ⊙ σ'(z)<br> | |
| 4. <strong>Calculate gradients:</strong> ∂L/∂W = δ × (input to that layer)ᵀ<br> | |
| 5. <strong>Update weights:</strong> W = W - α × ∂L/∂W<br><br> | |
| <em style="color: #7ef0d4;">This is iterated thousands of times until the loss | |
| converges!</em> | |
| </div> | |
| </div> | |
| </div> | |
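| <p>Every number in the walkthrough can be reproduced with a short NumPy sketch of this exact 2→2→1 | |
| network (one training example, one gradient step):</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| def sigmoid(z): return 1.0 / (1.0 + np.exp(-z)) | |
| # Weights and input from the walkthrough | |
| W1 = np.array([[0.1, 0.3], [0.2, 0.4]])  # input -> hidden (rows = inputs) | |
| b1 = np.array([0.1, 0.1]) | |
| W2 = np.array([0.5, 0.6])                # hidden -> output | |
| b2 = 0.2 | |
| x, y = np.array([1.0, 2.0]), 1.0 | |
| # Forward pass | |
| h = sigmoid(x @ W1 + b1)      # [0.646, 0.769] | |
| y_hat = sigmoid(h @ W2 + b2)  # 0.728 | |
| loss = -np.log(y_hat)         # 0.317 (BCE with y = 1) | |
| # Backward pass (chain rule) | |
| delta_out = y_hat - y                        # -0.272 | |
| grad_W2 = delta_out * h                      # [-0.176, -0.209] | |
| delta_hidden = delta_out * W2 * h * (1 - h)  # [-0.031, -0.029] | |
| grad_W1 = np.outer(x, delta_hidden)          # matches the table above | |
| # One gradient descent step with alpha = 0.1 | |
| alpha = 0.1 | |
| W1 -= alpha * grad_W1; b1 -= alpha * delta_hidden | |
| W2 -= alpha * grad_W2; b2 -= alpha * delta_out | |
| print(W1, W2)  # [[0.103 0.303] [0.206 0.406]] and [0.518 0.621]</pre> | |
| </div> | |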
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.neural_network <span style="color: #ff8c6a;">import</span> MLPClassifier | |
| <span style="color: #6aa9ff;"># Create neural network</span> | |
| mlp = MLPClassifier( | |
| hidden_layer_sizes=(<span style="color: #7ef0d4;">100</span>, <span style="color: #7ef0d4;">50</span>), <span style="color: #6aa9ff;"># 2 hidden layers</span> | |
| activation=<span style="color: #7ef0d4;">'relu'</span>, | |
| max_iter=<span style="color: #7ef0d4;">500</span> | |
| ) | |
| <span style="color: #6aa9ff;"># Train</span> | |
| mlp.fit(X_train, y_train) | |
| <span style="color: #6aa9ff;"># Predict</span> | |
| predictions = mlp.predict(X_test)</pre> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="cross-validation"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Evaluation</span> Cross-Validation</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Cross-validation gives more reliable performance estimates by testing your model on multiple | |
| different splits of the data!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Splits data into K folds</li> | |
| <li>Trains K times, each with different test fold</li> | |
| <li>Averages results for robust estimate</li> | |
| <li>Reduces variance in performance estimate</li> | |
| </ul> | |
| </div> | |
| <h3>The Problem with Simple Train-Test Split</h3> | |
| <p>With a single 80-20 split:</p> | |
| <ul> | |
| <li>Performance depends on which data you randomly picked</li> | |
| <li>Might get lucky/unlucky with the split</li> | |
| <li>20% of data wasted (not used for training)</li> | |
| <li>One number doesn't tell you about variance</li> | |
| </ul> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Single Split Problem</div> | |
| <div class="callout-content"> | |
| You test once and get 85% accuracy. Is that good? Or did you just get lucky with an easy | |
| test set? Without multiple tests, you don't know! | |
| </div> | |
| </div> | |
| <h3>K-Fold Cross-Validation</h3> | |
| <p>The solution: Split data into K folds and test K times!</p> | |
| <div class="formula"> | |
| <strong>K-Fold Algorithm:</strong> | |
| 1. Split data into K equal folds<br> | |
| 2. For i = 1 to K:<br> | |
| - Use fold i as test set<br> | |
| - Use all other folds as training set<br> | |
| - Train model and record accuracyᵢ<br> | |
| 3. Final score = mean(accuracy₁, ..., accuracyₖ)<br> | |
| 4. Also report std dev for confidence | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="cv-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> 3-Fold Cross-Validation - each fold serves as | |
| test set once</p> | |
| </div> | |
| <h3>Example: 3-Fold CV</h3> | |
| <p>Dataset with 12 samples (A through L), split into 3 folds:</p> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Fold</th> | |
| <th>Test Set</th> | |
| <th>Training Set</th> | |
| <th>Accuracy</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>1</td> | |
| <td>A, B, C, D</td> | |
| <td>E, F, G, H, I, J, K, L</td> | |
| <td>0.96</td> | |
| </tr> | |
| <tr> | |
| <td>2</td> | |
| <td>E, F, G, H</td> | |
| <td>A, B, C, D, I, J, K, L</td> | |
| <td>0.84</td> | |
| </tr> | |
| <tr> | |
| <td>3</td> | |
| <td>I, J, K, L</td> | |
| <td>A, B, C, D, E, F, G, H</td> | |
| <td>0.90</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="formula"> | |
| <strong>Final Score:</strong> | |
| Mean = (0.96 + 0.84 + 0.90) / 3 = 0.90 (90%)<br> | |
| Std Dev = 0.049<br> | |
| <br> | |
| <strong>Report:</strong> 90% ± 5% | |
| </div> | |
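| <p>scikit-learn does all of this in one call. A minimal sketch (the synthetic dataset and the | |
| LogisticRegression model are placeholders for your own):</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| from sklearn.datasets import make_classification | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import cross_val_score | |
| X, y = make_classification(n_samples=120, random_state=42)  # synthetic data | |
| model = LogisticRegression(max_iter=1000) | |
| scores = cross_val_score(model, X, y, cv=3)  # 3-fold CV, one accuracy per fold | |
| print(scores) | |
| print(f"{scores.mean():.2f} ± {scores.std():.2f}")  # report mean ± std</pre> | |
| </div> | |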
| <h3>Choosing K</h3> | |
| <ul> | |
| <li><strong>K=5:</strong> Most common, good balance</li> | |
| <li><strong>K=10:</strong> More reliable, standard in research</li> | |
| <li><strong>K=n (Leave-One-Out):</strong> Maximum data usage, but expensive</li> | |
| <li><strong>Larger K:</strong> More computation, less bias, more variance</li> | |
| <li><strong>Smaller K:</strong> Less computation, more bias, less variance</li> | |
| </ul> | |
| <h3>Stratified K-Fold</h3> | |
| <p>For classification with imbalanced classes, use <strong>stratified</strong> K-fold to maintain | |
| class proportions in each fold!</p> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Example</div> | |
| <div class="callout-content"> | |
| Dataset: 80% class 0, 20% class 1<br> | |
| <br> | |
| <strong>Regular K-fold:</strong> One fold might have 90% class 0, another 70%<br> | |
| <strong>Stratified K-fold:</strong> Every fold has 80% class 0, 20% class 1 ✓ | |
| </div> | |
| </div> | |
| <h3>Leave-One-Out Cross-Validation (LOOCV)</h3> | |
| <p>Special case where K = n (number of samples):</p> | |
| <ul> | |
| <li>Each sample is test set once</li> | |
| <li>Train on n-1 samples, test on 1</li> | |
| <li>Repeat n times</li> | |
| <li>Maximum use of training data</li> | |
| <li>Very expensive for large datasets</li> | |
| </ul> | |
| <h3>Benefits of Cross-Validation</h3> | |
| <ul> | |
| <li>✓ More reliable performance estimate</li> | |
| <li>✓ Uses all data for both training and testing</li> | |
| <li>✓ Reduces variance in estimate</li> | |
| <li>✓ Detects overfitting (high variance across folds)</li> | |
| <li>✓ Better for small datasets</li> | |
| </ul> | |
| <h3>Drawbacks</h3> | |
| <ul> | |
| <li>✗ Computationally expensive (train K times)</li> | |
| <li>✗ Not suitable for time series (can't shuffle)</li> | |
| <li>✗ Still need final train-test split for final model</li> | |
| </ul> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Best Practice</div> | |
| <div class="callout-content"> | |
| 1. Use cross-validation to evaluate models and tune hyperparameters<br> | |
| 2. Once you pick the best model, train on ALL training data<br> | |
| 3. Test once on held-out test set for final unbiased estimate<br> | |
| <br> | |
| <strong>Never</strong> use test set during cross-validation! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="preprocessing"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔍 | |
| Unsupervised - Preprocessing</span> Data Preprocessing</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Raw data is messy! Data preprocessing cleans and transforms data into a format that machine | |
| learning algorithms can use effectively.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Steps</div> | |
| <ul class="info-card-list"> | |
| <li>Handle missing values</li> | |
| <li>Encode categorical variables</li> | |
| <li>Scale/normalize features</li> | |
| <li>Split data properly</li> | |
| </ul> | |
| </div> | |
| <h3>1. Handling Missing Values</h3> | |
| <p>Real-world data often has missing values. We can't just ignore them!</p> | |
| <h4>Strategies (sketched in code after this list):</h4> | |
| <ul> | |
| <li><strong>Drop rows:</strong> If only a few values are missing (&lt;5%)</li> | |
| <li><strong>Mean imputation:</strong> Replace with column mean (numerical)</li> | |
| <li><strong>Median imputation:</strong> Replace with median (robust to outliers)</li> | |
| <li><strong>Mode imputation:</strong> Replace with most frequent (categorical)</li> | |
| <li><strong>Forward/backward fill:</strong> Use previous/next value (time series)</li> | |
| <li><strong>Predictive imputation:</strong> Train model to predict missing values</li> | |
| </ul> | |
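| <p>Mean, median, and mode imputation are one line with scikit-learn's SimpleImputer. A sketch on a | |
| made-up age/income matrix:</p> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.impute import SimpleImputer | |
| X = np.array([[25.0, 50000.0], | |
|               [np.nan, 60000.0],  # missing age | |
|               [35.0, np.nan],     # missing income | |
|               [40.0, 80000.0]]) | |
| # strategy can be 'mean', 'median', or 'most_frequent' | |
| imputer = SimpleImputer(strategy="mean") | |
| X_filled = imputer.fit_transform(X) | |
| print(X_filled)  # NaNs replaced with the column means</pre> | |
| </div> | |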
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Warning</div> | |
| <div class="callout-content"> | |
| Never drop columns with many missing values without investigation! The missingness itself | |
| might be informative (e.g., income not reported might correlate with high income). | |
| </div> | |
| </div> | |
| <h3>2. Encoding Categorical Variables</h3> | |
| <p>Most ML algorithms need numerical input. We must convert categories to numbers!</p> | |
| <h4>One-Hot Encoding</h4> | |
| <p>Creates binary column for each category. Use for <strong>nominal</strong> data (no order).</p> | |
| <div class="formula"> | |
| <strong>Example:</strong> | |
| Color: ["Red", "Blue", "Green", "Blue"]<br> | |
| <br> | |
| Becomes three columns:<br> | |
| Red: [1, 0, 0, 0]<br> | |
| Blue: [0, 1, 0, 1]<br> | |
| Green: [0, 0, 1, 0] | |
| </div> | |
| <h4>Label Encoding</h4> | |
| <p>Assigns integer to each category. Use for <strong>ordinal</strong> data (has order).</p> | |
| <div class="formula"> | |
| <strong>Example:</strong> | |
| Size: ["Small", "Large", "Medium", "Small"]<br> | |
| <br> | |
| Becomes: [0, 2, 1, 0]<br> | |
| <small>(Small=0, Medium=1, Large=2)</small> | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Don't Mix Them Up!</div> | |
| <div class="callout-content"> | |
| Never use label encoding for nominal data! If you encode ["Red", "Blue", "Green"] as [0, 1, | |
| 2], the model thinks Green > Blue > Red, which is meaningless! | |
| </div> | |
| </div> | |
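| <p>Both encoders sketched with scikit-learn (the toy data and the category ordering are illustrative; sparse_output requires scikit-learn 1.2 or newer):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import pandas as pd | |
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder | |
| # One-hot for NOMINAL data: one binary column per category, no implied order | |
| colors = pd.DataFrame({"color": ["Red", "Blue", "Green", "Blue"]}) | |
| onehot = OneHotEncoder(sparse_output=False) | |
| print(onehot.fit_transform(colors))  # columns ordered alphabetically: Blue, Green, Red | |
| # Ordinal (label-style) encoding for ORDINAL data: the order is meaningful | |
| sizes = pd.DataFrame({"size": ["Small", "Large", "Medium", "Small"]}) | |
| ordinal = OrdinalEncoder(categories=[["Small", "Medium", "Large"]]) | |
| print(ordinal.fit_transform(sizes))  # [[0.] [2.] [1.] [0.]]</pre> | |
| </div> | |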
| <h3>3. Feature Scaling</h3> | |
| <p>Different features have different scales. Age (0-100) vs Income ($0-$1M). This causes problems! | |
| </p> | |
| <h4>Why Scale?</h4> | |
| <ul> | |
| <li>Gradient descent converges faster</li> | |
| <li>Distance-based algorithms (KNN, SVM) need it</li> | |
| <li>Regularization treats features equally</li> | |
| <li>Neural networks train better</li> | |
| </ul> | |
| <h4>StandardScaler (Z-score normalization)</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| z = (x - μ) / σ | |
| <br><small>where:<br>μ = mean of feature<br>σ = standard deviation<br>Result: mean=0, | |
| std=1</small> | |
| </div> | |
| <p><strong>Example:</strong> [10, 20, 30, 40, 50]</p> | |
| <p>μ = 30, σ = 15.81</p> | |
| <p>Scaled: [-1.26, -0.63, 0, 0.63, 1.26]</p> | |
| <h4>MinMaxScaler</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| x' = (x - min) / (max - min) | |
| <br><small>Result: range [0, 1]</small> | |
| </div> | |
| <p><strong>Example:</strong> [10, 20, 30, 40, 50]</p> | |
| <p>Scaled: [0, 0.25, 0.5, 0.75, 1.0]</p> | |
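| <p>Both scalers in a short sketch. One caveat: scikit-learn's StandardScaler divides by the population standard deviation (≈14.14 for this data), so its outputs reach ±1.41 rather than the ±1.26 of the hand calculation above, which used the sample standard deviation (15.81):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.preprocessing import MinMaxScaler, StandardScaler | |
| x = np.array([[10.0], [20.0], [30.0], [40.0], [50.0]]) | |
| # Z-score scaling: mean 0, std 1 (population std, ddof=0) | |
| print(StandardScaler().fit_transform(x).ravel())  # approx [-1.414 -0.707 0 0.707 1.414] | |
| # Min-max scaling: squeezes values into [0, 1] | |
| print(MinMaxScaler().fit_transform(x).ravel())    # [0. 0.25 0.5 0.75 1.]</pre> | |
| </div> | |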
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 350px"> | |
| <canvas id="scaling-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Feature distributions before and after | |
| scaling</p> | |
| </div> | |
| <h3>Critical: fit_transform vs transform</h3> | |
| <p>This is where many beginners make mistakes!</p> | |
| <div class="formula"> | |
| <strong>fit_transform():</strong><br> | |
| 1. Learns parameters (μ, σ, min, max) from data<br> | |
| 2. Transforms the data<br> | |
| <strong>Use on:</strong> Training data ONLY<br> | |
| <br> | |
| <strong>transform():</strong><br> | |
| 1. Uses already-learned parameters<br> | |
| 2. Transforms the data<br> | |
| <strong>Use on:</strong> Test data, new data | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ DATA LEAKAGE!</div> | |
| <div class="callout-content"> | |
| <strong>WRONG:</strong><br> | |
| scaler.fit(test_data) # Learns from test data!<br> | |
| <br> | |
| <strong>CORRECT:</strong><br> | |
| scaler.fit(train_data) # Learn from train only<br> | |
| train_scaled = scaler.transform(train_data)<br> | |
| test_scaled = scaler.transform(test_data)<br> | |
| <br> | |
| If you fit on test data, you're "peeking" at the answers! | |
| </div> | |
| </div> | |
| <h3>4. Train-Test Split</h3> | |
| <p>Always split data BEFORE any preprocessing that learns parameters!</p> | |
| <div class="formula"> | |
| <strong>Correct Order:</strong><br> | |
| 1. Split data → train (80%), test (20%)<br> | |
| 2. Handle missing values (fit on train)<br> | |
| 3. Encode categories (fit on train)<br> | |
| 4. Scale features (fit on train)<br> | |
| 5. Train model<br> | |
| 6. Test model (using same transformations) | |
| </div> | |
| <h3>Complete Pipeline Example</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 300px"> | |
| <canvas id="pipeline-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Complete preprocessing pipeline</p> | |
| </div> | |
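| <p>A minimal sketch of such a pipeline (the dataset and model choice are illustrative). Because the scaler lives inside the Pipeline, calling fit learns its parameters from the training data only, which makes the leakage mistake above impossible:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| from sklearn.datasets import load_breast_cancer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler | |
| X, y = load_breast_cancer(return_X_y=True) | |
| # 1. Split FIRST | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
| # 2-5. Scaler + model in one object: fit() learns scaling from train only | |
| pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=5000))]) | |
| pipe.fit(X_train, y_train) | |
| # 6. The pipeline applies the SAME learned transformation to the test data | |
| print(f"Test accuracy: {pipe.score(X_test, y_test):.3f}") | |
| # joblib.dump(pipe, "model.joblib")  # persist scaler + model together for new data</pre> | |
| </div> | |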
| <div class="callout success"> | |
| <div class="callout-title">✅ Golden Rules</div> | |
| <div class="callout-content"> | |
| 1. <strong>Split first!</strong> Before any preprocessing<br> | |
| 2. <strong>Fit on train only!</strong> Never on test<br> | |
| 3. <strong>Transform both!</strong> Apply same transformations to test<br> | |
| 4. <strong>Pipeline everything!</strong> Use scikit-learn Pipeline to avoid mistakes<br> | |
| 5. <strong>Save your scaler!</strong> You'll need it for new predictions | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="section" id="loss-functions"> | |
| <div class="section-header"> | |
| <h2>12. Loss Functions</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Loss functions measure how wrong our predictions are. Different problems need different loss | |
| functions! The choice dramatically affects what your model learns.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Loss = how wrong a single prediction is</li> | |
| <li>Cost = average loss over all samples</li> | |
| <li>Regression: MSE, MAE, RMSE</li> | |
| <li>Classification: Log Loss, Hinge Loss</li> | |
| </ul> | |
| </div> | |
| <h3>Loss Functions for Regression</h3> | |
| <h4>Mean Squared Error (MSE)</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| MSE = (1/n) × Σ(y - ŷ)² | |
| <br><small>where:<br>y = actual value<br>ŷ = predicted value<br>n = number of samples</small> | |
| </div> | |
| <h5>Characteristics:</h5> | |
| <ul> | |
| <li><strong>Squares errors:</strong> Penalizes large errors heavily</li> | |
| <li><strong>Always positive:</strong> Minimum is 0 (perfect predictions)</li> | |
| <li><strong>Differentiable:</strong> Great for gradient descent</li> | |
| <li><strong>Sensitive to outliers:</strong> One huge error dominates</li> | |
| <li><strong>Units:</strong> Squared units (harder to interpret)</li> | |
| </ul> | |
| <p><strong>Example:</strong> Predictions [12, 19, 32], Actual [10, 20, 30]</p> | |
| <p>Errors: [2, -1, 2]</p> | |
| <p>Squared: [4, 1, 4]</p> | |
| <p>MSE = (4 + 1 + 4) / 3 = <strong>3.0</strong></p> | |
| <h4>Mean Absolute Error (MAE)</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| MAE = (1/n) × Σ|y - ŷ| | |
| <br><small>Absolute value of errors</small> | |
| </div> | |
| <h5>Characteristics:</h5> | |
| <ul> | |
| <li><strong>Linear penalty:</strong> All errors weighted equally</li> | |
| <li><strong>Robust to outliers:</strong> One huge error doesn't dominate</li> | |
| <li><strong>Interpretable units:</strong> Same units as target</li> | |
| <li><strong>Not differentiable at 0:</strong> Slightly harder to optimize</li> | |
| </ul> | |
| <p><strong>Example:</strong> Predictions [12, 19, 32], Actual [10, 20, 30]</p> | |
| <p>Errors: [2, -1, 2]</p> | |
| <p>Absolute: [2, 1, 2]</p> | |
| <p>MAE = (2 + 1 + 2) / 3 = <strong>1.67</strong></p> | |
| <h4>Root Mean Squared Error (RMSE)</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| RMSE = √MSE | |
| <br><small>Square root of MSE</small> | |
| </div> | |
| <h5>Characteristics:</h5> | |
| <ul> | |
| <li><strong>Same units as target:</strong> More interpretable than MSE</li> | |
| <li><strong>Still sensitive to outliers:</strong> But less than MSE</li> | |
| <li><strong>Common in competitions:</strong> Kaggle, etc.</li> | |
| </ul> | |
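| <p>The three metrics computed on the worked example above, as a quick sketch (RMSE is taken as the square root of MSE):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error | |
| y_true = [10, 20, 30] | |
| y_pred = [12, 19, 32] | |
| mse = mean_squared_error(y_true, y_pred)   # 3.0 | |
| mae = mean_absolute_error(y_true, y_pred)  # 1.667 | |
| rmse = np.sqrt(mse)                        # 1.732, same units as the target | |
| print(mse, mae, rmse)</pre> | |
| </div> | |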
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="loss-comparison-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Comparing MSE, MAE, and their response to | |
| errors</p> | |
| </div> | |
| <h3>Loss Functions for Classification</h3> | |
| <h4>Log Loss (Cross-Entropy)</h4> | |
| <div class="formula"> | |
| <strong>Binary Cross-Entropy:</strong> | |
| Loss = -(1/n) × Σ[y·log(ŷ) + (1-y)·log(1-ŷ)] | |
| <br><small>where:<br>y ∈ {0, 1} = actual label<br>ŷ ∈ (0, 1) = predicted probability</small> | |
| </div> | |
| <h5>Characteristics:</h5> | |
| <ul> | |
| <li><strong>For probabilities:</strong> Output must be [0, 1]</li> | |
| <li><strong>Heavily penalizes confident wrong predictions:</strong> Good!</li> | |
| <li><strong>Convex:</strong> No local minima, easy to optimize</li> | |
| <li><strong>Probabilistic interpretation:</strong> Maximum likelihood</li> | |
| </ul> | |
| <p><strong>Example:</strong> y=1 (spam), predicted p=0.9</p> | |
| <p>Loss = -[1·log(0.9) + 0·log(0.1)] = -log(0.9) = <strong>0.105</strong> (low, good!)</p> | |
| <p><strong>Example:</strong> y=1 (spam), predicted p=0.1</p> | |
| <p>Loss = -[1·log(0.1) + 0·log(0.9)] = -log(0.1) = <strong>2.303</strong> (high, bad!)</p> | |
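| <p>The same two cases in code, as a direct translation of the formula (numerically identical to the hand calculation):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| def binary_cross_entropy(y, p): | |
|     # single-sample loss: -[y*log(p) + (1-y)*log(1-p)] | |
|     return -(y * np.log(p) + (1 - y) * np.log(1 - p)) | |
| print(binary_cross_entropy(1, 0.9))  # 0.105 -- confident and correct | |
| print(binary_cross_entropy(1, 0.1))  # 2.303 -- confident and WRONG</pre> | |
| </div> | |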
| <h4>Hinge Loss (for SVM)</h4> | |
| <div class="formula"> | |
| <strong>Formula:</strong> | |
| Loss = max(0, 1 - y·score) | |
| <br><small>where:<br>y ∈ {-1, +1}<br>score = w·x + b</small> | |
| </div> | |
| <h5>Characteristics:</h5> | |
| <ul> | |
| <li><strong>Margin-based:</strong> Encourages confident predictions</li> | |
| <li><strong>Zero loss for correct & confident:</strong> When y·score ≥ 1</li> | |
| <li><strong>Linear penalty:</strong> For violations</li> | |
| <li><strong>Used in SVM:</strong> Maximizes margin</li> | |
| </ul> | |
| <h3>When to Use Which Loss?</h3> | |
| <div class="info-card" style="background: rgba(106, 169, 255, 0.1);"> | |
| <div class="info-card-title" style="color: #6aa9ff;">Regression Problems</div> | |
| <ul style="list-style: none; padding: 0;"> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>MSE:</strong> Default choice, smooth optimization, use when outliers are errors | |
| </li> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>MAE:</strong> When you have outliers that are valid data points | |
| </li> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>RMSE:</strong> When you need interpretable metric in original units | |
| </li> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>Huber Loss:</strong> Combines MSE and MAE - best of both worlds! | |
| </li> | |
| </ul> | |
| </div> | |
| <div class="info-card" style="background: rgba(126, 240, 212, 0.1); margin-top: 16px;"> | |
| <div class="info-card-title" style="color: #7ef0d4;">Classification Problems</div> | |
| <ul style="list-style: none; padding: 0;"> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>Log Loss:</strong> Default for binary/multi-class, when you need probabilities | |
| </li> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>Hinge Loss:</strong> For SVM, when you want maximum margin | |
| </li> | |
| <li style="padding: 8px 0; border: none;"> | |
| <strong>Focal Loss:</strong> For highly imbalanced datasets | |
| </li> | |
| </ul> | |
| </div> | |
| <h3>Visualizing Loss Curves</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 350px"> | |
| <canvas id="loss-curves-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> How different losses respond to errors</p> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Impact of Outliers</div> | |
| <div class="callout-content"> | |
| Imagine predictions [100, 102, 98, 150] for actuals [100, 100, 100, 100]:<br> | |
| <br> | |
| <strong>MSE:</strong> (0 + 4 + 4 + 2500) / 4 = 627 ← Dominated by outlier!<br> | |
| <strong>MAE:</strong> (0 + 2 + 2 + 50) / 4 = 13.5 ← More balanced<br> | |
| <br> | |
| MSE is about 46× larger because it squares the huge error! | |
| </div> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaways</div> | |
| <div class="callout-content"> | |
| 1. Loss function choice affects what your model learns<br> | |
| 2. MSE penalizes large errors more than MAE<br> | |
| 3. Use MAE when outliers are valid, MSE when they're errors<br> | |
| 4. Log loss for classification with probabilities<br> | |
| 5. Always plot your errors to understand what's happening!<br> | |
| <br> | |
| <strong>The loss function IS your model's objective!</strong> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 13: Finding Optimal K in KNN --> | |
| <div class="section" id="optimal-k"> | |
| <div class="section-header"> | |
| <h2>13. Finding Optimal K in KNN</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Choosing the right K value is critical for KNN performance! Too small causes overfitting, too | |
| large causes underfitting. Let's explore systematic methods to find the optimal K.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Methods</div> | |
| <ul class="info-card-list"> | |
| <li>Elbow Method: Plot accuracy vs K, find the "elbow"</li> | |
| <li>Cross-Validation: Test multiple K values with k-fold CV</li> | |
| <li>Grid Search: Systematically test K values</li> | |
| <li>Avoid K=1 (overfits) and K=n (underfits)</li> | |
| </ul> | |
| </div> | |
| <h3>Method 1: Elbow Method</h3> | |
| <p>Test different K values and plot performance. Look for the "elbow" where adding more neighbors | |
| doesn't help much.</p> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px; position: relative;"> | |
| <canvas id="elbow-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Elbow curve showing optimal K at the bend | |
| </p> | |
| </div> | |
| <h3>Method 2: Cross-Validation Approach</h3> | |
| <p>For each K value, run k-fold cross-validation and calculate mean accuracy. Choose K with highest | |
| mean accuracy.</p> | |
| <div class="formula"> | |
| <strong>Cross-Validation Process:</strong> | |
| for K in [1, 2, 3, ..., 20]:<br> | |
| accuracies = []<br> | |
| for fold in [1, 2, 3]:<br> | |
| train model with K neighbors<br> | |
| test on validation fold<br> | |
| accuracies.append(accuracy)<br> | |
| mean_accuracy[K] = mean(accuracies)<br> | |
| <br> | |
| optimal_K = argmax(mean_accuracy) | |
| </div> | |
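| <p>A runnable version of this loop (the Iris dataset and 5-fold CV are illustrative choices):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| from sklearn.datasets import load_iris | |
| from sklearn.model_selection import cross_val_score | |
| from sklearn.neighbors import KNeighborsClassifier | |
| X, y = load_iris(return_X_y=True) | |
| results = {} | |
| for k in range(1, 21): | |
|     scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5) | |
|     results[k] = (scores.mean(), scores.std())  # track mean AND stability | |
| best_k = max(results, key=lambda k: results[k][0]) | |
| mean, std = results[best_k] | |
| print(f"Optimal K = {best_k}: {mean:.3f} +/- {std:.3f}")</pre> | |
| </div> | |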
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="cv-k-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Cross-validation accuracies heatmap for | |
| different K values</p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Why Cross-Validation is Better</div> | |
| <div class="callout-content"> | |
| Single train-test split might be lucky/unlucky. Cross-validation gives you: | |
| <ul> | |
| <li>Mean accuracy (average performance)</li> | |
| <li>Standard deviation (how stable is K?)</li> | |
| <li>Confidence in your choice</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <h3>Practical Guidelines</h3> | |
| <ul> | |
| <li><strong>Start with K = √n:</strong> Good rule of thumb</li> | |
| <li><strong>Try odd K values:</strong> Avoids ties in binary classification</li> | |
| <li><strong>Test range [1, 20]:</strong> Covers most practical scenarios</li> | |
| <li><strong>Check for stability:</strong> Low std dev across folds</li> | |
| </ul> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Real-World Example</div> | |
| <div class="callout-content"> | |
| <strong>Iris Dataset (150 samples):</strong><br> | |
| √150 ≈ 12, so start testing around K=11, K=13, K=15<br> | |
| After CV: K=5 gives 96% ± 2% → Optimal choice!<br> | |
| K=1 gives 94% ± 8% → Too much variance<br> | |
| K=25 gives 88% ± 1% → Too smooth, underfitting | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 14: Hyperparameter Tuning --> | |
| <div class="section" id="hyperparameter-tuning"> | |
| <div class="section-header"> | |
| <h2>14. Hyperparameter Tuning with GridSearch</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Hyperparameters control how your model learns. Unlike model parameters (learned from data), | |
| hyperparameters are set BEFORE training. GridSearch systematically finds the best combination! | |
| </p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Common Hyperparameters</div> | |
| <ul class="info-card-list"> | |
| <li>Learning rate (α) - Gradient Descent step size</li> | |
| <li>K - Number of neighbors in KNN</li> | |
| <li>C, gamma - SVM parameters</li> | |
| <li>Max depth - Decision Tree depth</li> | |
| <li>Number of trees - Random Forest</li> | |
| </ul> | |
| </div> | |
| <h3>GridSearch Explained</h3> | |
| <p>GridSearch tests ALL combinations of hyperparameters you specify. It's exhaustive but guarantees | |
| finding the best combination in your grid.</p> | |
| <div class="formula"> | |
| <strong>Example: SVM GridSearch</strong> | |
| param_grid = {<br> | |
| 'C': [0.1, 1, 10, 100],<br> | |
| 'gamma': [0.001, 0.01, 0.1, 1],<br> | |
| 'kernel': ['linear', 'rbf']<br> | |
| }<br> | |
| <br> | |
| Total combinations: 4 × 4 × 2 = 32<br> | |
| With 5-fold CV: 32 × 5 = 160 model trainings! | |
| </div> | |
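| <p>The same grid expressed with GridSearchCV (the dataset is an illustrative stand-in; refit=True, the default, automatically retrains the best model on all the data it was given):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| from sklearn.datasets import load_iris | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.svm import SVC | |
| X, y = load_iris(return_X_y=True) | |
| param_grid = { | |
|     "C": [0.1, 1, 10, 100], | |
|     "gamma": [0.001, 0.01, 0.1, 1], | |
|     "kernel": ["linear", "rbf"], | |
| } | |
| # 32 combinations x 5 folds = 160 model fits | |
| grid = GridSearchCV(SVC(), param_grid, cv=5) | |
| grid.fit(X, y) | |
| print(grid.best_params_) | |
| print(f"Best CV accuracy: {grid.best_score_:.3f}")</pre> | |
| </div> | |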
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="gridsearch-heatmap" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> GridSearch heatmap showing accuracy for C vs | |
| gamma combinations</p> | |
| </div> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Select Model:</label> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="grid-model" value="svm" checked> SVM</label> | |
| <label><input type="radio" name="grid-model" value="rf"> Random Forest</label> | |
| </div> | |
| </div> | |
| </div> | |
| <h3>Performance Surface (3D View)</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="param-surface" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> 3D surface showing how parameters affect | |
| performance</p> | |
| </div> | |
| <h3>When GridSearch Fails</h3> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ The Curse of Dimensionality</div> | |
| <div class="callout-content"> | |
| <strong>Problem:</strong> Too many hyperparameters = exponential search space<br> | |
| <br> | |
| <strong>Example:</strong> 5 hyperparameters × 10 values each = 100,000 combinations!<br> | |
| <br> | |
| <strong>Solutions:</strong><br> | |
| • RandomizedSearchCV: random sampling (faster, often good enough)<br> | |
| • Bayesian optimization: smarter search guided by previous results<br> | |
| • HalvingGridSearchCV: eliminates poor performers early (successive halving) | |
| </div> | |
| </div> | |
| <h3>Best Practices</h3> | |
| <ul> | |
| <li><strong>Start coarse:</strong> Wide range, few values (e.g., C: [0.1, 1, 10, 100])</li> | |
| <li><strong>Then refine:</strong> Narrow range around best (e.g., C: [5, 7, 9, 11])</li> | |
| <li><strong>Use cross-validation:</strong> Avoid overfitting to validation set</li> | |
| <li><strong>Log scale for wide ranges:</strong> [0.001, 0.01, 0.1, 1, 10, 100]</li> | |
| <li><strong>Consider computation time:</strong> More folds = more reliable but slower</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- Section 15: Naive Bayes (COMPREHENSIVE WITH MATH) --> | |
| <div class="section" id="naive-bayes"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Classification</span> Naive Bayes Classification</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Naive Bayes is a probabilistic classifier based on Bayes' Theorem. Despite its "naive" | |
| independence assumption, it works surprisingly well for text classification and other tasks! | |
| We'll cover both Categorical and Gaussian Naive Bayes with complete mathematical solutions.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Based on Bayes' Theorem from probability theory</li> | |
| <li>Assumes features are independent (naive assumption)</li> | |
| <li>Very fast training and prediction</li> | |
| <li>Works well with high-dimensional data</li> | |
| </ul> | |
| </div> | |
| <h3>Bayes' Theorem</h3> | |
| <div class="formula"> | |
| <strong>The Foundation:</strong> | |
| P(Class|Features) = P(Features|Class) × P(Class) / P(Features)<br> | |
| <br> | |
| Reading each term:<br> | |
| • P(Class|Features): the Posterior (what we want)<br> | |
| • P(Features|Class): the Likelihood (estimated from data)<br> | |
| • P(Class): the Prior (baseline class frequency)<br> | |
| • P(Features): the Evidence (a normalizing constant) | |
| </div> | |
| <h3>The Naive Independence Assumption</h3> | |
| <p>"Naive" because we assume all features are independent given the class:</p> | |
| <div class="formula"> | |
| <strong>Independence Assumption:</strong> | |
| P(x₁, x₂, ..., xₙ | Class) = P(x₁|Class) × P(x₂|Class) × ... × P(xₙ|Class)<br> | |
| <br> | |
| <small>This is often NOT true in reality, but works anyway!</small> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="bayes-theorem-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Bayes' Theorem visual explanation</p> | |
| </div> | |
| <h3>Real-World Example: Email Spam Detection</h3> | |
| <p>Let's classify an email with words: ["free", "winner", "click"]</p> | |
| <div class="formula"> | |
| <strong>Training Data:</strong><br> | |
| • 300 spam emails (30%)<br> | |
| • 700 not-spam emails (70%)<br> | |
| <br> | |
| <strong>Word frequencies:</strong><br> | |
| P("free" | spam) = 0.8 (appears in 80% of spam)<br> | |
| P("free" | not-spam) = 0.1 (appears in 10% of not-spam)<br> | |
| <br> | |
| P("winner" | spam) = 0.7<br> | |
| P("winner" | not-spam) = 0.05<br> | |
| <br> | |
| P("click" | spam) = 0.6<br> | |
| P("click" | not-spam) = 0.2 | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="spam-classification" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Spam classification calculation | |
| step-by-step</p> | |
| </div> | |
| <h3>Step-by-Step Calculation</h3> | |
| <div class="callout info"> | |
| <div class="callout-title">📧 Classifying Our Email</div> | |
| <div class="callout-content"> | |
| <strong>P(spam | features):</strong><br> | |
| = P("free"|spam) × P("winner"|spam) × P("click"|spam) × P(spam)<br> | |
| = 0.8 × 0.7 × 0.6 × 0.3<br> | |
| = 0.1008<br> | |
| <br> | |
| <strong>P(not-spam | features):</strong><br> | |
| = P("free"|not-spam) × P("winner"|not-spam) × P("click"|not-spam) × P(not-spam)<br> | |
| = 0.1 × 0.05 × 0.2 × 0.7<br> | |
| = 0.0007<br> | |
| <br> | |
| <strong>Prediction:</strong> 0.1008 > 0.0007 → SPAM! 📧❌ | |
| </div> | |
| </div> | |
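| <p>The same calculation in a few lines of Python, with the probabilities taken directly from the tables above (a hand-rolled sketch, not a library call):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| # Likelihoods P(word | class) and priors from the training data above | |
| p_word_spam = {"free": 0.8, "winner": 0.7, "click": 0.6} | |
| p_word_ham = {"free": 0.1, "winner": 0.05, "click": 0.2} | |
| p_spam, p_ham = 0.3, 0.7 | |
| email = ["free", "winner", "click"] | |
| # Naive independence: multiply per-word likelihoods into the prior | |
| score_spam, score_ham = p_spam, p_ham | |
| for word in email: | |
|     score_spam *= p_word_spam[word] | |
|     score_ham *= p_word_ham[word] | |
| print(score_spam, score_ham)  # 0.1008 vs 0.0007 | |
| print("SPAM" if score_spam > score_ham else "NOT SPAM")</pre> | |
| </div> | |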
| <h3>Why It Works Despite Wrong Assumption</h3> | |
| <ul> | |
| <li><strong>Don't need exact probabilities:</strong> Just need correct ranking</li> | |
| <li><strong>Errors cancel out:</strong> Multiple features reduce impact</li> | |
| <li><strong>Simple is robust:</strong> Fewer parameters = less overfitting</li> | |
| <li><strong>Fast:</strong> Just multiply probabilities!</li> | |
| </ul> | |
| <h3>Comparison with Other Classifiers</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Aspect</th> | |
| <th>Naive Bayes</th> | |
| <th>Logistic Reg</th> | |
| <th>SVM</th> | |
| <th>KNN</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Speed</td> | |
| <td>Very Fast</td> | |
| <td>Fast</td> | |
| <td>Slow</td> | |
| <td>Very Slow</td> | |
| </tr> | |
| <tr> | |
| <td>Works with Little Data</td> | |
| <td>Yes</td> | |
| <td>Yes</td> | |
| <td>No</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>Interpretable</td> | |
| <td>Very</td> | |
| <td>Yes</td> | |
| <td>No</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>Handles Non-linear</td> | |
| <td>Yes</td> | |
| <td>No</td> | |
| <td>Yes</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>High Dimensions</td> | |
| <td>Excellent</td> | |
| <td>Good</td> | |
| <td>Good</td> | |
| <td>Poor</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h3>🎯 PART A: Categorical Naive Bayes (Step-by-Step)</h3> | |
| <h4>Dataset: Tennis Play Prediction</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Outlook</th> | |
| <th>Temperature</th> | |
| <th>Play</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Sunny</td> | |
| <td>Hot</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>Sunny</td> | |
| <td>Mild</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>Cloudy</td> | |
| <td>Hot</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>Rainy</td> | |
| <td>Mild</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>Rainy</td> | |
| <td>Cool</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>Cloudy</td> | |
| <td>Cool</td> | |
| <td>Yes</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Problem:</strong> Predict whether to play tennis when Outlook=Rainy and Temperature=Hot | |
| </p> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Calculate Prior Probabilities</div> | |
| <div class="step-calculation"> | |
| Count occurrences in training data:<br> | |
| • Play=Yes appears 4 times out of 6 total<br> | |
| • Play=No appears 2 times out of 6 total<br> | |
| <br> | |
| <strong>Calculation:</strong><br> | |
| P(Yes) = 4/6 = <strong>0.667 (66.7%)</strong><br> | |
| P(No) = 2/6 = <strong>0.333 (33.3%)</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Calculate Conditional Probabilities (Before Smoothing)</div> | |
| <div class="step-calculation"> | |
| <strong>For Outlook = "Rainy":</strong><br> | |
| • Count (Rainy AND Yes) = 2 examples<br> | |
| • Count (Yes) = 4 total<br> | |
| • P(Rainy|Yes) = 2/4 = <strong>0.5</strong><br> | |
| <br> | |
| • Count (Rainy AND No) = 0 examples ❌<br> | |
| • Count (No) = 2 total<br> | |
| • P(Rainy|No) = 0/2 = <strong>0</strong> ⚠️ <span style="color: #ff8c6a;">ZERO PROBABILITY | |
| PROBLEM!</span><br> | |
| <br> | |
| <strong>For Temperature = "Hot":</strong><br> | |
| • P(Hot|Yes) = 1/4 = <strong>0.25</strong><br> | |
| • P(Hot|No) = 1/2 = <strong>0.5</strong> | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Step 3: Apply Bayes' Theorem (Initial)</strong><br> | |
| <br> | |
| P(Yes|Rainy,Hot) = P(Yes) × P(Rainy|Yes) × P(Hot|Yes)<br> | |
| = | |
| 0.667 × 0.5 × 0.25<br> | |
| = | |
| 0.0833<br> | |
| <br> | |
| P(No|Rainy,Hot) = P(No) × P(Rainy|No) × P(Hot|No)<br> | |
| = | |
| 0.333 × 0 × 0.5<br> | |
| = | |
| 0 ❌ Problem! | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Zero Probability Problem</div> | |
| <div class="callout-content"> | |
| When P(Rainy|No) = 0, the entire probability becomes 0! This is unrealistic - just because | |
| we haven't seen "Rainy" with "No" in our training data doesn't mean it's impossible. We need | |
| <strong>Laplace Smoothing</strong>! | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Apply Laplace Smoothing (α = 1)</div> | |
| <div class="step-calculation"> | |
| <strong>Smoothed formula:</strong><br> | |
| P(x|c) = (count(x,c) + α) / (count(c) + α × num_categories)<br> | |
| <br> | |
| <strong>For Outlook</strong> (3 categories: Sunny, Cloudy, Rainy):<br> | |
| P(Rainy|Yes) = (2 + 1) / (4 + 1×3)<br> | |
| = | |
| 3/7<br> | |
| = | |
| <strong>0.429</strong> ✓<br> | |
| <br> | |
| P(Rainy|No) = (0 + 1) / (2 + 1×3)<br> | |
| = 1/5<br> | |
| = | |
| <strong>0.2</strong> ✓ <span style="color: #7ef0d4;">Fixed the zero!</span><br> | |
| <br> | |
| <strong>For Temperature</strong> (3 categories: Hot, Mild, Cool):<br> | |
| P(Hot|Yes) = (1 + 1) / (4 + 1×3) = 2/7 = <strong>0.286</strong><br> | |
| P(Hot|No) = (1 + 1) / (2 + 1×3) = 2/5 = <strong>0.4</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 5: Recalculate with Smoothing</div> | |
| <div class="step-calculation"> | |
| <strong>P(Yes|Rainy,Hot):</strong><br> | |
| = P(Yes) × P(Rainy|Yes) × P(Hot|Yes)<br> | |
| = 0.667 × 0.429 × 0.286<br> | |
| = <strong>0.0818</strong><br> | |
| <br> | |
| <strong>P(No|Rainy,Hot):</strong><br> | |
| = P(No) × P(Rainy|No) × P(Hot|No)<br> | |
| = 0.333 × 0.2 × 0.4<br> | |
| = <strong>0.0266</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 6: Normalize to Get Final Probabilities</div> | |
| <div class="step-calculation"> | |
| <strong>Sum of probabilities:</strong><br> | |
| Sum = 0.0818 + 0.0266 = <strong>0.1084</strong><br> | |
| <br> | |
| <strong>Normalize:</strong><br> | |
| P(Yes|Rainy,Hot) = 0.0818 / 0.1084<br> | |
| = | |
| <strong style="color: #7ef0d4;">0.755 (75.5%)</strong><br> | |
| <br> | |
| P(No|Rainy,Hot) = 0.0266 / 0.1084<br> | |
| = | |
| <strong style="color: #ff8c6a;">0.245 (24.5%)</strong><br> | |
| <br> | |
| <div | |
| style="background: rgba(126, 240, 212, 0.2); padding: 16px; border-radius: 8px; margin-top: 12px;"> | |
| <strong style="color: #7ef0d4; font-size: 20px;">✅ FINAL PREDICTION: YES (Play | |
| Tennis!)</strong><br> | |
| <span style="color: #a9b4c2; font-size: 14px;">Confidence: 75.5%</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="categorical-nb-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Categorical Naive Bayes calculation | |
| visualization</p> | |
| </div> | |
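| <p>The whole walkthrough can be reproduced with scikit-learn's CategoricalNB, which applies exactly this Laplace smoothing with alpha=1. The integer encoding below (Sunny=0, Cloudy=1, Rainy=2; Hot=0, Mild=1, Cool=2) is an arbitrary choice for illustration:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.naive_bayes import CategoricalNB | |
| # Tennis data: columns are Outlook, Temperature (encoded as integers) | |
| X = np.array([[0, 0], [0, 1], [1, 0], [2, 1], [2, 2], [1, 2]]) | |
| y = np.array([0, 0, 1, 1, 1, 1])  # 0 = No, 1 = Yes | |
| clf = CategoricalNB(alpha=1.0)  # alpha=1 is the Laplace smoothing used above | |
| clf.fit(X, y) | |
| # Query: Outlook=Rainy (2), Temperature=Hot (0) | |
| print(clf.predict_proba([[2, 0]]))  # approx [[0.245, 0.755]] -> Play = Yes</pre> | |
| </div> | |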
| <h3>🎯 PART B: Gaussian Naive Bayes (Step-by-Step)</h3> | |
| <h4>Dataset: 2D Classification</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>ID</th> | |
| <th>X₁</th> | |
| <th>X₂</th> | |
| <th>Class</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>A</td> | |
| <td>1.0</td> | |
| <td>2.0</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>B</td> | |
| <td>2.0</td> | |
| <td>1.0</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>C</td> | |
| <td>1.5</td> | |
| <td>1.8</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>D</td> | |
| <td>3.0</td> | |
| <td>3.0</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>E</td> | |
| <td>3.5</td> | |
| <td>2.8</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>F</td> | |
| <td>2.9</td> | |
| <td>3.2</td> | |
| <td>No</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Problem:</strong> Classify test point [X₁=2.0, X₂=2.0]</p> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Calculate Mean and Variance for Each Class</div> | |
| <div class="step-calculation"> | |
| <strong>Class "Yes" (samples A, B, C):</strong><br> | |
| X₁ values: [1.0, 2.0, 1.5]<br> | |
| μ₁(Yes) = (1.0 + 2.0 + 1.5) / 3 = <strong>1.5</strong><br> | |
| σ₁²(Yes) = [(1-1.5)² + (2-1.5)² + (1.5-1.5)²] / 3<br> | |
| = [0.25 + 0.25 + 0] / 3<br> | |
| = <strong>0.166</strong><br> | |
| <br> | |
| X₂ values: [2.0, 1.0, 1.8]<br> | |
| μ₂(Yes) = (2.0 + 1.0 + 1.8) / 3 = <strong>1.6</strong><br> | |
| σ₂²(Yes) = [(2-1.6)² + (1-1.6)² + (1.8-1.6)²] / 3<br> | |
| = [0.16 + 0.36 + 0.04] / 3<br> | |
| = <strong>0.186</strong><br> | |
| <br> | |
| <strong>Class "No" (samples D, E, F):</strong><br> | |
| X₁ values: [3.0, 3.5, 2.9]<br> | |
| μ₁(No) = (3.0 + 3.5 + 2.9) / 3 = <strong>3.133</strong><br> | |
| σ₁²(No) = <strong>0.0688</strong><br> | |
| <br> | |
| X₂ values: [3.0, 2.8, 3.2]<br> | |
| μ₂(No) = (3.0 + 2.8 + 3.2) / 3 = <strong>3.0</strong><br> | |
| σ₂²(No) = <strong>0.0266</strong> | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Step 2: Gaussian Probability Density Function</strong><br> | |
| <br> | |
| P(x|μ,σ²) = (1/√(2πσ²)) × exp(-(x-μ)²/(2σ²))<br> | |
| <br> | |
| This gives us the probability density at point x given mean μ and variance σ² | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Calculate P(X₁=2.0 | Class) using Gaussian PDF</div> | |
| <div class="step-calculation"> | |
| <strong>For Class "Yes" (μ=1.5, σ²=0.166):</strong><br> | |
| P(2.0|Yes) = (1/√(2π × 0.166)) × exp(-(2.0-1.5)²/(2 × 0.166))<br> | |
| <br> | |
| Step-by-step:<br> | |
| • Normalization: 1/√(2π × 0.166) = 1/√1.043 = 1/1.021 = <strong>0.9772</strong><br> | |
| • Exponent: -(2.0-1.5)²/(2 × 0.166) = -(0.5)²/0.332 = -0.25/0.332 = | |
| <strong>-0.753</strong><br> | |
| • e^(-0.753) = <strong>0.471</strong><br> | |
| • Final: 0.9772 × 0.471 = <strong style="color: #7ef0d4;">0.460</strong><br> | |
| <br> | |
| <strong>For Class "No" (μ=3.133, σ²=0.0688):</strong><br> | |
| P(2.0|No) = (1/√(2π × 0.0688)) × exp(-(2.0-3.133)²/(2 × 0.0688))<br> | |
| <br> | |
| Step-by-step:<br> | |
| • Normalization: 1/√(2π × 0.0688) = <strong>1.523</strong><br> | |
| • Exponent: -(2.0-3.133)²/(2 × 0.0688) = -(-1.133)²/0.1376 = -1.283/0.1376 = | |
| <strong>-9.333</strong><br> | |
| • e^(-9.333) = <strong>0.000088</strong><br> | |
| • Final: 1.523 × 0.000088 = <strong style="color: #ff8c6a;">0.000134</strong><br> | |
| <br> | |
| <span style="color: #7ef0d4;">• Point (2.0, ?) is MUCH more likely to be "Yes"!</span> | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Step 4: Calculate P(X₂=2.0 | Class)</strong><br> | |
| <br> | |
| <strong>For "Yes":</strong><br> | |
| P(2.0|Yes) = (1/√(2π×0.186)) × exp(-(2.0-1.6)²/(2×0.186))<br> | |
| = 0.923 × exp(-0.430)<br> | |
| = 0.923 × 0.651<br> | |
| = 0.601<br> | |
| <br> | |
| <strong>For "No":</strong><br> | |
| P(2.0|No) = (1/√(2π×0.0266)) × exp(-(2.0-3.0)²/(2×0.0266))<br> | |
| = 2.449 × exp(-18.797)<br> | |
| = 2.449 × 0.0000000069<br> | |
| = 0.000000017 | |
| </div> | |
| <div class="formula"> | |
| <strong>Step 5: Combine with Prior (assume equal priors)</strong><br> | |
| <br> | |
| P(Yes) = P(No) = 0.5<br> | |
| <br> | |
| P(Yes|X) ∝ P(Yes) × P(X₁=2.0|Yes) × P(X₂=2.0|Yes)<br> | |
| = 0.5 × 0.460 × 0.601<br> | |
| = 0.138<br> | |
| <br> | |
| P(No|X) ∝ P(No) × P(X₁=2.0|No) × P(X₂=2.0|No)<br> | |
| = 0.5 × 0.000134 × 0.000000017<br> | |
| = 0.0000000000011 (≈ 1.1 × 10⁻¹²) | |
| </div> | |
| <div class="formula"> | |
| <strong>Step 6: Normalize</strong><br> | |
| <br> | |
| Sum = 0.138 + 0.0000000000011 ≈ 0.138<br> | |
| <br> | |
| P(Yes|X) = 0.138 / 0.138 ≈ 1.0 (essentially 100%)<br> | |
| P(No|X) ≈ 0.0 (essentially 0%)<br> | |
| <br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">Prediction: YES ✅</strong> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gaussian-nb-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Gaussian Naive Bayes with decision boundary | |
| </p> | |
| </div> | |
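| <p>A quick check of Part B with GaussianNB. Like the hand calculation, scikit-learn uses per-class means and population variances, so the numbers line up:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.naive_bayes import GaussianNB | |
| X = np.array([[1.0, 2.0], [2.0, 1.0], [1.5, 1.8],   # class Yes | |
|               [3.0, 3.0], [3.5, 2.8], [2.9, 3.2]])  # class No | |
| y = np.array(["Yes", "Yes", "Yes", "No", "No", "No"]) | |
| gnb = GaussianNB() | |
| gnb.fit(X, y) | |
| # Per-class feature means, rows ordered as gnb.classes_ = ['No', 'Yes'] | |
| print(gnb.theta_)  # approx [[3.133 3.0] [1.5 1.6]] | |
| print(gnb.predict([[2.0, 2.0]]))        # ['Yes'] | |
| print(gnb.predict_proba([[2.0, 2.0]]))  # 'Yes' with probability ~1.0</pre> | |
| </div> | |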
| <div class="callout success"> | |
| <div class="callout-title">✅ When to Use Naive Bayes</div> | |
| <div class="callout-content"> | |
| <strong>Categorical NB:</strong> Discrete features (text, categories)<br> | |
| <strong>Gaussian NB:</strong> Continuous features (measurements, coordinates)<br> | |
| <br> | |
| <strong>Perfect for:</strong><br> | |
| • Text classification (spam detection, sentiment analysis)<br> | |
| • Document categorization<br> | |
| • Real-time prediction (very fast)<br> | |
| • High-dimensional data<br> | |
| • Small training datasets<br> | |
| <br> | |
| <strong>Avoid when:</strong><br> | |
| • Features are highly correlated<br> | |
| • Need probability calibration<br> | |
| • Complex feature interactions matter | |
| </div> | |
| </div> | |
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.naive_bayes <span style="color: #ff8c6a;">import</span> GaussianNB, MultinomialNB | |
| <span style="color: #6aa9ff;"># For continuous features (e.g., measurements)</span> | |
| gnb = GaussianNB() | |
| gnb.fit(X_train, y_train) | |
| predictions = gnb.predict(X_test) | |
| <span style="color: #6aa9ff;"># For text/count data (e.g., TF-IDF features)</span> | |
| <span style="color: #ff8c6a;">from</span> sklearn.feature_extraction.text <span style="color: #ff8c6a;">import</span> CountVectorizer | |
| <span style="color: #6aa9ff;"># Convert text to word counts</span> | |
| vectorizer = CountVectorizer() | |
| X_train_counts = vectorizer.fit_transform(X_train_text) | |
| X_test_counts = vectorizer.transform(X_test_text) | |
| <span style="color: #6aa9ff;"># Train Multinomial NB (good for text)</span> | |
| mnb = MultinomialNB(alpha=<span style="color: #7ef0d4;">1.0</span>) <span style="color: #6aa9ff;"># Laplace smoothing</span> | |
| mnb.fit(X_train_counts, y_train) | |
| <span style="color: #6aa9ff;"># Predict & get probabilities</span> | |
| predictions = mnb.predict(X_test_counts) | |
| probabilities = mnb.predict_proba(X_test_counts)</pre> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 16: K-means Clustering --> | |
| <div class="section" id="kmeans"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔍 | |
| Unsupervised - Clustering</span> K-means Clustering</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>K-means is an unsupervised learning algorithm that groups data into K clusters. Each cluster has | |
| a centroid (center point), and points are assigned to the nearest centroid. Perfect for customer | |
| segmentation, image compression, and pattern discovery!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Unsupervised: No labels needed!</li> | |
| <li>K = number of clusters (you choose)</li> | |
| <li>Minimizes Within-Cluster Sum of Squares (WCSS)</li> | |
| <li>Iterative: Updates centroids until convergence</li> | |
| </ul> | |
| </div> | |
| <h3>🎯 Step-by-Step K-means Algorithm</h3> | |
| <h4>Dataset: 6 Points in 2D Space</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Point</th> | |
| <th>X</th> | |
| <th>Y</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>A</td> | |
| <td>1</td> | |
| <td>2</td> | |
| </tr> | |
| <tr> | |
| <td>B</td> | |
| <td>1.5</td> | |
| <td>1.8</td> | |
| </tr> | |
| <tr> | |
| <td>C</td> | |
| <td>5</td> | |
| <td>8</td> | |
| </tr> | |
| <tr> | |
| <td>D</td> | |
| <td>8</td> | |
| <td>8</td> | |
| </tr> | |
| <tr> | |
| <td>E</td> | |
| <td>1</td> | |
| <td>0.6</td> | |
| </tr> | |
| <tr> | |
| <td>F</td> | |
| <td>9</td> | |
| <td>11</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Goal:</strong> Group into K=2 clusters</p> | |
| <p><strong>Initial Centroids:</strong> c₁ = [3, 4], c₂ = [5, 1]</p> | |
| <div class="formula"> | |
| <strong>Distance Formula (Euclidean):</strong><br> | |
| d(point, centroid) = √[(x₁-x₂)² + (y₁-y₂)²] | |
| </div> | |
| <h4>Iteration 1</h4> | |
| <div class="formula"> | |
| <strong>Step 1: Calculate Distances to All Centroids</strong><br> | |
| <br> | |
| <strong>Point A (1, 2):</strong><br> | |
| d(A, c₁) = √[(1-3)² + (2-4)²] = √[4+4] = √8 = 2.83<br> | |
| d(A, c₂) = √[(1-5)² + (2-1)²] = √[16+1] = √17 = 4.12<br> | |
| → Assign to c₁ (closer)<br> | |
| <br> | |
| <strong>Point B (1.5, 1.8):</strong><br> | |
| d(B, c₁) = √[(1.5-3)² + (1.8-4)²] = √[2.25+4.84] = 2.66<br> | |
| d(B, c₂) = √[(1.5-5)² + (1.8-1)²] = √[12.25+0.64] = 3.59<br> | |
| → Assign to c₁<br> | |
| <br> | |
| <strong>Point C (5, 8):</strong><br> | |
| d(C, c₁) = √[(5-3)² + (8-4)²] = √[4+16] = 4.47<br> | |
| d(C, c₂) = √[(5-5)² + (8-1)²] = √[0+49] = 7.0<br> | |
| → Assign to c₁<br> | |
| <br> | |
| <strong>Point D (8, 8):</strong><br> | |
| d(D, c₁) = √[(8-3)² + (8-4)²] = √[25+16] = 6.40<br> | |
| d(D, c₂) = √[(8-5)² + (8-1)²] = √[9+49] = 7.62<br> | |
| → Assign to c₁<br> | |
| <br> | |
| <strong>Point E (1, 0.6):</strong><br> | |
| d(E, c₁) = √[(1-3)² + (0.6-4)²] = √[4+11.56] = 3.94<br> | |
| d(E, c₂) = √[(1-5)² + (0.6-1)²] = √[16+0.16] = 4.02<br> | |
| → Assign to c₁<br> | |
| <br> | |
| <strong>Point F (9, 11):</strong><br> | |
| d(F, c₁) = √[(9-3)² + (11-4)²] = √[36+49] = 9.22<br> | |
| d(F, c₂) = √[(9-5)² + (11-1)²] = √[16+100] = 10.77<br> | |
| → Assign to c₁<br> | |
| <br> | |
| <strong>Result:</strong> Cluster 1 = {A, B, C, D, E, F}, Cluster 2 = {} | |
| </div> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Poor Initial Centroids!</div> | |
| <div class="callout-content"> | |
| All points assigned to c₁! This happens with bad initialization. Let's try better initial | |
| centroids for the algorithm to work properly. | |
| </div> | |
| </div> | |
| <p><strong>Better Initial Centroids:</strong> c₁ = [1, 1], c₂ = [8, 9]</p> | |
| <div class="formula"> | |
| <strong>Iteration 1 (Revised):</strong><br> | |
| <br> | |
| Cluster 1: {A, B, E} → c₁_new = mean = [(1+1.5+1)/3, (2+1.8+0.6)/3] = [1.17, 1.47]<br> | |
| Cluster 2: {C, D, F} → c₂_new = mean = [(5+8+9)/3, (8+8+11)/3] = [7.33, 9.00]<br> | |
| <br> | |
| <strong>WCSS Calculation:</strong><br> | |
| WCSS₁ = d²(A,c₁) + d²(B,c₁) + d²(E,c₁)<br> | |
| = (1-1.17)²+(2-1.47)² + (1.5-1.17)²+(1.8-1.47)² + | |
| (1-1.17)²+(0.6-1.47)²<br> | |
| = 0.311 + 0.218 + 0.786 = 1.315<br> | |
| <br> | |
| WCSS₂ = d²(C,c₂) + d²(D,c₂) + d²(F,c₂)<br> | |
| = (5-7.33)²+(8-9)² + (8-7.33)²+(8-9)² + | |
| (9-7.33)²+(11-9)²<br> | |
| = 6.433 + 1.447 + 6.789 = 14.669<br> | |
| <br> | |
| <strong>Total WCSS = 1.315 + 14.669 = 15.984</strong> | |
| </div> | |
| <div class="formula"> | |
| <strong>Iteration 2:</strong><br> | |
| <br> | |
| Using c₁ = [1.17, 1.47] and c₂ = [7.33, 9.00], recalculate distances...<br> | |
| <br> | |
| Result: Same assignments! Centroids don't change.<br> | |
| <strong>✓ Converged!</strong> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="kmeans-viz-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> K-means clustering visualization with | |
| centroid movement</p> | |
| </div> | |
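| <p>The revised run reproduced with scikit-learn, passing the initial centroids explicitly (inertia_ is scikit-learn's name for the total WCSS):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]]) | |
| init_centroids = np.array([[1.0, 1.0], [8.0, 9.0]])  # the "better" starting points | |
| km = KMeans(n_clusters=2, init=init_centroids, n_init=1) | |
| km.fit(X) | |
| print(km.labels_)           # [0 0 1 1 0 1] -> {A, B, E} and {C, D, F} | |
| print(km.cluster_centers_)  # [[1.17 1.47] [7.33 9.]] | |
| print(km.inertia_)          # ~15.98, the total WCSS computed above</pre> | |
| </div> | |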
| <h3>Finding Optimal K: The Elbow Method</h3> | |
| <p>How do we choose K? Try different values and plot WCSS!</p> | |
| <div class="formula"> | |
| <strong>WCSS for Different K Values:</strong><br> | |
| <br> | |
| K=1: WCSS = 50.0 (all in one cluster)<br> | |
| K=2: WCSS = 18.0<br> | |
| K=3: WCSS = 10.0 ← Elbow point!<br> | |
| K=4: WCSS = 8.0<br> | |
| K=5: WCSS = 7.0<br> | |
| <br> | |
| <strong>Rule:</strong> Choose K at the "elbow" where WCSS stops decreasing rapidly | |
| </div> | |
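| <p>A sketch of the elbow loop on the six points above (the WCSS values listed in the box are illustrative and come from a different dataset, so the numbers printed here will differ):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]]) | |
| for k in range(1, 6): | |
|     wcss = KMeans(n_clusters=k, n_init=10).fit(X).inertia_ | |
|     print(f"K={k}: WCSS = {wcss:.2f}") | |
| # Plot WCSS against K and pick the K where the curve bends</pre> | |
| </div> | |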
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="kmeans-elbow-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Elbow method - optimal K is where the curve | |
| bends</p> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 K-means Tips</div> | |
| <div class="callout-content"> | |
| <strong>Advantages:</strong><br> | |
| ✓ Simple and fast<br> | |
| ✓ Works well with spherical clusters<br> | |
| ✓ Scales to large datasets<br> | |
| <br> | |
| <strong>Disadvantages:</strong><br> | |
| ✗ Need to specify K in advance<br> | |
| ✗ Sensitive to initial centroids (use K-means++!)<br> | |
| ✗ Assumes spherical clusters<br> | |
| ✗ Sensitive to outliers<br> | |
| <br> | |
| <strong>Solutions:</strong><br> | |
| • Use elbow method for K<br> | |
| • Use K-means++ initialization<br> | |
| • Run multiple times with different initializations | |
| </div> | |
| </div> | |
| <h3>Real-World Applications</h3> | |
| <ul> | |
| <li><strong>Customer Segmentation:</strong> Group customers by behavior</li> | |
| <li><strong>Image Compression:</strong> Reduce colors in images</li> | |
| <li><strong>Document Clustering:</strong> Group similar articles</li> | |
| <li><strong>Anomaly Detection:</strong> Points far from centroids are outliers</li> | |
| <li><strong>Feature Learning:</strong> Learn representations for neural networks</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- Section 17: Decision Tree Regression --> | |
| <div class="section" id="decision-tree-regression"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Regression</span> Decision Tree Regression</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Decision Tree Regression predicts continuous values by recursively splitting data to minimize | |
| variance. Unlike classification trees that use entropy, regression trees use variance reduction! | |
| </p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Splits based on variance reduction (not entropy)</li> | |
| <li>Leaf nodes predict mean of samples</li> | |
| <li>Test all split points to find best</li> | |
| <li>Recursive partitioning until stopping criteria</li> | |
| </ul> | |
| </div> | |
| <h3>🎯 Complete Mathematical Solution (Step-by-Step)</h3> | |
| <h4>Dataset: House Price Prediction</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>ID</th> | |
| <th>Square Feet</th> | |
| <th>Price (Lakhs)</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>1</td> | |
| <td>800</td> | |
| <td>50</td> | |
| </tr> | |
| <tr> | |
| <td>2</td> | |
| <td>850</td> | |
| <td>52</td> | |
| </tr> | |
| <tr> | |
| <td>3</td> | |
| <td>900</td> | |
| <td>54</td> | |
| </tr> | |
| <tr> | |
| <td>4</td> | |
| <td>1500</td> | |
| <td>90</td> | |
| </tr> | |
| <tr> | |
| <td>5</td> | |
| <td>1600</td> | |
| <td>95</td> | |
| </tr> | |
| <tr> | |
| <td>6</td> | |
| <td>1700</td> | |
| <td>100</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Calculate Parent Variance</div> | |
| <div class="step-calculation"> | |
| Mean price = (50 + 52 + 54 + 90 + 95 + 100) / 6<br> | |
| = 441 / 6<br> | |
| = <strong style="color: #7ef0d4;">73.5 Lakhs</strong><br> | |
| <br> | |
| Variance = Σ(yᵢ - mean)² / n<br> | |
| <br> | |
| Calculating each term:<br> | |
| • (50 - 73.5)² = (-23.5)² = 552.25<br> | |
| • (52 - 73.5)² = (-21.5)² = 462.25<br> | |
| • (54 - 73.5)² = (-19.5)² = 380.25<br> | |
| • (90 - 73.5)² = (16.5)² = 272.25<br> | |
| • (95 - 73.5)² = (21.5)² = 462.25<br> | |
| • (100 - 73.5)² = (26.5)² = 702.25<br> | |
| <br> | |
| Sum = 552.25 + 462.25 + 380.25 + 272.25 + 462.25 + 702.25 = 2831.5<br> | |
| Variance = 2831.5 / 6 = <strong style="color: #7ef0d4;">471.92</strong><br> | |
| <br> | |
| <strong style="color: #6aa9ff;">✓ Parent Variance = 471.92</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Test Split Points</div> | |
| <div class="step-calculation"> | |
| Sort by Square Feet: 800, 850, 900, 1500, 1600, 1700<br> | |
| Possible midpoints: 825, 875, 1200, 1550, 1650<br> | |
| <br> | |
| <strong style="color: #6aa9ff;">Testing Split at 1200:</strong><br> | |
| <br> | |
| <strong>LEFT (Square Feet &le; 1200):</strong><br> | |
| Samples: 800(50), 850(52), 900(54)<br> | |
| Left Mean = (50 + 52 + 54) / 3 = 156 / 3 = <strong>52</strong><br> | |
| Left Variance:<br> | |
| • (50 - 52)² = 4<br> | |
| • (52 - 52)² = 0<br> | |
| • (54 - 52)² = 4<br> | |
| Sum = 8, Variance = 8 / 3 = <strong>2.67</strong><br> | |
| <br> | |
| <strong>RIGHT (Square Feet &gt; 1200):</strong><br> | |
| Samples: 1500(90), 1600(95), 1700(100)<br> | |
| Right Mean = (90 + 95 + 100) / 3 = 285 / 3 = <strong>95</strong><br> | |
| Right Variance:<br> | |
| • (90 - 95)² = 25<br> | |
| • (95 - 95)² = 0<br> | |
| • (100 - 95)² = 25<br> | |
| Sum = 50, Variance = 50 / 3 = <strong>16.67</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Calculate Weighted Variance After Split</div> | |
| <div class="step-calculation"> | |
| Weighted Variance = (n_left/n_total) × Var_left + (n_right/n_total) × Var_right<br> | |
| = (3/6) × 2.67 + (3/6) × 16.67<br> | |
| = 0.5 × 2.67 + 0.5 × 16.67<br> | |
| = 1.335 + 8.335<br> | |
| = <strong style="color: #6aa9ff;">9.67</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Calculate Variance Reduction</div> | |
| <div class="step-calculation"> | |
| Variance Reduction = Parent Variance - Weighted Variance After Split<br> | |
| = 471.92 - 9.67<br> | |
| = <strong style="color: #7ef0d4; font-size: 18px;">462.25</strong><br> | |
| <br> | |
| <strong style="color: #7ef0d4;">✓ This is the BEST SPLIT!</strong><br> | |
| Splitting at 1200 sq ft reduces variance by 462.25 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 5: Build Final Tree Structure</div> | |
| <div class="step-calculation"> | |
| Final Decision Tree:<br> | |
| <pre style="margin: 0; color: #e8eef6;"> | |
|         [All data, Mean=73.5, Var=471.92] | |
|                      │ | |
|         Split at Square Feet = 1200 | |
|             /                \ | |
|        &le; 1200             &gt; 1200 | |
|           /                    \ | |
|     Mean = 52           Split at 1550 | |
|    (3 samples)            /        \ | |
|                      &le; 1550     &gt; 1550 | |
|                        /             \ | |
|                   Mean = 90      Mean = 97.5 | |
|                  (1 sample)      (2 samples) | |
| </pre> | |
| <br> | |
| <strong style="color: #7ef0d4;">Prediction Examples:</strong><br> | |
| New property: 950 sq ft<br> | |
| ├─ 950 &le; 1200? YES → Go LEFT<br> | |
| └─ Prediction: <strong style="color: #7ef0d4; font-size: 18px;">₹52 Lakhs</strong><br> | |
| <br> | |
| New property: 1650 sq ft<br> | |
| ├─ 1650 &le; 1200? NO → Go RIGHT<br> | |
| ├─ 1650 &le; 1550? NO → Go RIGHT<br> | |
| └─ Prediction: <strong style="color: #7ef0d4; font-size: 18px;">₹97.5 Lakhs</strong> | |
| </div> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="dt-regression-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Decision tree regression with splits and | |
| predictions</p> | |
| </div> | |
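| <p>The same tree grown with scikit-learn. Its default squared_error criterion is exactly this variance-reduction rule; max_leaf_nodes=3 (an assumption chosen to match the worked example) stops growth at the three leaves shown above:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.tree import DecisionTreeRegressor, export_text | |
| X = np.array([[800], [850], [900], [1500], [1600], [1700]]) | |
| y = np.array([50, 52, 54, 90, 95, 100]) | |
| tree = DecisionTreeRegressor(max_leaf_nodes=3) | |
| tree.fit(X, y) | |
| print(export_text(tree, feature_names=["sqft"]))  # splits at 1200, then 1550 | |
| print(tree.predict([[950]]))   # [52.] | |
| print(tree.predict([[1650]]))  # [97.5]</pre> | |
| </div> | |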
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaway</div> | |
| <div class="callout-content"> | |
| Decision Tree Regression finds splits that minimize variance in leaf nodes. Each leaf | |
| predicts the mean of samples in that region. The recursive splitting creates a piecewise | |
| constant function! | |
| </div> | |
| </div> | |
| <h3>Variance Reduction vs Information Gain</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Aspect</th> | |
| <th>Classification Trees</th> | |
| <th>Regression Trees</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Splitting Criterion</td> | |
| <td>Information Gain (Entropy/Gini)</td> | |
| <td>Variance Reduction</td> | |
| </tr> | |
| <tr> | |
| <td>Prediction</td> | |
| <td>Majority class</td> | |
| <td>Mean value</td> | |
| </tr> | |
| <tr> | |
| <td>Leaf Node</td> | |
| <td>Class label</td> | |
| <td>Continuous value</td> | |
| </tr> | |
| <tr> | |
| <td>Goal</td> | |
| <td>Maximize purity</td> | |
| <td>Minimize variance</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="dt-splits-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Comparing different split points and their | |
| variance reduction</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 18: Decision Trees --> | |
| <div class="section" id="decision-trees"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 | |
| Supervised</span> Decision Trees</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Decision Trees make decisions by asking yes/no questions recursively. They're interpretable, | |
| powerful, and the foundation for ensemble methods like Random Forests!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Recursive partitioning of feature space</li> | |
| <li>Each node asks a yes/no question</li> | |
| <li>Leaves contain predictions</li> | |
| <li>Uses Information Gain or Gini Impurity for splitting</li> | |
| </ul> | |
| </div> | |
| <h3>How Decision Trees Work</h3> | |
| <p>Imagine you're playing "20 Questions" to guess an animal. Each question splits possibilities into | |
| two groups. Decision Trees work the same way!</p> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="decision-tree-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Interactive decision tree structure</p> | |
| </div> | |
| <h3>Splitting Criteria</h3> | |
| <p>How do we choose which question to ask at each node? We want splits that maximize information | |
| gain!</p> | |
| <h4>1. Entropy (Information Theory)</h4> | |
| <div class="formula"> | |
| <strong>Entropy Formula:</strong> | |
| H(S) = -Σ pᵢ × log₂(pᵢ)<br> | |
| <br> | |
| where pᵢ = proportion of class i<br> | |
| <br> | |
| <strong>Interpretation:</strong><br> | |
| • Entropy = 0: Pure (all same class)<br> | |
| • Entropy = 1: Maximum disorder (50-50 split)<br> | |
| • Lower entropy = better! | |
| </div> | |
| <h4>2. Information Gain</h4> | |
| <div class="formula"> | |
| <strong>Information Gain Formula:</strong> | |
| IG(S, A) = H(S) - Σ |Sᵥ|/|S| × H(Sᵥ)<br> | |
| <br> | |
| = Entropy before split - Weighted entropy after split<br> | |
| <br> | |
| <strong>We choose the split with HIGHEST information gain!</strong> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="entropy-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Entropy and Information Gain visualization | |
| </p> | |
| </div> | |
| <h4>3. Gini Impurity (Alternative)</h4> | |
| <div class="formula"> | |
| <strong>Gini Formula:</strong> | |
| Gini(S) = 1 - Σ pᵢ²<br> | |
| <br> | |
| <strong>Interpretation:</strong><br> | |
| • Gini = 0: Pure<br> | |
| • Gini = 0.5: Maximum impurity (binary)<br> | |
| • Faster to compute than entropy | |
| </div> | |
| <h3>Worked Example: Email Classification</h3> | |
| <p>Dataset: 10 emails - 7 spam, 3 not spam</p> | |
| <div class="callout info"> | |
| <div class="callout-title">📊 Calculating Information Gain</div> | |
| <div class="callout-content"> | |
| <strong>Initial Entropy:</strong><br> | |
| H(S) = -7/10×log₂(7/10) - 3/10×log₂(3/10)<br> | |
| H(S) = 0.881 bits<br> | |
| <br> | |
| <strong>Split by "Contains 'FREE'":</strong><br> | |
| • Left (5 emails): 4 spam, 1 not → H = 0.722<br> | |
| • Right (5 emails): 3 spam, 2 not → H = 0.971<br> | |
| <br> | |
| <strong>Weighted Entropy:</strong><br> | |
| = 5/10 × 0.722 + 5/10 × 0.971 = 0.847<br> | |
| <br> | |
| <strong>Information Gain:</strong><br> | |
| IG = 0.881 - 0.847 = 0.034 bits<br> | |
| <br> | |
| <strong>Split by "Has suspicious link":</strong><br> | |
| IG = 0.156 bits ← BETTER! Use this split! | |
| </div> | |
| </div> | |
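| <p>The entropy and information-gain arithmetic from the box above, as a small sketch:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| def entropy(counts): | |
|     # H = -sum(p * log2(p)) over the class proportions | |
|     p = np.array(counts) / sum(counts) | |
|     p = p[p > 0]  # classes with zero probability contribute no entropy | |
|     return -np.sum(p * np.log2(p)) | |
| parent = entropy([7, 3])                        # 0.881 bits | |
| left, right = entropy([4, 1]), entropy([3, 2])  # 0.722 and 0.971 | |
| weighted = 5/10 * left + 5/10 * right           # 0.847 | |
| print(f"Information gain: {parent - weighted:.3f} bits")  # 0.034</pre> | |
| </div> | |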
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="split-comparison" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 3:</strong> Comparing different splits by information | |
| gain</p> | |
| </div> | |
| <h3>Decision Boundaries</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="tree-boundary" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 4:</strong> Decision tree creates rectangular regions | |
| </p> | |
| </div> | |
| <h3>Overfitting in Decision Trees</h3> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ The Overfitting Problem</div> | |
| <div class="callout-content"> | |
| Without constraints, decision trees grow until each leaf has ONE sample!<br> | |
| <br> | |
| <strong>Solutions:</strong><br> | |
| • <strong>Max depth:</strong> Limit tree height (e.g., max_depth=5)<br> | |
| • <strong>Min samples split:</strong> Need X samples to split (e.g., min=10)<br> | |
| • <strong>Min samples leaf:</strong> Each leaf must have X samples<br> | |
| • <strong>Pruning:</strong> Grow full tree, then remove branches | |
| </div> | |
| </div> | |
| <h3>Advantages vs Disadvantages</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Advantages ✅</th> | |
| <th>Disadvantages ❌</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Easy to understand and interpret</td> | |
| <td>Prone to overfitting</td> | |
| </tr> | |
| <tr> | |
| <td>No feature scaling needed</td> | |
| <td>Small changes → big tree changes</td> | |
| </tr> | |
| <tr> | |
| <td>Handles non-linear relationships</td> | |
| <td>Biased toward features with more levels</td> | |
| </tr> | |
| <tr> | |
| <td>Works with mixed data types</td> | |
| <td>Can't extrapolate beyond training data</td> | |
| </tr> | |
| <tr> | |
| <td>Fast prediction</td> | |
| <td>Less accurate than ensemble methods</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(255, 235, 59, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #ffeb3b; margin-top: 32px;"> | |
| <h3 style="color: #ffeb3b; margin-bottom: 20px;">📐 Complete Mathematical Derivation: Decision | |
| Tree Splitting</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's calculate Entropy, Information Gain, and | |
| Gini step-by-step!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Problem: Should we play tennis today?</strong><br><br> | |
| <strong>Training Data (14 days):</strong><br> | |
| • 9 days we played tennis (Yes)<br> | |
| • 5 days we didn't play (No)<br><br> | |
| <strong>Features:</strong> Weather (Sunny/Overcast/Rain), Wind (Weak/Strong) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Calculate Root Entropy H(S)</strong><br><br> | |
| <strong>Entropy Formula:</strong> H(S) = -Σ pᵢ × log₂(pᵢ)<br><br> | |
| p(Yes) = 9/14 = 0.643<br> | |
| p(No) = 5/14 = 0.357<br><br> | |
| H(S) = -[p(Yes) × log₂(p(Yes)) + p(No) × log₂(p(No))]<br> | |
| H(S) = -[0.643 × log₂(0.643) + 0.357 × log₂(0.357)]<br> | |
| H(S) = -[0.643 × (-0.637) + 0.357 × (-1.486)]<br> | |
| H(S) = -[-0.410 + (-0.531)]<br> | |
| H(S) = -[-0.940]<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">H(S) = 0.940 bits (before any | |
| split)</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Calculate Entropy After Splitting by | |
| "Wind"</strong><br><br> | |
| <strong>Split counts:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">Wind</th> | |
| <th style="padding: 10px;">Yes</th> | |
| <th style="padding: 10px;">No</th> | |
| <th style="padding: 10px;">Total</th> | |
| <th style="padding: 10px;">Entropy Calculation</th> | |
| <th style="padding: 10px;">H(subset)</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;"><strong>Weak</strong></td> | |
| <td style="text-align: center;">6</td> | |
| <td style="text-align: center;">2</td> | |
| <td style="text-align: center;">8</td> | |
| <td style="text-align: center;">-[6/8×log₂(6/8) + 2/8×log₂(2/8)]</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.811</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;"><strong>Strong</strong></td> | |
| <td style="text-align: center;">3</td> | |
| <td style="text-align: center;">3</td> | |
| <td style="text-align: center;">6</td> | |
| <td style="text-align: center;">-[3/6×log₂(3/6) + 3/6×log₂(3/6)]</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>1.000</strong></td> | |
| </tr> | |
| </table> | |
| <strong>Weighted Average Entropy:</strong><br> | |
| H(S|Wind) = (8/14) × 0.811 + (6/14) × 1.000<br> | |
| H(S|Wind) = 0.463 + 0.429<br> | |
| <strong style="color: #7ef0d4;">H(S|Wind) = 0.892</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Calculate Information Gain</strong><br><br> | |
| <strong>Formula:</strong> IG(S, Feature) = H(S) - H(S|Feature)<br><br> | |
| IG(S, Wind) = 0.940 - 0.892<br> | |
| <strong style="color: #7ef0d4; font-size: 18px;">IG(S, Wind) = 0.048 bits</strong><br><br> | |
| <em style="color: #a9b4c2;">This means splitting by Wind reduces uncertainty by 0.048 | |
| bits</em> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Compare with Other Features</strong><br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 10px;">Feature</th> | |
| <th style="padding: 10px;">H(S|Feature)</th> | |
| <th style="padding: 10px;">Information Gain</th> | |
| <th style="padding: 10px;">Decision</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 8px; text-align: center;">Weather</td> | |
| <td style="text-align: center;">0.693</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.247</strong></td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>✓ BEST!</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 8px; text-align: center;">Wind</td> | |
| <td style="text-align: center;">0.892</td> | |
| <td style="text-align: center;">0.048</td> | |
| <td style="text-align: center;"></td> | |
| </tr> | |
| </table> | |
| <strong style="color: #7ef0d4; font-size: 16px;">→ Split by "Weather" first (highest | |
| information gain!)</strong> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 5: Gini Impurity Alternative</strong><br><br> | |
| <strong>Gini Formula:</strong> Gini(S) = 1 - Σ pᵢ²<br><br> | |
| <strong>For root node:</strong><br> | |
| Gini(S) = 1 - [(9/14)² + (5/14)²]<br> | |
| Gini(S) = 1 - [0.413 + 0.128]<br> | |
| Gini(S) = 1 - 0.541<br> | |
| <strong style="color: #7ef0d4;">Gini(S) = 0.459</strong><br><br> | |
| <strong>Interpretation:</strong><br> | |
| • Gini = 0: Pure node (all same class)<br> | |
| • Gini = 0.5: Maximum impurity (50-50 split)<br> | |
| • Our 0.459 indicates moderate impurity | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ Summary: Decision Tree Math</div> | |
| <div class="callout-content"> | |
| <strong>The algorithm at each node:</strong><br> | |
| 1. Calculate parent entropy/Gini<br> | |
| 2. For each feature:<br> | |
| • Split data by feature values<br> | |
| • Calculate weighted child entropy/Gini<br> | |
| • Compute Information Gain = Parent - Weighted Children<br> | |
| 3. Choose feature with <strong style="color: #7ef0d4;">HIGHEST Information | |
| Gain</strong><br> | |
| 4. Repeat recursively until stopping criteria met! | |
| </div> | |
| </div> | |
| </div> | |
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.tree <span style="color: #ff8c6a;">import</span> DecisionTreeClassifier | |
| <span style="color: #ff8c6a;">from</span> sklearn <span style="color: #ff8c6a;">import</span> tree | |
| <span style="color: #ff8c6a;">import</span> matplotlib.pyplot <span style="color: #ff8c6a;">as</span> plt | |
| <span style="color: #6aa9ff;"># Create Decision Tree</span> | |
| dt = DecisionTreeClassifier( | |
| criterion=<span style="color: #7ef0d4;">'gini'</span>, <span style="color: #6aa9ff;"># 'gini' or 'entropy'</span> | |
| max_depth=<span style="color: #7ef0d4;">5</span>, <span style="color: #6aa9ff;"># Limit depth (prevent overfitting)</span> | |
| min_samples_split=<span style="color: #7ef0d4;">2</span>, <span style="color: #6aa9ff;"># Min samples to split</span> | |
| min_samples_leaf=<span style="color: #7ef0d4;">1</span> <span style="color: #6aa9ff;"># Min samples in leaf</span> | |
| ) | |
| <span style="color: #6aa9ff;"># Train</span> | |
| dt.fit(X_train, y_train) | |
| <span style="color: #6aa9ff;"># Predict</span> | |
| predictions = dt.predict(X_test) | |
| <span style="color: #6aa9ff;"># Visualize the tree</span> | |
| plt.figure(figsize=(<span style="color: #7ef0d4;">20</span>, <span style="color: #7ef0d4;">10</span>)) | |
| tree.plot_tree(dt, filled=<span style="color: #7ef0d4;">True</span>, feature_names=feature_names) | |
| plt.show() | |
| <span style="color: #6aa9ff;"># Feature importance</span> | |
| <span style="color: #ff8c6a;">print</span>(dict(<span style="color: #ff8c6a;">zip</span>(feature_names, dt.feature_importances_)))</pre> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- REINFORCEMENT LEARNING SECTIONS --> | |
| <!-- Section: RL Introduction --> | |
| <div class="section" id="rl-intro"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🎮 | |
| Reinforcement</span> Introduction to Reinforcement Learning</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Reinforcement Learning (RL) is learning by trial and error, just like teaching a dog tricks! The | |
| agent takes actions in an environment, receives rewards or punishments, and learns which actions | |
| lead to the best outcomes.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Agent: The learner/decision maker</li> | |
| <li>Environment: The world the agent interacts with</li> | |
| <li>State: Current situation of the agent</li> | |
| <li>Action: What the agent can do</li> | |
| <li>Reward: Feedback signal (positive or negative)</li> | |
| <li>Policy: Strategy the agent follows</li> | |
| </ul> | |
| </div> | |
| <h3>The RL Loop</h3> | |
| <ol> | |
| <li><strong>Observe state:</strong> Agent sees current situation</li> | |
| <li><strong>Choose action:</strong> Based on policy π(s)</li> | |
| <li><strong>Execute action:</strong> Interact with environment</li> | |
| <li><strong>Receive reward:</strong> Get feedback r</li> | |
| <li><strong>Transition to new state:</strong> Environment changes to s'</li> | |
| <li><strong>Learn and update:</strong> Improve policy</li> | |
| </ol> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Key Difference from Supervised Learning</div> | |
| <div class="callout-content"> | |
| <strong>Supervised:</strong> "Here's the right answer for each example"<br> | |
| <strong>Reinforcement:</strong> "Try things and I'll tell you if you did well or poorly"<br> | |
| <br> | |
| RL must explore to discover good actions, while supervised learning is given correct answers | |
| upfront! | |
| </div> | |
| </div> | |
| <h3>Real-World Examples</h3> | |
| <ul> | |
| <li><strong>Game Playing:</strong> AlphaGo learning to play Go by playing millions of games</li> | |
| <li><strong>Robotics:</strong> Robot learning to walk by trying different leg movements</li> | |
| <li><strong>Self-Driving Cars:</strong> Learning to drive safely through experience</li> | |
| <li><strong>Recommendation Systems:</strong> Learning what users like from their interactions | |
| </li> | |
| <li><strong>Resource Management:</strong> Optimizing data center cooling to save energy</li> | |
| </ul> | |
| <h3>Exploration vs Exploitation</h3> | |
| <p>The fundamental dilemma in RL:</p> | |
| <ul> | |
| <li><strong>Exploration:</strong> Try new actions to discover better rewards</li> | |
| <li><strong>Exploitation:</strong> Use known good actions to maximize reward</li> | |
| </ul> | |
| <p>Balance is key! Too much exploration wastes time on bad actions. Too much exploitation misses | |
| better strategies.</p> | |
<div class="formula">
<strong>Reward Signal:</strong>
Total Return = G = r₁ + γr₂ + γ²r₃ + ... = Σₜ γᵗ rₜ₊₁
<br><small>where:<br>γ = discount factor (0 ≤ γ ≤ 1)<br>Future rewards are worth less than
immediate rewards</small>
</div>
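<p>As a quick sanity check, the discounted return can be computed backwards with the recursion Gₜ = rₜ₊₁ + γ·Gₜ₊₁. A minimal Python sketch (the reward list and γ are illustrative):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
def discounted_return(rewards, gamma=0.9):
    g = 0.0
    for r in reversed(rewards):   # G_t = r_{t+1} + γ·G_{t+1}
        g = r + gamma * g
    return g

print(discounted_return([1, 1, 1, 10]))  # 1 + 0.9·1 + 0.81·1 + 0.729·10 = 10.0
</pre>
</div>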
| </div> | |
| </div> | |
| <!-- Section: Q-Learning --> | |
| <div class="section" id="q-learning"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🎮 | |
| Reinforcement</span> Q-Learning</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Q-Learning is a value-based RL algorithm that learns the quality (Q-value) of taking each action | |
| in each state. It's model-free and can learn optimal policies even without knowing how the | |
| environment works!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Q-value: Expected future reward for action a in state s</li> | |
| <li>Q-table: Stores Q-values for all state-action pairs</li> | |
| <li>Off-policy: Can learn optimal policy while following exploratory policy</li> | |
| <li>Temporal Difference: Learn from each step, not just end of episode</li> | |
| </ul> | |
| </div> | |
| <div class="formula"> | |
| <strong>Q-Learning Update Rule:</strong> | |
| Q(s, a) ← Q(s, a) + α[r + γ · max Q(s', a') - Q(s, a)] | |
| <br><br> | |
| Breaking it down:<br> | |
| Q(s, a) = Current Q-value estimate<br> | |
| α = Learning rate (e.g., 0.1)<br> | |
| r = Immediate reward received<br> | |
| γ = Discount factor (e.g., 0.9)<br> | |
| max Q(s', a') = Best Q-value in next state<br> | |
| [r + γ · max Q(s', a') - Q(s, a)] = TD error (how wrong we were) | |
| </div> | |
| <h3>Step-by-Step Example: Grid World Navigation</h3> | |
| <p><strong>Problem:</strong> Agent navigates 3x3 grid to reach goal at (2,2)</p> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Initialize Q-Table</div> | |
| <div class="step-calculation"> | |
| States: 9 positions (0,0) to (2,2)<br> | |
| Actions: 4 directions (Up, Down, Left, Right)<br> | |
| <br> | |
| Q-table: 9 × 4 = 36 values, all initialized to 0<br> | |
| <br> | |
| Example entry: Q((1,1), Right) = 0.0 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Episode 1 - Random Exploration</div> | |
| <div class="step-calculation"> | |
| Start: s = (0,0)<br> | |
| <br> | |
| <strong>Step 1:</strong> Choose action a = Right (ε-greedy)<br> | |
| Execute: Move to s' = (0,1)<br> | |
| Reward: r = -1 (penalty for each step)<br> | |
| <br> | |
| Update Q((0,0), Right):<br> | |
| Q = 0 + 0.1[-1 + 0.9 × max(0, 0, 0, 0) - 0]<br> | |
| Q = 0 + 0.1[-1]<br> | |
| Q((0,0), Right) = <strong>-0.1</strong> ✓<br> | |
| <br> | |
| <strong>Step 2:</strong> s = (0,1), action = Down<br> | |
| s' = (1,1), r = -1<br> | |
| Q((0,1), Down) = 0 + 0.1[-1 + 0] = <strong>-0.1</strong><br> | |
| <br> | |
| <strong>Step 3:</strong> s = (1,1), action = Right<br> | |
| s' = (1,2), r = -1<br> | |
| Q((1,1), Right) = <strong>-0.1</strong><br> | |
| <br> | |
| <strong>Step 4:</strong> s = (1,2), action = Down<br> | |
| s' = (2,2) ← <span style="color: #7ef0d4;">GOAL!</span><br> | |
| r = +100 (big reward!)<br> | |
| <br> | |
| Q((1,2), Down) = 0 + 0.1[100 + 0]<br> | |
| Q((1,2), Down) = <strong style="color: #7ef0d4;">10.0</strong> ✓✓✓ | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Episode 2 - Learning Propagates Backward</div> | |
| <div class="step-calculation"> | |
| Path: (0,0) → (0,1) → (1,1) → (1,2) → (2,2)<br> | |
| <br> | |
| At (1,1), choosing Right:<br> | |
| Q((1,1), Right) = -0.1 + 0.1[-1 + 0.9 × 10.0 - (-0.1)]<br> | |
| = -0.1 + 0.1[-1 + 9.0 + 0.1]<br> | |
| = -0.1 + 0.1[8.1]<br> | |
| = -0.1 + 0.81<br> | |
| Q((1,1), Right) = <strong style="color: #7ef0d4;">0.71</strong> ✓<br> | |
| <br> | |
| <span style="color: #7ef0d4;">→ The value of being near the goal propagates backward!</span> | |
| </div> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ After Many Episodes</div> | |
| <div class="callout-content"> | |
The Q-table converges to the optimal values (γ = 0.9, -1 per step, +100 at goal):<br>
<br>
Q((1,2), Down) → 100 (goal reached immediately)<br>
Q((1,1), Right) → -1 + 0.9 × 100 = 89<br>
Q((0,0), Right) → ≈ 70.2 (two more discounted steps back)<br>
<br>
<strong>Optimal Policy:</strong> Always move toward (2,2) via the shortest path!<br>
The agent has learned to navigate perfectly through trial and error.
| </div> | |
| </div> | |
| <h3>ε-Greedy Policy</h3> | |
| <div class="formula"> | |
| <strong>Action Selection:</strong><br> | |
| With probability ε: Choose random action (explore)<br> | |
| With probability 1-ε: Choose argmax Q(s,a) (exploit)<br> | |
| <br> | |
| Common: Start ε=1.0, decay to ε=0.01 over time | |
| </div> | |
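<p>The whole algorithm fits in a few lines. Here is a minimal tabular Q-learning sketch for the 3×3 grid world above, assuming -1 per step, +100 on reaching (2,2), and a fixed ε = 0.1 (illustrative choices):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import random

ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]   # Up, Down, Left, Right
Q = {((r, c), a): 0.0 for r in range(3) for c in range(3) for a in range(4)}
alpha, gamma, eps = 0.1, 0.9, 0.1

def step(s, a):
    dr, dc = ACTIONS[a]
    s2 = (min(max(s[0] + dr, 0), 2), min(max(s[1] + dc, 0), 2))  # clamp to grid
    return s2, (100 if s2 == (2, 2) else -1)

for episode in range(500):
    s = (0, 0)
    while s != (2, 2):
        if random.random() &lt; eps:                        # explore
            a = random.randrange(4)
        else:                                             # exploit
            a = max(range(4), key=lambda a_: Q[(s, a_)])
        s2, r = step(s, a)
        best_next = 0.0 if s2 == (2, 2) else max(Q[(s2, a_)] for a_ in range(4))
        Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])  # TD update
        s = s2

print(round(Q[((1, 2), 1)], 1))   # Q((1,2), Down) → approaches 100
</pre>
</div>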
| <h3>Advantages</h3> | |
| <ul> | |
| <li>✓ Simple to implement</li> | |
<li>✓ Converges to the optimal policy (given sufficient exploration and a decaying learning rate)</li>
| <li>✓ Model-free (doesn't need environment model)</li> | |
| <li>✓ Off-policy (learn from exploratory behavior)</li> | |
| </ul> | |
| <h3>Disadvantages</h3> | |
| <ul> | |
| <li>✗ Doesn't scale to large/continuous state spaces</li> | |
| <li>✗ Slow convergence in complex environments</li> | |
| <li>✗ Requires discrete actions</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- Section: Policy Gradient --> | |
| <div class="section" id="policy-gradient"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🎮 | |
| Reinforcement</span> Policy Gradient Methods</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Policy Gradient methods directly optimize the policy (action selection strategy) instead of | |
| learning value functions. They're powerful for continuous action spaces and stochastic policies! | |
| </p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
<li>Direct policy optimization: Learn πθ(a|s) directly</li>
| <li>Parameterized policy: Use neural network with weights θ</li> | |
| <li>Gradient ascent: Move parameters to maximize expected reward</li> | |
| <li>Works with continuous actions: Can output action distributions</li> | |
| </ul> | |
| </div> | |
| <h3>Policy vs Value-Based Methods</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Aspect</th> | |
| <th>Value-Based (Q-Learning)</th> | |
| <th>Policy-Based</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>What it learns</td> | |
| <td>Q(s,a) values</td> | |
| <td>π(a|s) policy directly</td> | |
| </tr> | |
| <tr> | |
| <td>Action selection</td> | |
| <td>argmax Q(s,a)</td> | |
| <td>Sample from π(a|s)</td> | |
| </tr> | |
| <tr> | |
| <td>Continuous actions</td> | |
| <td>Difficult</td> | |
| <td>Natural</td> | |
| </tr> | |
| <tr> | |
| <td>Stochastic policy</td> | |
| <td>Indirect</td> | |
| <td>Direct</td> | |
| </tr> | |
| <tr> | |
| <td>Convergence</td> | |
| <td>Can be unstable</td> | |
| <td>Smoother</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
<div class="formula">
<strong>Policy Gradient Theorem:</strong>
∇θ J(θ) = Eπ[∇θ log πθ(a|s) · Qπ(s,a)]
<br><br>
Practical form (REINFORCE):<br>
∇θ J(θ) ≈ ∇θ log πθ(aₜ|sₜ) · Gₜ<br>
<br>
where:<br>
Gₜ = Total return from time t onward<br>
πθ(a|s) = Probability of action a in state s<br>
θ = Policy parameters (neural network weights)
</div>
| <h3>REINFORCE Algorithm (Monte Carlo Policy Gradient)</h3> | |
| <div class="step"> | |
| <div class="step-title">Algorithm Steps</div> | |
<div class="step-calculation">
<strong>1. Initialize:</strong> Random policy parameters θ<br>
<br>
<strong>2. For each episode:</strong><br>
a. Generate trajectory: s₀, a₀, r₁, s₁, a₁, r₂, ..., s_T<br>
b. For each time step t:<br>
- Calculate return: Gₜ = rₜ₊₁ + γrₜ₊₂ + γ²rₜ₊₃ + ...<br>
- Update: θ ← θ + α · Gₜ · ∇θ log πθ(aₜ|sₜ)<br>
<br>
<strong>3. Repeat</strong> until policy converges
</div>
| </div> | |
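<p>Here is a minimal REINFORCE sketch with a linear-softmax policy. It is illustrative only: <code>env</code> is a hypothetical object with a Gym-style reset()/step() interface returning (state, reward, done), states are feature vectors, and θ is an (actions × features) weight matrix:</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def reinforce_episode(env, theta, alpha=0.01, gamma=0.99):
    # 1. Generate a trajectory with the current policy
    states, actions, rewards = [], [], []
    s, done = env.reset(), False
    while not done:
        probs = softmax(theta @ s)                 # one weight row per action
        a = np.random.choice(len(probs), p=probs)  # sample from π(a|s)
        states.append(s); actions.append(a)
        s, r, done = env.step(a)
        rewards.append(r)
    # 2. Walk backwards: G_t = r_{t+1} + γ·G_{t+1}, then update θ
    G = 0.0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        probs = softmax(theta @ states[t])
        grad_log = -np.outer(probs, states[t])     # ∇θ log π: -π(a'|s)·s for every row...
        grad_log[actions[t]] += states[t]          # ...plus s for the action actually taken
        theta += alpha * G * grad_log              # θ ← θ + α·Gₜ·∇θ log πθ(aₜ|sₜ)
    return theta
</pre>
</div>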
| <h3>Example: CartPole Balancing</h3> | |
| <p><strong>Problem:</strong> Balance a pole on a cart by moving left or right</p> | |
| <div class="step"> | |
| <div class="step-title">Episode Example</div> | |
| <div class="step-calculation"> | |
| State: s = [cart_pos, cart_vel, pole_angle, pole_vel]<br> | |
| Actions: a ∈ {Left, Right}<br> | |
| <br> | |
| <strong>Time t=0:</strong><br> | |
| s₀ = [0.0, 0.0, 0.1, 0.0] (pole leaning right)<br> | |
| π(Left|s₀) = 0.3, π(Right|s₀) = 0.7<br> | |
| Sample action: a₀ = Right<br> | |
| Reward: r₁ = +1 (pole still balanced)<br> | |
| <br> | |
| <strong>Time t=1:</strong><br> | |
| s₁ = [0.05, 0.1, 0.08, -0.05]<br> | |
| Action: a₁ = Right<br> | |
| r₂ = +1<br> | |
| <br> | |
| ... episode continues for T=200 steps ...<br> | |
| <br> | |
<strong>Total return:</strong> G₀ = 200 (balanced the entire episode!)<br>
<br>
<strong>Update policy:</strong><br>
For each (sₜ, aₜ) in the trajectory:<br>
θ ← θ + 0.01 × Gₜ × ∇θ log πθ(aₜ|sₜ)<br>
(Gₜ = return from step t onward; with +1 per step, Gₜ = 200 - t)<br>
<br>
→ Increase the probability of the actions taken in this successful episode!
| </div> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Why It Works</div> | |
| <div class="callout-content"> | |
| <strong>Good episode (high G):</strong> Increase probability of actions taken<br> | |
| <strong>Bad episode (low G):</strong> Decrease probability of actions taken<br> | |
| <br> | |
| Over many episodes, the policy learns which actions lead to better outcomes! | |
| </div> | |
| </div> | |
| <h3>Advantages</h3> | |
| <ul> | |
| <li>✓ Works with continuous action spaces</li> | |
| <li>✓ Can learn stochastic policies</li> | |
| <li>✓ Better convergence properties</li> | |
| <li>✓ Effective in high-dimensional spaces</li> | |
| </ul> | |
| <h3>Disadvantages</h3> | |
| <ul> | |
| <li>✗ High variance in gradient estimates</li> | |
| <li>✗ Sample inefficient (needs many episodes)</li> | |
| <li>✗ Can get stuck in local optima</li> | |
| <li>✗ Sensitive to learning rate</li> | |
| </ul> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Modern Improvements</div> | |
| <div class="callout-content"> | |
| <strong>Actor-Critic:</strong> Combine policy gradient with value function to reduce | |
| variance<br> | |
| <strong>PPO (Proximal Policy Optimization):</strong> Constrain policy updates for | |
| stability<br> | |
| <strong>TRPO (Trust Region):</strong> Guarantee monotonic improvement<br> | |
| <br> | |
| These advances make policy gradients practical for complex tasks like robot control and game | |
| playing! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ======================================== | |
| NLP & GENAI SECTIONS (Module 13, 16-19) | |
| ======================================== --> | |
| <!-- Section 14: NLP Preprocessing --> | |
| <div class="section" id="nlp-preprocessing"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(232, 238, 246, 0.3); color: #e8eef6;">🗣️ NLP - | |
| Basic</span> Text Preprocessing</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Before machine learning models can process human language, the text must be cleaned and converted | |
| into a numerical format. This process is called <strong>Text Preprocessing</strong>.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Core Techniques (Module 13)</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Tokenization:</strong> Splitting text into individual words or tokens.</li> | |
| <li><strong>Stopwords Removal:</strong> Removing common words (the, is, at) that don't add | |
| much meaning.</li> | |
| <li><strong>Stemming/Lemmatization:</strong> Reducing words to their root form (e.g., | |
| "running" → "run").</li> | |
| <li><strong>POS Tagging:</strong> Identifying grammatical parts of speech (Nouns, Verbs, | |
| etc.).</li> | |
| </ul> | |
| </div> | |
| <h3>Paper & Pen Example: Tokenization & Stopwords</h3> | |
| <div class="step"> | |
| <div class="step-title">Step 1: Input Raw Text</div> | |
| <div class="step-calculation">Text: "The quick brown fox is jumping over the lazy dog."</div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Step 2: Remove Stopwords (NLTK English list)</div> | |
| <div class="step-calculation">Stopwords: "The", "is", "over", "the" | |
| Cleaned: "quick", "brown", "fox", "jumping", "lazy", "dog"</div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Step 3: Lemmatization</div> | |
| <div class="step-calculation">"jumping" → "jump" | |
| Root tokens: ["quick", "brown", "fox", "jump", "lazy", "dog"]</div> | |
| </div> | |
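<p>The same pipeline in code, using NLTK (note the one-time resource downloads):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# First run only: nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')

text = "The quick brown fox is jumping over the lazy dog."
tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]     # tokenize, drop punctuation
tokens = [t for t in tokens if t not in stopwords.words('english')]  # remove stopwords

lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(t, pos='v') for t in tokens])
# ['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']
</pre>
</div>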
| <div class="callout info"> | |
<div class="callout-title">💡 Why POS Tagging Matters</div>
| <div class="callout-content"> | |
| Part-of-Speech tagging helps models understand context. For example, "Bank" can be a | |
| <strong>Noun</strong> (river bank) or a <strong>Verb</strong> (to bank on someone). | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 15: Word Embeddings --> | |
| <div class="section" id="word-embeddings"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(232, 238, 246, 0.3); color: #e8eef6;">🗣️ NLP - | |
| Embeddings</span> Word Embeddings (Word2Vec)</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
<p>Word Embeddings are dense vector representations of words in which words with similar meanings
lie close to each other in vector space. <strong>Word2Vec</strong> is a seminal algorithm for
learning them.</p>
| <div class="formula"> | |
| <strong>Word Similarity (Cosine Similarity):</strong> | |
| cos(θ) = (A · B) / (||A|| ||B||) | |
| <br><small>Measures the angle between two word vectors. Closer to 1 = more similar.</small> | |
| </div> | |
| <h3>Working Intuition</h3> | |
| <p>Word2Vec learns context by predicting a word from its neighbors (CBOW) or predicting neighbors | |
| from a word (Skip-gram). This captures semantic relationships like:</p> | |
| <div class="callout success"> | |
| <div class="callout-title">✓ Vector Mathematics</div> | |
| <div class="callout-content"> | |
| <strong>King - Man + Woman ≈ Queen</strong> | |
| <br>The model learns that the relational "distance" between King and Man is similar to that | |
| between Queen and Woman. | |
| </div> | |
| </div> | |
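<p>A minimal gensim sketch of training Word2Vec and querying similarities. The toy corpus below is far too small for the analogy to actually emerge; real corpora have millions of sentences:</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
from gensim.models import Word2Vec

sentences = [["king", "rules", "the", "kingdom"],
             ["queen", "rules", "the", "kingdom"],
             ["man", "walks"], ["woman", "walks"]]

model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)  # sg=1 → Skip-gram

print(model.wv.similarity("king", "queen"))  # cosine similarity of the two word vectors
print(model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1))
</pre>
</div>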
| <div class="info-card"> | |
| <div class="info-card-title">Implementation Resources</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Google Colab:</strong> <a | |
| href="https://colab.research.google.com/drive/1cIg1tqNvU0Cl9E7-grrF7l8FQj-_efxF" | |
| target="_blank" style="color: #7ef0d4;">NLP for ML Practice Notebook</a></li> | |
| <li><strong>GitHub Repo:</strong> <a href="https://github.com/sourangshupal/NLP-for-ML" | |
| target="_blank" style="color: #7ef0d4;">sourangshupal/NLP-for-ML</a></li> | |
| <li><strong>Kaggle Dataset:</strong> <a | |
| href="https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews" | |
| target="_blank" style="color: #7ef0d4;">IMDB 50K Movie Reviews</a></li> | |
| <li><strong>Research Paper:</strong> <a href="https://arxiv.org/abs/1301.3781" | |
| target="_blank" style="color: #7ef0d4;">Word2Vec (Mikolov et al.)</a></li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 16: Advanced NLP --> | |
| <div class="section" id="rnn-lstm"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">🧠 | |
| Strategic</span> RNN & LSTM</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Recurrent Neural Networks (RNNs) are designed for sequential data (like text). They have "memory" | |
| that allows information to persist.</p> | |
| <div class="callout warning"> | |
| <div class="callout-title">⚠️ Vanishing Gradient Problem</div> | |
| <div class="callout-content"> | |
| Standard RNNs struggle to remember long-term dependencies. <strong>LSTM (Long Short-Term | |
| Memory)</strong> units were designed to fix this using "gates" that control information | |
| flow. | |
| </div> | |
| </div> | |
| <h3>LSTM Architecture</h3> | |
| <p>An LSTM neuron has three main gates:</p> | |
| <ul> | |
| <li><strong>Forget Gate:</strong> Decides which info to discard.</li> | |
| <li><strong>Input Gate:</strong> Decides which new info to store.</li> | |
| <li><strong>Output Gate:</strong> Decides which part of the memory to output.</li> | |
| </ul> | |
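<p>For reference, a minimal Keras sketch of an LSTM text classifier; the vocabulary size, dimensions, and sentiment task are illustrative assumptions:</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=64),   # token ids → dense vectors
    tf.keras.layers.LSTM(64),                # gates decide what to forget, store, and output
    tf.keras.layers.Dense(1, activation='sigmoid'),  # e.g., positive/negative sentiment
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
</pre>
</div>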
| </div> | |
| </div> | |
| <!-- Section 17: Transformers --> | |
| <div class="section" id="transformers"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">🧠 | |
| Strategic</span> Transformers</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>The Transformer architecture (from "Attention is All You Need") revolutionized NLP by removing | |
| recurrence and using <strong>Self-Attention</strong> mechanisms.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Advantages</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Parallelization:</strong> Unlike RNNs, Transformers can process entire sentences | |
| at once.</li> | |
| <li><strong>Global Context:</strong> Self-attention allows the model to look at every word | |
| in a sentence simultaneously.</li> | |
| <li><strong>Foundation for LLMs:</strong> This architecture powers BERT, GPT, and Claude. | |
| </li> | |
| </ul> | |
| </div> | |
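<p>The core of self-attention is only a few lines. A minimal NumPy sketch of single-head scaled dot-product attention (toy sizes, random weights, no masking or multi-head logic):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

def self_attention(X, Wq, Wk, Wv):
    Q, K, V = X @ Wq, X @ Wk, X @ Wv
    scores = Q @ K.T / np.sqrt(K.shape[-1])         # similarity of every token pair
    w = np.exp(scores - scores.max(axis=-1, keepdims=True))
    w /= w.sum(axis=-1, keepdims=True)              # row-wise softmax
    return w @ V                                    # each token: weighted mix of all values

rng = np.random.default_rng(0)
X = rng.normal(size=(4, 8))                         # 4 tokens, model dimension 8
out = self_attention(X, *(rng.normal(size=(8, 8)) for _ in range(3)))
print(out.shape)                                    # (4, 8): every token now carries full context
</pre>
</div>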
| </div> | |
| </div> | |
| <!-- Section 18: Generative AI & LLMs --> | |
| <div class="section" id="genai-intro"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🪄 | |
| GenAI</span> Generative AI & LLMs</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Generative AI refers to models that can create new content (text, images, code). Large Language | |
| Models (LLMs) are the pinnacle of this for text.</p> | |
| <h3>Training Paradigm</h3> | |
| <ol> | |
| <li><strong>Pre-training:</strong> Predict next word on massive datasets (Internet scale).</li> | |
| <li><strong>Fine-tuning:</strong> Adapting the model to specific tasks (e.g., chat, coding). | |
| </li> | |
| <li><strong>RLHF:</strong> Reinforcement Learning from Human Feedback to align model behavior. | |
| </li> | |
| </ol> | |
| </div> | |
| </div> | |
| <!-- Section 19: VectorDB & RAG --> | |
| <div class="section" id="vectordb-rag"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(255, 140, 106, 0.3); color: #ff8c6a;">🪄 | |
| GenAI</span> VectorDB & RAG</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>To ground LLMs in private or fresh data, we use <strong>Retrieval-Augmented Generation | |
| (RAG)</strong>.</p> | |
| <div class="step"> | |
| <div class="step-title">The RAG Workflow</div> | |
| <div class="step-calculation">1. <strong>Chunk:</strong> Break documents into small pieces. | |
| 2. <strong>Embed:</strong> Convert chunks into vectors. | |
| 3. <strong>Store:</strong> Save vectors in a <strong>Vector Database</strong> (Pinecone, | |
| Milvus, Chroma). | |
| 4. <strong>Retrieve:</strong> Find relevant chunks for a user query. | |
| 5. <strong>Generate:</strong> Pass chunks to LLM as context for the answer.</div> | |
| </div> | |
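<p>A minimal sketch of the retrieve step, assuming a hypothetical <code>embed()</code> function (any sentence-embedding model) that maps text to a NumPy vector. A real vector database precomputes and indexes the chunk embeddings instead of embedding on every query:</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

def retrieve(query, chunks, embed, k=3):
    q = embed(query)
    ranked = sorted(chunks, key=lambda c: cosine(q, embed(c)), reverse=True)
    return ranked[:k]   # top-k chunks to paste into the LLM prompt as context

# prompt = f"Answer using only this context:\n{retrieve(question, docs, embed)}\n\nQ: {question}"
</pre>
</div>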
| </div> | |
| </div> | |
| <!-- Section 20: Algorithm Comparison Tool --> | |
| <div class="section" id="algorithm-comparison"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔄 | |
| Comparison</span> Algorithm Comparison Tool</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Compare machine learning algorithms side-by-side to choose the best one for your problem!</p> | |
| <!-- Step 1: Select Category --> | |
| <div class="info-card" style="background: var(--color-bg-1);"> | |
| <h3 style="margin-bottom: 16px; color: var(--color-text);">Step 1: Select Learning Category</h3> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="category" value="all" checked> Show All</label> | |
| <label><input type="radio" name="category" value="supervised"> Supervised Learning</label> | |
| <label><input type="radio" name="category" value="unsupervised"> Unsupervised | |
| Learning</label> | |
| </div> | |
| </div> | |
| <!-- Step 2: Select Algorithms --> | |
| <div class="info-card" style="background: var(--color-bg-2); margin-top: 24px;"> | |
| <h3 style="margin-bottom: 16px; color: var(--color-text);">Step 2: Select Algorithms to Compare | |
| (2-5)</h3> | |
| <div id="algorithm-checkboxes" | |
| style="display: grid; grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); gap: 12px;"> | |
| <!-- Populated by JavaScript --> | |
| </div> | |
| <p id="selection-count" | |
| style="margin-top: 12px; color: var(--color-text-secondary); font-size: 14px;">Selected: 0 | |
| algorithms</p> | |
| </div> | |
| <!-- Step 3: Compare Button --> | |
| <div style="text-align: center; margin: 32px 0;"> | |
| <button class="btn btn--primary" id="compare-btn" style="padding: 14px 48px; font-size: 16px;" | |
| disabled>Compare Algorithms</button> | |
| </div> | |
| <!-- Comparison Results Container --> | |
| <div id="comparison-results" style="display: none;"> | |
| <!-- View Selector --> | |
| <div | |
| style="display: flex; gap: 12px; margin-bottom: 24px; flex-wrap: wrap; justify-content: center;"> | |
| <button class="btn btn--secondary view-btn active" data-view="table">📊 Table View</button> | |
| <button class="btn btn--secondary view-btn" data-view="radar">🎯 Radar Chart</button> | |
| <button class="btn btn--secondary view-btn" data-view="heatmap">🔥 Heatmap</button> | |
| <button class="btn btn--secondary view-btn" data-view="decision">🌳 Decision Tree</button> | |
| <button class="btn btn--secondary view-btn" data-view="matrix">📋 Use Case Matrix</button> | |
| </div> | |
| <!-- View: Table --> | |
| <div class="comparison-view" id="view-table"> | |
| <h3 style="margin-bottom: 20px; text-align: center;">Side-by-Side Comparison</h3> | |
| <div style="overflow-x: auto;"> | |
| <table class="data-table" id="comparison-table"> | |
| <!-- Populated by JavaScript --> | |
| </table> | |
| </div> | |
| </div> | |
| <!-- View: Radar Chart --> | |
| <div class="comparison-view" id="view-radar" style="display: none;"> | |
| <h3 style="margin-bottom: 20px; text-align: center;">Visual Performance Comparison</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 500px;"> | |
| <canvas id="radar-comparison-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- View: Heatmap --> | |
| <div class="comparison-view" id="view-heatmap" style="display: none;"> | |
| <!-- HTML table-based heatmap - works in ALL browsers (Chrome, Firefox, Safari, Edge, Mobile) --> | |
| </div> | |
| <!-- View: Decision Tree --> | |
| <div class="comparison-view" id="view-decision" style="display: none;"> | |
| <h3 style="margin-bottom: 20px; text-align: center;">When to Use Which Algorithm?</h3> | |
| <div | |
| style="background: var(--color-surface); padding: 32px; border-radius: 12px; border: 1px solid var(--color-border);"> | |
| <pre | |
| style="font-family: monospace; font-size: 13px; line-height: 1.8; color: var(--color-text); background: transparent; border: none; padding: 0; margin: 0; white-space: pre-wrap;">What's your use case? | |
| ├─ I have <strong style="color: #7ef0d4;">LABELED</strong> data | |
| │ ├─ Predict <strong style="color: #6aa9ff;">NUMBERS</strong> (Regression) | |
| │ │ ├─ Linear relationship? → <strong style="color: #7ef0d4;">Linear Regression</strong> | |
| │ │ └─ Complex patterns? → <strong style="color: #7ef0d4;">Random Forest / XGBoost</strong> | |
| │ │ | |
| │ ├─ Predict <strong style="color: #6aa9ff;">CATEGORIES</strong> (Classification) | |
| │ │ ├─ Want interpretability? → <strong style="color: #7ef0d4;">Decision Trees / Naive Bayes</strong> | |
| │ │ ├─ Want best accuracy? → <strong style="color: #7ef0d4;">SVM / Random Forest</strong> | |
| │ │ ├─ Want speed? → <strong style="color: #7ef0d4;">Logistic Regression / Naive Bayes</strong> | |
| │ │ ├─ Have few samples? → <strong style="color: #7ef0d4;">Naive Bayes</strong> | |
| │ │ └─ Local patterns? → <strong style="color: #7ef0d4;">KNN</strong> | |
| │ | |
| ├─ I have <strong style="color: #ff8c6a;">UNLABELED</strong> data | |
| │ ├─ Want to group similar items? → <strong style="color: #7ef0d4;">K-means</strong> | |
| │ ├─ Want to reduce dimensions? → <strong style="color: #7ef0d4;">PCA</strong> | |
| │ └─ Unknown number of groups? → <strong style="color: #7ef0d4;">DBSCAN</strong> | |
| │ | |
| └─ I want to <strong style="color: #ffb490;">LEARN from experience</strong> | |
| └─ Use <strong style="color: #7ef0d4;">Reinforcement Learning</strong></pre> | |
| </div> | |
| </div> | |
| <!-- View: Use Case Matrix --> | |
| <div class="comparison-view" id="view-matrix" style="display: none;"> | |
| <h3 style="margin-bottom: 20px; text-align: center;">Use Case Suitability Matrix</h3> | |
| <div style="overflow-x: auto;"> | |
| <table class="data-table" id="matrix-table"> | |
| <!-- Populated by JavaScript --> | |
| </table> | |
| </div> | |
| </div> | |
| <!-- Detailed Comparison Cards --> | |
| <div id="detailed-cards" style="margin-top: 48px;"> | |
| <!-- Populated by JavaScript --> | |
| </div> | |
| </div> | |
| <!-- Algorithm Quiz --> | |
| <div class="info-card" style="background: var(--color-bg-5); margin-top: 48px;"> | |
| <h3 style="margin-bottom: 20px; color: var(--color-text);">🎯 Not Sure Which Algorithm? Take the | |
| Quiz!</h3> | |
| <div id="quiz-container"> | |
| <div class="quiz-question" id="quiz-q1"> | |
| <p style="font-weight: 600; margin-bottom: 12px;">Question 1: Do you have labeled data? | |
| </p> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="q1" value="yes"> Yes</label> | |
| <label><input type="radio" name="q1" value="no"> No</label> | |
| </div> | |
| </div> | |
| <div class="quiz-question" id="quiz-q2" style="display: none; margin-top: 20px;"> | |
| <p style="font-weight: 600; margin-bottom: 12px;">Question 2: What do you want to | |
| predict?</p> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="q2" value="numbers"> Numbers (Regression)</label> | |
| <label><input type="radio" name="q2" value="categories"> Categories | |
| (Classification)</label> | |
| <label><input type="radio" name="q2" value="groups"> Groups (Clustering)</label> | |
| </div> | |
| </div> | |
| <div class="quiz-question" id="quiz-q3" style="display: none; margin-top: 20px;"> | |
| <p style="font-weight: 600; margin-bottom: 12px;">Question 3: How much training data do | |
| you have?</p> | |
| <div class="radio-group"> | |
<label><input type="radio" name="q3" value="little"> Very Little (&lt;100
samples)</label>
<label><input type="radio" name="q3" value="some"> Some (100-10k samples)</label>
<label><input type="radio" name="q3" value="lots"> Lots (&gt;10k samples)</label>
| </div> | |
| </div> | |
| <div class="quiz-question" id="quiz-q4" style="display: none; margin-top: 20px;"> | |
| <p style="font-weight: 600; margin-bottom: 12px;">Question 4: Is interpretability | |
| important?</p> | |
| <div class="radio-group"> | |
| <label><input type="radio" name="q4" value="very"> Very Important</label> | |
| <label><input type="radio" name="q4" value="somewhat"> Somewhat Important</label> | |
| <label><input type="radio" name="q4" value="not"> Not Important</label> | |
| </div> | |
| </div> | |
| <div id="quiz-result" | |
| style="display: none; margin-top: 24px; padding: 20px; background: var(--color-bg-3); border-radius: 8px; border-left: 4px solid var(--color-success);"> | |
| <!-- Result populated by JavaScript --> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 19a-NEW: Gradient Boosting Classification --> | |
| <div class="section" id="gradient-boosting-classification"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Classification</span> Gradient Boosting Classification</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Gradient Boosting for classification predicts probabilities using sequential trees that minimize | |
| log loss. Each tree corrects the previous model's errors by fitting to gradients!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Simple Math Breakdown</div> | |
| <ul class="info-card-list"> | |
| <li>Step 1: Start with log-odds F(0) = log(pos/neg)</li> | |
| <li>Step 2: Calculate gradient g = p - y</li> | |
| <li>Step 3: Build tree on gradients</li> | |
| <li>Step 4: Update F(x) = F(0) + lr × tree</li> | |
| <li>Step 5: Repeat to minimize errors</li> | |
| </ul> | |
| </div> | |
| <div class="formula"> | |
| <strong>Simple Explanation:</strong><br> | |
| Step 1: F(0) = log(positive_count / negative_count)<br> | |
| Step 2: g = p - y (how wrong we are)<br> | |
| Step 3: Build tree to fix errors<br> | |
| Step 4: F(x) = F(0) + learning_rate × tree(x)<br> | |
| Step 5: Repeat Steps 2-4 multiple times | |
| </div> | |
| <h3>Real Example: House Price ≥ 170k</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>ID</th> | |
| <th>Size</th> | |
| <th>Price</th> | |
| <th>≥170k?</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>1</td> | |
| <td>800</td> | |
| <td>120k</td> | |
| <td>0 (No)</td> | |
| </tr> | |
| <tr> | |
| <td>2</td> | |
| <td>900</td> | |
| <td>130k</td> | |
| <td>0 (No)</td> | |
| </tr> | |
| <tr> | |
| <td>3</td> | |
| <td>1000</td> | |
| <td>150k</td> | |
| <td>0 (No)</td> | |
| </tr> | |
| <tr> | |
| <td>4</td> | |
| <td>1100</td> | |
| <td>170k</td> | |
| <td>1 (Yes)</td> | |
| </tr> | |
| <tr> | |
| <td>5</td> | |
| <td>1200</td> | |
| <td>200k</td> | |
| <td>1 (Yes)</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Initialize F(0)</div> | |
<div class="step-calculation">
F(0) = log(positive / negative)
= log(2 / 3)
= <strong style="color: #7ef0d4;">-0.405</strong>
Meaning: sigmoid(-0.405) = 0.40 → 40% initial chance of ≥170k
</div>
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Calculate Gradients</div> | |
| <div class="step-calculation"> | |
| For House 1: | |
| p = sigmoid(-0.405) = <strong>0.4</strong> (40% probability) | |
| y = 0 (actual) | |
| gradient g = 0.4 - 0 = <strong style="color: #ff8c6a;">0.4</strong> | |
| For House 4: | |
| p = sigmoid(-0.405) = 0.4 | |
| y = 1 (actual) | |
| gradient g = 0.4 - 1 = <strong style="color: #7ef0d4;">-0.6</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Find Best Split</div> | |
| <div class="step-calculation"> | |
Test split: Size ≤ 1050
| <strong>Left (Size ≤ 1050):</strong> Houses 1,2,3 | |
| Gradients: [0.4, 0.4, 0.4] | |
| Average = <strong>0.4</strong> | |
| <strong>Right (Size > 1050):</strong> Houses 4,5 | |
| Gradients: [-0.6, -0.6] | |
| Average = <strong>-0.6</strong> | |
| ✓ This split separates positive/negative gradients! | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Update Predictions</div> | |
| <div class="step-calculation"> | |
| F1(x) = F(0) + learning_rate × tree(x) | |
| For House 1 (Size=800): | |
| F1(1) = -0.405 + 0.1 × (-0.4) | |
| = -0.405 - 0.04 | |
| = <strong style="color: #7ef0d4;">-0.445</strong> | |
| New probability = sigmoid(-0.445) = <strong>0.39</strong> ✓ Lower! | |
| </div> | |
| </div> | |
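<p>The numbers in Steps 1-4 can be reproduced with a few lines of NumPy (one boosting round, lr = 0.1):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

y = np.array([0, 0, 0, 1, 1])                  # ≥170k labels
size = np.array([800, 900, 1000, 1100, 1200])

F = np.full(5, np.log(2 / 3))                  # F(0) = log(pos/neg) ≈ -0.405
p = 1 / (1 + np.exp(-F))                       # sigmoid → all 0.4
g = p - y                                      # gradients: [0.4 0.4 0.4 -0.6 -0.6]

left = size &lt;= 1050                            # the split found in STEP 3
tree = np.where(left, -g[left].mean(), -g[~left].mean())  # leaves predict -avg gradient
F1 = F + 0.1 * tree
print(round(F1[0], 3), round(1 / (1 + np.exp(-F1[0])), 2))  # -0.445, 0.39
</pre>
</div>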
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-class-sequential-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Sequential prediction updates across | |
| iterations</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-class-gradients-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Gradient values per sample showing error | |
| correction</p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaway</div> | |
| <div class="callout-content"> | |
| Gradient Boosting Classification uses gradients (p - y) to sequentially build trees that | |
| correct probability predictions. Each tree reduces log loss by fitting to the errors! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 19a: Gradient Boosting (NEW FROM PDF) --> | |
| <div class="section" id="gradient-boosting"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Ensemble</span> Gradient Boosting</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Gradient Boosting is a powerful ensemble technique that builds models sequentially, where each | |
| new model corrects the errors (residuals) of the previous models. Unlike AdaBoost which adjusts | |
| sample weights, Gradient Boosting directly fits new models to the residual errors!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Sequential learning: Each tree fixes errors of previous</li> | |
| <li>Weak learners: Simple stumps (depth=1)</li> | |
| <li>Learning rate: Controls step size (0.1 = small steps)</li> | |
| <li>Residuals: What model got wrong</li> | |
| <li>SSE: Sum of Squared Errors (lower = better split)</li> | |
| </ul> | |
| </div> | |
<h3>🎯 Complete Mathematical Solution</h3>
| <h4>Dataset: House Price Prediction</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>ID</th> | |
| <th>Size (sq ft)</th> | |
| <th>Bedrooms</th> | |
| <th>Price (₹ Lakhs)</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>1</td> | |
| <td>800</td> | |
| <td>2</td> | |
| <td>120</td> | |
| </tr> | |
| <tr> | |
| <td>2</td> | |
| <td>900</td> | |
| <td>2</td> | |
| <td>130</td> | |
| </tr> | |
| <tr> | |
| <td>3</td> | |
| <td>1000</td> | |
| <td>3</td> | |
| <td>150</td> | |
| </tr> | |
| <tr> | |
| <td>4</td> | |
| <td>1100</td> | |
| <td>3</td> | |
| <td>170</td> | |
| </tr> | |
| <tr> | |
| <td>5</td> | |
| <td>1200</td> | |
| <td>4</td> | |
| <td>200</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Learning Rate:</strong> lr = 0.1</p> | |
| <div class="step"> | |
| <div class="step-title">STEP 0: Initialize Model F(0)</div> | |
| <div class="step-calculation"> | |
| Formula: F(0) = mean(y) | |
| Calculation: | |
| F(0) = (120 + 130 + 150 + 170 + 200) / 5 | |
| = 770 / 5 | |
| = <strong style="color: #7ef0d4;">154</strong> | |
| <strong style="color: #7ef0d4;">✓ Result: F(0) = 154</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Compute Residuals</div> | |
| <div class="step-calculation"> | |
| Formula: r_i = y_i - F(0) | |
| <strong style="color: #6aa9ff;">Residual Calculation:</strong> | |
| ID | Size | Beds | Price(y) | Prediction F(0) | Residual r_i | |
| ---|------|------|----------|-----------------|------------- | |
| 1 | 800 | 2 | 120 | 154 | -34 | |
| 2 | 900 | 2 | 130 | 154 | -24 | |
| 3 | 1000 | 3 | 150 | 154 | -4 | |
| 4 | 1100 | 3 | 170 | 154 | +16 | |
| 5 | 1200 | 4 | 200 | 154 | +46 | |
| <strong style="color: #7ef0d4;">✓ Residuals: [-34, -24, -4, +16, +46]</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Find Best Split (Build Weak Learner h1)</div> | |
| <div class="step-calculation"> | |
| Test all candidate splits for Size feature: | |
| Midpoints: 850, 950, 1050, 1150 | |
<strong style="color: #6aa9ff;">Test: Size ≤ 1050</strong>
| ├─ Left (Size ≤ 1050): IDs 1,2,3 | |
| │ Residuals: [-34, -24, -4] | |
| │ Mean: (-34 + -24 + -4) / 3 = -62 / 3 = <strong>-20.66</strong> | |
| │ SSE: (-34-(-20.66))² + (-24-(-20.66))² + (-4-(-20.66))² | |
| │ = 177.78 + 11.11 + 277.78 = <strong>466.67</strong> | |
| │ | |
| └─ Right (Size > 1050): IDs 4,5 | |
| Residuals: [+16, +46] | |
| Mean: (16 + 46) / 2 = 62 / 2 = <strong>31.0</strong> | |
| SSE: (16-31)² + (46-31)² = 225 + 225 = <strong>450</strong> | |
| <strong style="color: #ff8c6a;">Total SSE = 466.67 + 450 = 916.67</strong> | |
| <strong style="color: #6aa9ff;">Test All Splits:</strong> | |
| Feature | Threshold | SSE | |
| ---------|-----------|-------- | |
| Size | 850 | 2675 | |
| Size | 950 | 1316.66 | |
| Size | 1050 | 916.67 ← BEST SPLIT | |
| Size | 1150 | 1475.0 | |
| Bedrooms | 2.5 | 1316.66 | |
| Bedrooms | 3.5 | 1475.0 | |
<strong style="color: #7ef0d4; font-size: 18px;">✓ BEST SPLIT: Size ≤ 1050 with SSE =
916.67</strong>
| Weak Learner h1(x): | |
| ├─ If Size ≤ 1050: h1(x) = -20.66 | |
| └─ If Size > 1050: h1(x) = 31.0 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Update Predictions</div> | |
| <div class="step-calculation"> | |
| Formula: F1(x) = F(0) + lr × h1(x) | |
| where lr = 0.1 | |
| <strong style="color: #6aa9ff;">For ID 1 (Size=800):</strong> | |
| F1(1) = 154 + 0.1 × (-20.66) | |
| = 154 - 2.066 | |
| = <strong style="color: #7ef0d4;">151.93</strong> ✓ | |
| <strong style="color: #6aa9ff;">For ID 4 (Size=1100):</strong> | |
| F1(4) = 154 + 0.1 × 31.0 | |
| = 154 + 3.10 | |
| = <strong style="color: #7ef0d4;">157.10</strong> ✓ | |
| <strong style="color: #6aa9ff;">Complete Table:</strong> | |
| ID | Size | Price(y) | F(0) | h1(x) | F1(x) | New Residual | |
| ---|------|----------|------|--------|--------|------------- | |
| 1 | 800 | 120 | 154 | -20.66 | 151.93 | -31.93 | |
| 2 | 900 | 130 | 154 | -20.66 | 151.93 | -21.93 | |
| 3 | 1000 | 150 | 154 | -20.66 | 151.93 | -1.93 | |
| 4 | 1100 | 170 | 154 | +31.0 | 157.10 | +12.90 | |
| 5 | 1200 | 200 | 154 | +31.0 | 157.10 | +42.90 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Repeat for h2, h3, ... h10</div> | |
| <div class="step-calculation"> | |
| Continue building weak learners on residuals: | |
| F(x) = F(0) + lr×h1(x) + lr×h2(x) + lr×h3(x) + ... + lr×h10(x) | |
| Each iteration: | |
| 1. Compute residuals | |
| 2. Find best split | |
| 3. Build weak learner | |
| 4. Update predictions | |
| After 10 iterations: | |
| <strong style="color: #7ef0d4; font-size: 18px;">Final Model: F(x) = 154 + 0.1×h1(x) + | |
| 0.1×h2(x) + ... + 0.1×h10(x)</strong> | |
| </div> | |
| </div> | |
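<p>The full loop is short enough to write out. A minimal NumPy sketch that reproduces the table above (depth-1 stumps, lr = 0.1, 10 rounds; it searches only Size splits for brevity, since Size always wins on this data):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
import numpy as np

size = np.array([800, 900, 1000, 1100, 1200])
y = np.array([120, 130, 150, 170, 200], dtype=float)

F = np.full(5, y.mean())                       # F(0) = 154
for m in range(10):                            # build h1 ... h10
    r = y - F                                  # residuals
    best_t, best_sse = None, np.inf
    for t in [850, 950, 1050, 1150]:           # candidate midpoints for Size
        left = size &lt;= t
        sse = ((r[left] - r[left].mean()) ** 2).sum() \
            + ((r[~left] - r[~left].mean()) ** 2).sum()
        if sse &lt; best_sse:
            best_t, best_sse = t, sse          # round 1 picks t=1050, SSE=916.67
    left = size &lt;= best_t
    h = np.where(left, r[left].mean(), r[~left].mean())
    F = F + 0.1 * h                            # F_{m+1}(x) = F_m(x) + lr·h(x)

print(F.round(2))                              # predictions creep toward the true prices
</pre>
</div>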
| <h3>📊 Visualizations</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-sequential-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Sequential tree building - residuals | |
| decreasing over iterations</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-residuals-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Residual reduction across iterations</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-learning-rate-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 3:</strong> Learning rate effect - comparing lr=0.01, | |
| 0.1, 1.0</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-stumps-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 4:</strong> Weak learner stumps with decision | |
| boundaries</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="gb-predictions-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 5:</strong> Prediction vs actual - showing improvement | |
| </p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Takeaways</div> | |
| <div class="callout-content"> | |
| <strong>Why Gradient Boosting Works:</strong><br> | |
| • Each tree learns from previous mistakes (residuals)<br> | |
| • Learning rate prevents overfitting<br> | |
| • Simple weak learners combine into strong predictor<br> | |
| • SSE-based splits find best variance reduction<br> | |
| <br> | |
| <strong>Advantages:</strong><br> | |
| ✓ Very high accuracy<br> | |
| ✓ Handles non-linear relationships<br> | |
| ✓ Feature importance built-in<br> | |
| <br> | |
| <strong>Disadvantages:</strong><br> | |
| ✗ Sequential (can't parallelize)<br> | |
| ✗ Sensitive to overfitting<br> | |
| ✗ Requires careful tuning | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 19b-NEW: XGBoost Classification --> | |
| <div class="section" id="xgboost-classification"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Classification</span> XGBoost Classification</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>XGBoost Classification adds Hessian (2nd derivative) and regularization to Gradient Boosting for | |
| better accuracy and less overfitting!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Difference from Gradient Boosting</div> | |
| <ul class="info-card-list"> | |
| <li>GB: Uses gradient g = p - y</li> | |
| <li>XGB: Uses gradient g AND Hessian h = p(1-p)</li> | |
| <li>XGB: Adds regularization λ to prevent overfitting</li> | |
| <li>XGB: Better gain calculation for splits</li> | |
| </ul> | |
| </div> | |
| <div class="formula"> | |
| <strong>Hessian Formula:</strong><br> | |
| h = p × (1 - p)<br> | |
| <br> | |
| Measures confidence of prediction:<br> | |
| • p = 0.5 → h = 0.25 (most uncertain)<br> | |
| • p = 0.9 → h = 0.09 (very confident)<br> | |
| <br> | |
| <strong>Gain Formula:</strong><br> | |
| Gain = GL²/(HL+λ) + GR²/(HR+λ) - Gp²/(Hp+λ) | |
| </div> | |
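<p>In practice you rarely implement this by hand. Here is a hedged sketch of a typical setup with the xgboost library (parameter values are illustrative, and X_train / y_train / X_test are assumed to exist, as in the earlier code examples):</p>
<div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;">
<pre style="color: #e8eef6; margin: 0;">
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=100,     # boosting rounds
    learning_rate=0.1,    # shrinkage, as in gradient boosting
    max_depth=3,          # depth of each tree
    reg_lambda=1.0,       # the λ in the gain formula above
)
xgb.fit(X_train, y_train)
print(xgb.predict_proba(X_test)[:5])   # class probabilities from the boosted trees
</pre>
</div>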
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="xgb-class-hessian-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Hessian values showing prediction confidence | |
| </p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Why XGBoost is Better</div> | |
| <div class="callout-content"> | |
| Hessian gives curvature information → better optimization path<br> | |
| Regularization λ prevents overfitting → better generalization<br> | |
| Result: State-of-the-art accuracy on classification tasks! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section 19b: XGBoost (NEW FROM PDF) --> | |
| <div class="section" id="xgboost"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Ensemble</span> XGBoost (Extreme Gradient Boosting)</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
<p>XGBoost is an optimized implementation of gradient boosting that uses second-order derivatives
(the Hessian) and built-in regularization for superior performance. It has long been a favorite
of winning Kaggle entries!</p>
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Uses 2nd order derivatives (Hessian) for better approximation</li> | |
| <li>Built-in regularization (λ) to prevent overfitting</li> | |
| <li>Better gain calculation for splits</li> | |
| <li>Handles parallelism and missing values</li> | |
| <li>Much faster than standard gradient boosting</li> | |
| </ul> | |
| </div> | |
| <h3>🎯 XGBoost vs Gradient Boosting</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Aspect</th> | |
| <th>Gradient Boosting</th> | |
| <th>XGBoost</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Derivatives</td> | |
| <td>1st order (gradient)</td> | |
| <td>1st + 2nd order (Hessian)</td> | |
| </tr> | |
| <tr> | |
| <td>Regularization</td> | |
| <td>None built-in</td> | |
| <td>L1 & L2 built-in (λ)</td> | |
| </tr> | |
| <tr> | |
| <td>Split Criterion</td> | |
| <td>MSE/MAE</td> | |
| <td>Gain with regularization</td> | |
| </tr> | |
| <tr> | |
| <td>Parallelism</td> | |
| <td>No</td> | |
| <td>Yes (tree building)</td> | |
| </tr> | |
| <tr> | |
| <td>Missing Values</td> | |
| <td>Must handle separately</td> | |
| <td>Built-in handling</td> | |
| </tr> | |
| <tr> | |
| <td>Speed</td> | |
| <td>Slower</td> | |
| <td>Much faster</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h3>🎯 Complete Mathematical Solution</h3> | |
| <p><strong>Using same dataset as Gradient Boosting:</strong></p> | |
| <div class="formula"> | |
| <strong>XGBoost Gain Formula:</strong> | |
| Gain = [GL² / (HL + λ)] + [GR² / (HR + λ)] - [G_parent² / (H_parent + λ)] | |
| <br><br> | |
| Where:<br> | |
| G = sum of gradients (1st derivatives) = Σ(ŷ - y) for MSE<br> | |
| H = sum of Hessians (2nd derivatives) = Σ(1) = n for MSE<br> | |
| λ = Regularization parameter (default = 1) | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 0: Initialize</div> | |
| <div class="step-calculation"> | |
| F(0) = mean(y) = <strong style="color: #7ef0d4;">154</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Compute Gradients and Hessians</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">For Regression (MSE loss):</strong> | |
| gradient g = (ŷ - y) | |
| hessian h = 1 (constant for MSE) | |
| ID | Size | Price(y) | F(0) | g=(ŷ-y) | h | |
| ---|------|----------|------|---------|--- | |
| 1 | 800 | 120 | 154 | +34 | 1 | |
| 2 | 900 | 130 | 154 | +24 | 1 | |
| 3 | 1000 | 150 | 154 | +4 | 1 | |
| 4 | 1100 | 170 | 154 | -16 | 1 | |
| 5 | 1200 | 200 | 154 | -46 | 1 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Test Split - Size < 950</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Split: Size < 950</strong> | |
| ├─ Left: IDs 1,2 (800, 900) | |
| │ GL = 34 + 24 = <strong>+58</strong> | |
| │ HL = 1 + 1 = <strong>2</strong> | |
| │ | |
| └─ Right: IDs 3,4,5 (1000, 1100, 1200) | |
| GR = 4 + (-16) + (-46) = <strong>-58</strong> | |
| HR = 1 + 1 + 1 = <strong>3</strong> | |
| Parent: | |
| G_parent = 34 + 24 + 4 + (-16) + (-46) = <strong>0</strong> | |
| H_parent = <strong>5</strong> | |
| <strong style="color: #6aa9ff;">Calculate Similarity Scores (λ = 1):</strong> | |
| Score = G² / (H + λ) | |
| Score(Left) = (58)² / (2 + 1) | |
| = 3364 / 3 | |
| = <strong>1121.33</strong> | |
| Score(Right) = (-58)² / (3 + 1) | |
| = 3364 / 4 | |
| = <strong>841</strong> | |
| Score(Parent) = (0)² / (5 + 1) | |
| = <strong>0</strong> | |
| Gain = Score(Left) + Score(Right) - Score(Parent) | |
| = 1121.33 + 841 - 0 | |
| = <strong style="color: #7ef0d4; font-size: 18px;">1962.33</strong> | |
| ✓ HIGHEST GAIN | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Compute Leaf Weights</div> | |
| <div class="step-calculation"> | |
| Formula: w = -G / (H + λ) | |
| <strong style="color: #6aa9ff;">Left Leaf:</strong> | |
| w_left = -(+58) / (2 + 1) | |
| = -58 / 3 | |
| = <strong style="color: #7ef0d4;">-19.33</strong> | |
| <strong style="color: #6aa9ff;">Right Leaf:</strong> | |
| w_right = -(-58) / (3 + 1) | |
| = 58 / 4 | |
| = <strong style="color: #7ef0d4;">+14.5</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Update Predictions</div> | |
| <div class="step-calculation"> | |
| Formula: F1(x) = F(0) + lr × w (here lr = 1) | |
| <strong style="color: #6aa9ff;">Complete Table:</strong> | |
| ID | Size | Price(y) | F(0) | Leaf Weight | F1(x) | New Residual | |
| ---|------|----------|------|-------------|-----------|------------- | |
| 1 | 800 | 120 | 154 | -19.33 | 134.67 | -14.67 | |
| 2 | 900 | 130 | 154 | -19.33 | 134.67 | -4.67 | |
| 3 | 1000 | 150 | 154 | +14.5 | 168.50 | -18.50 | |
| 4 | 1100 | 170 | 154 | +14.5 | 168.50 | +1.50 | |
| 5 | 1200 | 200 | 154 | +14.5 | 168.50 | +31.50 | |
| </div> | |
| </div> | |
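| <p>The whole split evaluation above fits in a few lines of NumPy. This sketch simply replays Steps 0-4 for the same five houses (results match the hand calculation up to rounding):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
|  | |
| y = np.array([120, 130, 150, 170, 200], dtype=float)  # prices | |
| F0 = y.mean()                     # STEP 0: 154.0 | |
| g = F0 - y                        # gradients (ŷ - y): [34, 24, 4, -16, -46] | |
| h = np.ones_like(y)               # Hessian = 1 per sample for MSE | |
| lam = 1.0 | |
|  | |
| def score(G, H): | |
|     return G**2 / (H + lam)       # similarity score G² / (H + λ) | |
|  | |
| # Split at Size < 950: left = houses 1,2 ; right = houses 3,4,5 | |
| GL, HL = g[:2].sum(), h[:2].sum()   # +58, 2 | |
| GR, HR = g[2:].sum(), h[2:].sum()   # -58, 3 | |
| gain = score(GL, HL) + score(GR, HR) - score(g.sum(), h.sum()) | |
| print(gain)                          # 1962.33 | |
|  | |
| w = np.where(np.arange(5) < 2, -GL / (HL + lam), -GR / (HR + lam)) | |
| print(F0 + w)                        # [134.67 134.67 168.5 168.5 168.5]</pre> | |
| </div> | |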
| <h3>📊 Visualizations</h3> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="xgb-gain-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Gain calculation showing GL, GR, HL, HR for | |
| each split</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="xgb-regularization-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Regularization effect - comparing λ=0, 1, | |
| 10</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="xgb-hessian-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 3:</strong> Hessian contribution to better optimization | |
| </p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 350px"> | |
| <canvas id="xgb-leaf-weights-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 4:</strong> Leaf weight calculation breakdown</p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="xgb-comparison-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 5:</strong> Gradient Boosting vs XGBoost performance | |
| comparison</p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Key Advantages of XGBoost</div> | |
| <div class="callout-content"> | |
| <strong>Mathematical Improvements:</strong><br> | |
| ✓ 2nd order derivatives → Better approximation<br> | |
| ✓ Regularization (λ) → Prevents overfitting<br> | |
| ✓ Gain-based splitting → More accurate<br> | |
| <br> | |
| <strong>Engineering Improvements:</strong><br> | |
| ✓ Parallel processing → Faster training<br> | |
| ✓ Handles missing values → More robust<br> | |
| ✓ Built-in cross-validation → Easy tuning<br> | |
| ✓ Tree pruning → Better generalization<br> | |
| ✓ Cache optimization → Memory efficient<br> | |
| <br> | |
| <strong>Real-World Impact:</strong><br> | |
| • Most popular algorithm for structured data<br> | |
| • Dominates Kaggle competitions<br> | |
| • Used by: Uber, Airbnb, Microsoft, etc. | |
| </div> | |
| </div> | |
| <h3>Hyperparameter Guide</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Parameter</th> | |
| <th>Description</th> | |
| <th>Typical Values</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>learning_rate (η)</td> | |
| <td>Step size shrinkage</td> | |
| <td>0.01 - 0.3</td> | |
| </tr> | |
| <tr> | |
| <td>n_estimators</td> | |
| <td>Number of trees</td> | |
| <td>100 - 1000</td> | |
| </tr> | |
| <tr> | |
| <td>max_depth</td> | |
| <td>Tree depth</td> | |
| <td>3 - 10</td> | |
| </tr> | |
| <tr> | |
| <td>lambda (λ)</td> | |
| <td>L2 regularization</td> | |
| <td>0 - 10</td> | |
| </tr> | |
| <tr> | |
| <td>alpha (α)</td> | |
| <td>L1 regularization</td> | |
| <td>0 - 10</td> | |
| </tr> | |
| <tr> | |
| <td>subsample</td> | |
| <td>Row sampling</td> | |
| <td>0.5 - 1.0</td> | |
| </tr> | |
| <tr> | |
| <td>colsample_bytree</td> | |
| <td>Column sampling</td> | |
| <td>0.5 - 1.0</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
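| <p>In practice you rarely code this by hand. A minimal sketch using the xgboost package's scikit-learn wrapper (assuming the package is installed; data is the five-house example above) shows where each hyperparameter plugs in:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from xgboost import XGBRegressor  # assumes the xgboost package is installed | |
|  | |
| X = np.array([[800], [900], [1000], [1100], [1200]])  # size (sq ft) | |
| y = np.array([120, 130, 150, 170, 200])               # price | |
|  | |
| model = XGBRegressor( | |
|     learning_rate=0.1,     # η: step size shrinkage | |
|     n_estimators=300,      # number of trees | |
|     max_depth=3,           # tree depth | |
|     reg_lambda=1.0,        # λ: L2 regularization | |
|     reg_alpha=0.0,         # α: L1 regularization | |
|     subsample=1.0,         # row sampling (1.0 = use all rows) | |
|     colsample_bytree=1.0,  # column sampling (only 1 feature here) | |
| ) | |
| model.fit(X, y) | |
| print(model.predict(np.array([[950]])))</pre> | |
| </div> | |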
| </div> | |
| </div> | |
| <!-- Section 19c: Bagging (Renamed) --> | |
| <div class="section" id="bagging"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Ensemble</span> Bagging (Bootstrap Aggregating)</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Bagging trains multiple models on different random subsets of data (with replacement), then | |
| averages predictions. It's the foundation for Random Forest!</p> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="bagging-complete-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Bagging process showing 3 trees and averaged | |
| prediction</p> | |
| </div> | |
| <p>See the <a href="#ensemble-methods">Ensemble Methods Overview</a> section below for complete | |
| mathematical walkthrough.</p> | |
| </div> | |
| </div> | |
| <!-- Section 19d: Boosting/AdaBoost (Renamed) --> | |
| <div class="section" id="boosting-adaboost"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Ensemble</span> Boosting (AdaBoost)</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>AdaBoost trains models sequentially, where each new model focuses on examples the previous models | |
| got wrong by adjusting sample weights.</p> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="boosting-complete-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Boosting rounds showing weight updates and | |
| error reduction</p> | |
| </div> | |
| <p>See the <a href="#ensemble-methods">Ensemble Methods Overview</a> section below for complete | |
| mathematical walkthrough.</p> | |
| </div> | |
| </div> | |
| <!-- Section 19e: Random Forest (Renamed) --> | |
| <div class="section" id="random-forest"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 Supervised | |
| - Ensemble</span> Random Forest</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Random Forest combines bagging with feature randomness. Each tree is trained on a bootstrap | |
| sample AND considers only random subsets of features at each split!</p> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 500px"> | |
| <canvas id="rf-complete-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Random Forest showing feature randomness and | |
| OOB validation</p> | |
| </div> | |
| <p>See the <a href="#ensemble-methods">Ensemble Methods Overview</a> section below for complete | |
| mathematical walkthrough.</p> | |
| </div> | |
| </div> | |
| <!-- Section 19: Ensemble Methods (COMPREHENSIVE FROM PDF) --> | |
| <div class="section" id="ensemble-methods"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">📊 | |
| Supervised</span> Ensemble Methods</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>"Wisdom of the crowds" applied to machine learning! Ensemble methods combine multiple weak | |
| learners to create a strong learner. They power most Kaggle competition winners!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li>Combine multiple models for better predictions</li> | |
| <li>Bagging: Train on random subsets (parallel)</li> | |
| <li>Boosting: Sequential learning from mistakes</li> | |
| <li>Stacking: Meta-learner combines base models</li> | |
| </ul> | |
| </div> | |
| <h3>Why Ensembles Work</h3> | |
| <p>Imagine 100 doctors diagnosing a patient. If each doctor is 70% accurate and their mistakes are | |
| largely independent, the majority vote is correct well over 95% of the time! The same principle applies to ML.</p> | |
| <div class="callout success"> | |
| <div class="callout-title">🎯 The Magic of Diversity</div> | |
| <div class="callout-content"> | |
| <strong>Key insight:</strong> Each model makes DIFFERENT errors!<br> | |
| <br> | |
| Model A: Correct on samples [1,2,3,5,7,9] - 60% accuracy<br> | |
| Model B: Correct on samples [2,4,5,6,8,10] - 60% accuracy<br> | |
| Model C: Correct on samples [1,3,4,6,7,8] - 60% accuracy<br> | |
| <br> | |
| <strong>Majority vote:</strong> Correct on [1,2,3,4,5,6,7,8] - 80% accuracy!<br> | |
| <br> | |
| Diversity reduces variance! | |
| </div> | |
| </div> | |
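| <p>You can verify the 80% figure directly; this tiny sketch counts, for each sample, how many of the three models are correct:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| # Which samples (1-10) each 60%-accurate model classifies correctly | |
| correct = { | |
|     "A": {1, 2, 3, 5, 7, 9}, | |
|     "B": {2, 4, 5, 6, 8, 10}, | |
|     "C": {1, 3, 4, 6, 7, 8}, | |
| } | |
|  | |
| # Majority vote is right wherever at least 2 of the 3 models are right | |
| won = [s for s in range(1, 11) | |
|        if sum(s in c for c in correct.values()) >= 2] | |
| print(won)               # [1, 2, 3, 4, 5, 6, 7, 8] | |
| print(len(won) / 10)     # 0.8 → 80% accuracy</pre> | |
| </div> | |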
| <h3>🎯 Method 1: Bagging (Bootstrap Aggregating) - Complete Walkthrough</h3> | |
| <p>Train multiple models on different random subsets of data (with replacement), then average | |
| predictions.</p> | |
| <h4>Dataset: 6 Properties</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Row</th> | |
| <th>Square Feet</th> | |
| <th>Price (Lakhs)</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>A</td> | |
| <td>900</td> | |
| <td>70</td> | |
| </tr> | |
| <tr> | |
| <td>B</td> | |
| <td>1000</td> | |
| <td>80</td> | |
| </tr> | |
| <tr> | |
| <td>C</td> | |
| <td>900</td> | |
| <td>70</td> | |
| </tr> | |
| <tr> | |
| <td>D</td> | |
| <td>1500</td> | |
| <td>90</td> | |
| </tr> | |
| <tr> | |
| <td>E</td> | |
| <td>1600</td> | |
| <td>95</td> | |
| </tr> | |
| <tr> | |
| <td>F</td> | |
| <td>1700</td> | |
| <td>100</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Create Bootstrap Samples (WITH Replacement)</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Bootstrap Sample 1:</strong> | |
| Randomly pick 6 samples WITH replacement: | |
| ├─ Row A: 900 sq ft, ₹70L (sampled TWICE!) | |
| ├─ Row A: 900 sq ft, ₹70L (duplicate) | |
| ├─ Row B: 1000 sq ft, ₹80L | |
| ├─ Row D: 1500 sq ft, ₹90L | |
| ├─ Row E: 1600 sq ft, ₹95L | |
| └─ Row F: 1700 sq ft, ₹100L | |
| <strong style="color: #6aa9ff;">Bootstrap Sample 2:</strong> | |
| ├─ Row C: 900 sq ft, ₹70L | |
| ├─ Row D: 1500 sq ft, ₹90L | |
| ├─ Row E: 1600 sq ft, ₹95L | |
| ├─ Row E: 1600 sq ft, ₹95L (sampled TWICE!) | |
| ├─ Row F: 1700 sq ft, ₹100L | |
| └─ Row B: 1000 sq ft, ₹80L | |
| <strong style="color: #6aa9ff;">Bootstrap Sample 3:</strong> | |
| ├─ Row F: 1700 sq ft, ₹100L | |
| ├─ Row C: 900 sq ft, ₹70L | |
| ├─ Row E: 1600 sq ft, ₹95L | |
| ├─ Row A: 900 sq ft, ₹70L | |
| ├─ Row B: 1000 sq ft, ₹80L | |
| └─ Row D: 1500 sq ft, ₹90L | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Train Separate Model on Each Sample</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #7ef0d4;">Tree 1:</strong> Trained on Sample 1 | |
| • Learns splits based on its data | |
| • For 950 sq ft → Predicts: <strong>₹75L</strong> | |
| <strong style="color: #7ef0d4;">Tree 2:</strong> Trained on Sample 2 | |
| • Different data → Different splits! | |
| • For 950 sq ft → Predicts: <strong>₹72L</strong> | |
| <strong style="color: #7ef0d4;">Tree 3:</strong> Trained on Sample 3 | |
| • Yet another perspective | |
| • For 950 sq ft → Predicts: <strong>₹78L</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Aggregate Predictions (Average)</div> | |
| <div class="step-calculation"> | |
| For test property with 950 sq ft: | |
| Prediction₁ = ₹75L | |
| Prediction₂ = ₹72L | |
| Prediction₃ = ₹78L | |
| <strong style="color: #6aa9ff;">Final Bagging Prediction:</strong> | |
| Average = (75 + 72 + 78) / 3 | |
| = 225 / 3 | |
| = <strong style="color: #7ef0d4; font-size: 18px;">₹75 Lakhs</strong> ✓ | |
| <strong style="color: #7ef0d4;">Why it works:</strong> | |
| • Each tree makes slightly different errors | |
| • Averaging reduces overall variance | |
| • More stable than single tree! | |
| </div> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="bagging-ensemble-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Bagging process showing 3 trees and averaged | |
| prediction</p> | |
| </div> | |
| <div class="formula"> | |
| <strong>Bagging Algorithm:</strong><br> | |
| 1. Create B bootstrap samples (random sampling with replacement)<br> | |
| 2. Train a model on each sample independently<br> | |
| 3. For prediction:<br> | |
| • Regression: Average all predictions<br> | |
| • Classification: Majority vote<br> | |
| <br> | |
| <strong>Effect:</strong> Reduces variance, prevents overfitting | |
| </div> | |
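| <p>Here is a minimal sketch of the same three steps with NumPy and scikit-learn (the bootstrap draws are random, so the three tree predictions will not match the ₹75/72/78 example exactly):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.tree import DecisionTreeRegressor | |
|  | |
| rng = np.random.default_rng(42) | |
| X = np.array([[900], [1000], [900], [1500], [1600], [1700]])  # rows A-F, sq ft | |
| y = np.array([70, 80, 70, 90, 95, 100])                       # price (lakhs) | |
|  | |
| # STEP 1 + 2: bootstrap samples (WITH replacement), one tree per sample | |
| trees = [] | |
| for _ in range(3): | |
|     idx = rng.integers(0, len(X), size=len(X)) | |
|     trees.append(DecisionTreeRegressor().fit(X[idx], y[idx])) | |
|  | |
| # STEP 3: aggregate by averaging the trees' predictions for 950 sq ft | |
| preds = [t.predict([[950]])[0] for t in trees] | |
| print(preds, "→ bagged prediction:", np.mean(preds))</pre> | |
| </div> | |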
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="bagging-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Bagging process - multiple models from | |
| bootstrap samples</p> | |
| </div> | |
| <h3>🎯 Method 2: Boosting (Sequential Learning) - Complete Walkthrough</h3> | |
| <p>Train models sequentially, where each new model focuses on examples the previous models got | |
| wrong.</p> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Round 1 - Train Model 1 on All Data (Equal Weights)</div> | |
| <div class="step-calculation"> | |
| Original Dataset (all samples weighted equally: w=1.0): | |
| ├─ 800 sq ft → Actual: ₹50L | |
| ├─ 850 sq ft → Actual: ₹52L | |
| ├─ 900 sq ft → Actual: ₹54L | |
| ├─ 1500 sq ft → Actual: ₹90L | |
| ├─ 1600 sq ft → Actual: ₹95L | |
| └─ 1700 sq ft → Actual: ₹100L | |
| <strong style="color: #6aa9ff;">Model 1 Predictions:</strong> | |
| ├─ 800 sq ft → Predicts: ₹52L (Error: -2) | |
| ├─ 850 sq ft → Predicts: ₹54L (Error: -2) | |
| ├─ 900 sq ft → Predicts: ₹56L (Error: -2) | |
| ├─ 1500 sq ft → Predicts: ₹70L (Error: +20) ⚠️ | |
| ├─ 1600 sq ft → Predicts: ₹72L (Error: +23) ⚠️ | |
| └─ 1700 sq ft → Predicts: ₹75L (Error: +25) ⚠️ | |
| <strong style="color: #ff8c6a;">Large errors on rows 4, 5, 6!</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Round 2 - Increase Weights on Misclassified</div> | |
| <div class="step-calculation"> | |
| Update weights based on errors: | |
| ├─ Row 1: w = 1.0 (small error) | |
| ├─ Row 2: w = 1.0 (small error) | |
| ├─ Row 3: w = 1.0 (small error) | |
| ├─ Row 4: w = 2.5 (large error → FOCUS!) 🎯 | |
| ├─ Row 5: w = 3.0 (large error → FOCUS!) 🎯 | |
| └─ Row 6: w = 2.5 (large error → FOCUS!) 🎯 | |
| <strong style="color: #6aa9ff;">Train Model 2 with these weights:</strong> | |
| Model 2 focuses on the high-priced properties! | |
| <strong style="color: #6aa9ff;">Model 2 Predictions:</strong> | |
| ├─ 800 sq ft → ₹71L (Error: -21) | |
| ├─ 850 sq ft → ₹73L (Error: -21) | |
| ├─ 900 sq ft → ₹74L (Error: -20) | |
| ├─ 1500 sq ft → ₹90L (Error: 0) ✓ | |
| ├─ 1600 sq ft → ₹94L (Error: +1) ✓ | |
| └─ 1700 sq ft → ₹100L (Error: 0) ✓ | |
| <strong style="color: #7ef0d4;">Better on high-priced properties!</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Round 3 - Further Refine</div> | |
| <div class="step-calculation"> | |
| Update weights again: | |
| ├─ Rows 1,2,3 still have errors → increase weights | |
| ├─ Rows 4,5,6 now accurate → decrease weights | |
| <strong style="color: #6aa9ff;">Model 3 Predictions:</strong> | |
| ├─ 800 sq ft → ₹52L (Error: -2) | |
| ├─ 850 sq ft → ₹54L (Error: -2) | |
| ├─ 900 sq ft → ₹55L (Error: -1) | |
| ├─ 1500 sq ft → ₹89L (Error: +1) | |
| ├─ 1600 sq ft → ₹93L (Error: +2) | |
| └─ 1700 sq ft → ₹99L (Error: +1) | |
| <strong style="color: #7ef0d4;">All errors are now small!</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Combine with Weights</div> | |
| <div class="step-calculation"> | |
| Model weights (based on accuracy): | |
| • Model 1: α₁ = 0.2 (least accurate) | |
| • Model 2: α₂ = 0.3 (medium accuracy) | |
| • Model 3: α₃ = 0.5 (most accurate) | |
| <strong style="color: #6aa9ff;">Final Prediction for 950 sq ft:</strong> | |
| Pred₁ = ₹56L, Pred₂ = ₹74L, Pred₃ = ₹55L | |
| Weighted Average = α₁×Pred₁ + α₂×Pred₂ + α₃×Pred₃ | |
| = 0.2×56 + 0.3×74 + 0.5×55 | |
| = 11.2 + 22.2 + 27.5 | |
| = <strong style="color: #7ef0d4; font-size: 18px;">₹60.9 Lakhs</strong> ✓ | |
| <strong style="color: #7ef0d4;">Weighting by accuracy lets the strongest model dominate while the others hedge its mistakes!</strong> | |
| </div> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="boosting-ensemble-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Boosting rounds showing weight updates and | |
| error reduction</p> | |
| </div> | |
| <div class="formula"> | |
| <strong>Boosting Algorithm:</strong><br> | |
| 1. Start with equal weights for all samples<br> | |
| 2. Train model on weighted data<br> | |
| 3. Increase weights for misclassified samples<br> | |
| 4. Train next model (focuses on hard examples)<br> | |
| 5. Repeat for M iterations<br> | |
| 6. Final prediction = weighted vote of all models<br> | |
| <br> | |
| <strong>Effect:</strong> Reduces bias AND variance | |
| </div> | |
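| <p>scikit-learn implements this reweighting loop for you. A minimal sketch on the six-house data (AdaBoostRegressor uses shallow decision trees as its default weak learner):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.ensemble import AdaBoostRegressor | |
|  | |
| X = np.array([[800], [850], [900], [1500], [1600], [1700]])  # sq ft | |
| y = np.array([50, 52, 54, 90, 95, 100])                      # price (lakhs) | |
|  | |
| # Each round reweights the samples the previous rounds got wrong | |
| model = AdaBoostRegressor(n_estimators=50, learning_rate=0.5, random_state=0) | |
| model.fit(X, y) | |
| print(model.predict([[950]]))   # combined (weighted) prediction</pre> | |
| </div> | |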
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="boosting-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Boosting iteration - focusing on | |
| misclassified points</p> | |
| </div> | |
| <h3>🎯 Random Forest: Complete Walkthrough</h3> | |
| <p>The most popular ensemble method! Combines bagging with feature randomness.</p> | |
| <h4>Dataset: House Prices with 3 Features</h4> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Square Feet</th> | |
| <th>Bedrooms</th> | |
| <th>Age (years)</th> | |
| <th>Price (Lakhs)</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>800</td> | |
| <td>2</td> | |
| <td>10</td> | |
| <td>50</td> | |
| </tr> | |
| <tr> | |
| <td>850</td> | |
| <td>2</td> | |
| <td>8</td> | |
| <td>52</td> | |
| </tr> | |
| <tr> | |
| <td>900</td> | |
| <td>2</td> | |
| <td>5</td> | |
| <td>54</td> | |
| </tr> | |
| <tr> | |
| <td>1500</td> | |
| <td>3</td> | |
| <td>3</td> | |
| <td>90</td> | |
| </tr> | |
| <tr> | |
| <td>1600</td> | |
| <td>3</td> | |
| <td>2</td> | |
| <td>95</td> | |
| </tr> | |
| <tr> | |
| <td>1700</td> | |
| <td>4</td> | |
| <td>1</td> | |
| <td>100</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Key Parameter:</strong> Max Features = 2 (random subset at each split). Rows are labeled A-F from top to bottom.</p> | |
| <div class="step"> | |
| <div class="step-title">STEP 1: Tree 1 - Random Features at Each Split</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Bootstrap Sample 1:</strong> {A, A, B, D, E, F} | |
| <strong>Root Split:</strong> | |
| Available features: [Square Feet, Bedrooms, Age] | |
| Randomly select 2: <strong style="color: #7ef0d4;">[Square Feet, Age]</strong> | |
| Test splits: | |
| • Square Feet = 1200: Variance Reduction = 450 ← BEST! | |
| • Age = 5: Variance Reduction = 120 | |
| <strong style="color: #7ef0d4;">Choose: Split at Square Feet = 1200</strong> | |
| <strong>Left Child Split:</strong> | |
| Samples: {A, A, B} - all small houses | |
| Randomly select 2: <strong style="color: #7ef0d4;">[Bedrooms, Age]</strong> | |
| • Both have 2 bedrooms → split by Age | |
| • Age = 9: Best split | |
| <strong>Right Child Split:</strong> | |
| Samples: {D, E, F} - all large houses | |
| Randomly select 2: <strong style="color: #7ef0d4;">[Square Feet, Bedrooms]</strong> | |
| • Split at Square Feet = 1550 | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 2: Tree 2 - Different Bootstrap, Different Features</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Bootstrap Sample 2:</strong> {B, C, C, D, E, F} | |
| <strong>Root Split:</strong> | |
| Randomly select 2: <strong style="color: #7ef0d4;">[Square Feet, Bedrooms]</strong> | |
| (different!) | |
| • Square Feet = 1100: Variance Reduction = 420 | |
| • Bedrooms = 2.5: Variance Reduction = 380 | |
| <strong style="color: #7ef0d4;">Choose: Split at Square Feet = 1100</strong> | |
| This tree has DIFFERENT structure than Tree 1! | |
| → More diversity = Better ensemble | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 3: Continue for 100 Trees</div> | |
| <div class="step-calculation"> | |
| Repeat process 100 times: | |
| • Each tree gets different bootstrap sample | |
| • Each split considers different random features | |
| • Creates 100 diverse trees! | |
| Tree predictions for 950 sq ft: | |
| ├─ Tree 1: ₹74L | |
| ├─ Tree 2: ₹76L | |
| ├─ Tree 3: ₹75L | |
| ├─ Tree 4: ₹73L | |
| ├─ ... | |
| └─ Tree 100: ₹75L | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 4: Average All Predictions</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">Final Random Forest Prediction:</strong> | |
| Average of 100 trees: | |
| = (74 + 76 + 75 + 73 + ... + 75) / 100 | |
| = <strong style="color: #7ef0d4; font-size: 18px;">₹75.2 Lakhs</strong> ✓ | |
| Spread across trees (std dev): | |
| = ±2.3 Lakhs | |
| <strong style="color: #7ef0d4;">Result: ₹75.2L ± ₹2.3L</strong> | |
| </div> | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">STEP 5: Out-of-Bag (OOB) Error Estimation</div> | |
| <div class="step-calculation"> | |
| <strong style="color: #6aa9ff;">OOB Validation (FREE!):</strong> | |
| For each original sample: | |
| ├─ Find trees that did NOT include it in bootstrap | |
| ├─ Use those trees to predict | |
| ├─ Compare with actual value | |
| Example - Row A (800 sq ft, ₹50L): | |
| ├─ Not in bootstrap of Trees: 12, 25, 38, 51, ..., 94 | |
| ├─ Average prediction from those trees: ₹48.5L | |
| ├─ Error: |50 - 48.5| = 1.5L | |
| Repeat for all 6 samples: | |
| OOB MAE = Average of all errors = <strong style="color: #7ef0d4;">₹2.1L</strong> | |
| <strong style="color: #7ef0d4;">✓ Estimate test error WITHOUT separate test set!</strong> | |
| </div> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 500px"> | |
| <canvas id="rf-ensemble-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Random Forest showing feature randomness and | |
| OOB validation</p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Why Random Forest Works So Well</div> | |
| <div class="callout-content"> | |
| <strong>Two sources of randomness:</strong><br> | |
| 1. <strong>Bootstrap sampling:</strong> Each tree sees different data<br> | |
| 2. <strong>Feature randomness:</strong> Each split considers random feature subset<br> | |
| <br> | |
| This creates diverse trees that make DIFFERENT errors!<br> | |
| → Averaging cancels out individual mistakes<br> | |
| → More robust than bagging alone<br> | |
| <br> | |
| <strong>Bonus:</strong> OOB samples give free validation estimate! | |
| </div> | |
| </div> | |
| <div class="formula"> | |
| <strong>Random Forest Algorithm:</strong><br> | |
| 1. Create B bootstrap samples<br> | |
| 2. For each sample:<br> | |
| • Grow decision tree<br> | |
| • At each split, consider random subset of features<br> | |
| • Don't prune (let trees overfit!)<br> | |
| 3. Final prediction = average/vote of all trees<br> | |
| <br> | |
| <strong>Typical values:</strong> B=100-500 trees, √features per split | |
| </div> | |
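| <p>A minimal scikit-learn sketch of the same recipe on the three-feature dataset, with max_features=2 for the random feature subsets and oob_score=True for the free OOB estimate (reported here as R² rather than MAE):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.ensemble import RandomForestRegressor | |
|  | |
| # Columns: square feet, bedrooms, age (years) | |
| X = np.array([[800, 2, 10], [850, 2, 8], [900, 2, 5], | |
|               [1500, 3, 3], [1600, 3, 2], [1700, 4, 1]]) | |
| y = np.array([50, 52, 54, 90, 95, 100])   # price (lakhs) | |
|  | |
| rf = RandomForestRegressor( | |
|     n_estimators=100,   # 100 diverse trees | |
|     max_features=2,     # random subset of 2 features at each split | |
|     oob_score=True,     # validate on out-of-bag samples for free | |
|     random_state=0, | |
| ) | |
| rf.fit(X, y) | |
| print(rf.predict([[950, 2, 6]]))  # averaged over all 100 trees | |
| print(rf.oob_score_)              # OOB R² estimate</pre> | |
| </div> | |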
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="random-forest-viz" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 3:</strong> Random Forest - multiple diverse trees | |
| voting</p> | |
| </div> | |
| <h3>Comparison: Bagging vs Boosting</h3> | |
| <table class="data-table"> | |
| <thead> | |
| <tr> | |
| <th>Aspect</th> | |
| <th>Bagging</th> | |
| <th>Boosting</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>Training</td> | |
| <td>Parallel (independent)</td> | |
| <td>Sequential (dependent)</td> | |
| </tr> | |
| <tr> | |
| <td>Focus</td> | |
| <td>Reduce variance</td> | |
| <td>Reduce bias & variance</td> | |
| </tr> | |
| <tr> | |
| <td>Weights</td> | |
| <td>Equal for all samples</td> | |
| <td>Higher for hard samples</td> | |
| </tr> | |
| <tr> | |
| <td>Speed</td> | |
| <td>Fast (parallelizable)</td> | |
| <td>Slower (sequential)</td> | |
| </tr> | |
| <tr> | |
| <td>Overfitting</td> | |
| <td>Resistant</td> | |
| <td>Can overfit if too many iterations</td> | |
| </tr> | |
| <tr> | |
| <td>Examples</td> | |
| <td>Random Forest</td> | |
| <td>AdaBoost, Gradient Boosting, XGBoost</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h3>Real-World Success Stories</h3> | |
| <ul> | |
| <li><strong>Netflix Prize (2009):</strong> Winning team used ensemble of 100+ models</li> | |
| <li><strong>Kaggle competitions:</strong> the large majority of winning solutions on structured data use ensembles</li> | |
| <li><strong>XGBoost:</strong> Most popular algorithm for structured data</li> | |
| <li><strong>Random Forests:</strong> Default choice for many data scientists</li> | |
| </ul> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 When to Use Each Method</div> | |
| <div class="callout-content"> | |
| <strong>Use Random Forest when:</strong><br> | |
| • You want good accuracy with minimal tuning<br> | |
| • You have high-variance base models<br> | |
| • Interpretability is secondary<br> | |
| <br> | |
| <strong>Use Gradient Boosting (XGBoost) when:</strong><br> | |
| • You want maximum accuracy<br> | |
| • You can afford hyperparameter tuning<br> | |
| • You have high-bias base models<br> | |
| <br> | |
| <strong>Use Stacking when:</strong><br> | |
| • You want to combine very different model types<br> | |
| • You're in a competition (squeeze every 0.1%!) | |
| </div> | |
| </div> | |
| <h3>🎉 Course Complete!</h3> | |
| <p style="font-size: 18px; color: #7ef0d4; margin-top: 24px;"> | |
| Congratulations! You've mastered all 17 machine learning topics - from basic linear regression | |
| to advanced ensemble methods! You now have the knowledge to: | |
| </p> | |
| <ul style="color: #7ef0d4; font-size: 16px;"> | |
| <li>Choose the right algorithm for any problem</li> | |
| <li>Understand the math behind each method</li> | |
| <li>Tune hyperparameters systematically</li> | |
| <li>Evaluate models properly</li> | |
| <li>Build production-ready ML systems</li> | |
| </ul> | |
| <p style="font-size: 18px; color: #7ef0d4; margin-top: 16px;"> | |
| Keep practicing, building projects, and exploring! The ML journey never ends. 🚀✨ | |
| </p> | |
| </div> | |
| </div> | |
| <!-- Section: Hierarchical Clustering --> | |
| <div class="section" id="hierarchical-clustering"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔍 | |
| Unsupervised - Clustering</span> Hierarchical Clustering</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>Hierarchical Clustering builds a tree of clusters by repeatedly merging the closest pairs. No | |
| need to specify K upfront!</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Simple Steps</div> | |
| <ul class="info-card-list"> | |
| <li>Step 1: Start with each point as its own cluster</li> | |
| <li>Step 2: Find two closest clusters</li> | |
| <li>Step 3: Merge them into one cluster</li> | |
| <li>Step 4: Repeat until all in one cluster</li> | |
| <li>Result: Dendrogram tree showing hierarchy</li> | |
| </ul> | |
| </div> | |
| <div class="formula"> | |
| <strong>Distance Metrics:</strong><br> | |
| Euclidean: d = √((x2-x1)² + (y2-y1)²)<br> | |
| Manhattan: d = |x2-x1| + |y2-y1|<br> | |
| <br> | |
| <strong>Linkage Methods:</strong><br> | |
| • Complete: max distance between any two points<br> | |
| • Single: min distance between any two points<br> | |
| • Average: average distance between all points<br> | |
| • Ward: minimizes variance (BEST for most cases) | |
| </div> | |
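| <p>A minimal SciPy sketch (toy 2-D points invented for illustration): linkage builds the full merge history behind the dendrogram, and fcluster cuts the tree at a chosen number of clusters:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from scipy.cluster.hierarchy import linkage, fcluster | |
|  | |
| # Two obvious groups of 2-D points | |
| X = np.array([[1.0, 1.0], [1.2, 1.1], [0.9, 1.3], | |
|               [8.0, 8.0], [8.2, 7.9], [7.8, 8.3]]) | |
|  | |
| Z = linkage(X, method="ward")   # merge history = the dendrogram's data | |
| labels = fcluster(Z, t=2, criterion="maxclust")   # cut into 2 clusters | |
| print(labels)                   # e.g. [1 1 1 2 2 2]</pre> | |
| </div> | |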
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="hierarchical-dendrogram-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> Dendrogram showing cluster merging history | |
| </p> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 When to Use</div> | |
| <div class="callout-content"> | |
| ✓ Don't know number of clusters<br> | |
| ✓ Want to see cluster hierarchy<br> | |
| ✓ Small to medium datasets (<5000 points)<br> | |
| ✓ Need interpretable results | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section: DBSCAN --> | |
| <div class="section" id="dbscan"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔍 | |
| Unsupervised - Clustering</span> DBSCAN Clustering</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>DBSCAN finds clusters of arbitrary shapes and automatically detects outliers! Based on density, | |
| not distance to centroids.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Parameters</div> | |
| <ul class="info-card-list"> | |
| <li>eps: Neighborhood radius (e.g., 0.4)</li> | |
| <li>min_samples: Minimum points in neighborhood (e.g., 3)</li> | |
| <li>Core point: Has ≥ min_samples within eps</li> | |
| <li>Border point: Near core point but not core itself</li> | |
| <li>Outlier: Not near any core point</li> | |
| </ul> | |
| </div> | |
| <div class="formula"> | |
| <strong>Simple Algorithm:</strong><br> | |
| Step 1: Pick random unvisited point<br> | |
| Step 2: Find all points within eps radius<br> | |
| Step 3: If count ≥ min_samples → Core point!<br> | |
| Step 4: Mark all reachable points in same cluster<br> | |
| Step 5: Move to next unvisited point<br> | |
| Step 6: Points alone = Outliers ❌ | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Example: eps=0.4, min_samples=3</div> | |
| <div class="step-calculation"> | |
| <strong>Point A at (1, 1):</strong> | |
| Points within 0.4 units: [A, B, C] | |
| Count = 3 ✓ Core point! | |
| Start Cluster 1 with A, B, C | |
| <strong>Point D at (8, 8):</strong> | |
| Points within 0.4 units: [D, E] | |
| Count = 2 ✗ Not core | |
| But near core E → Border point in Cluster 2 | |
| <strong>Point G at (5, 5):</strong> | |
| No neighbors within 0.4 | |
| Mark as <strong style="color: #ff8c6a;">OUTLIER</strong> ❌ | |
| </div> | |
| </div> | |
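| <p>A minimal scikit-learn sketch mirroring the example above (coordinates invented so that two dense groups appear plus one lone point like G):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.cluster import DBSCAN | |
|  | |
| X = np.array([[1.0, 1.0], [1.1, 1.2], [0.9, 1.1],   # dense group near A | |
|               [8.0, 8.0], [8.1, 8.2], [7.9, 8.1],   # dense group near D | |
|               [5.0, 5.0]])                           # isolated point G | |
|  | |
| labels = DBSCAN(eps=0.4, min_samples=3).fit_predict(X) | |
| print(labels)   # e.g. [0 0 0 1 1 1 -1] → label -1 marks the outlier</pre> | |
| </div> | |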
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 450px"> | |
| <canvas id="dbscan-clusters-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure:</strong> DBSCAN showing core, border, and outlier | |
| points</p> | |
| </div> | |
| <div class="callout success"> | |
| <div class="callout-title">✅ Advantages</div> | |
| <div class="callout-content"> | |
| ✓ Finds clusters of ANY shape<br> | |
| ✓ Automatically detects outliers<br> | |
| ✓ No need to specify number of clusters<br> | |
| ✓ Robust to noise | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section: Clustering Evaluation --> | |
| <div class="section" id="clustering-evaluation"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(126, 240, 212, 0.3); color: #7ef0d4;">🔍 | |
| Unsupervised - Evaluation</span> Clustering Evaluation Metrics</h2> | |
| <button class="section-toggle collapsed">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>How do we know if our clustering is good? Use Silhouette Coefficient and Calinski-Harabasz Index! | |
| </p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Metrics</div> | |
| <ul class="info-card-list"> | |
| <li>Silhouette: Measures how well points fit in clusters</li> | |
| <li>Range: -1 to +1 (higher is better)</li> | |
| <li>Calinski-Harabasz: Between-cluster vs within-cluster variance</li> | |
| <li>Range: 0 to ∞ (higher is better)</li> | |
| </ul> | |
| </div> | |
| <h3>Silhouette Coefficient</h3> | |
| <div class="formula"> | |
| <strong>For each point:</strong><br> | |
| a = average distance to points in SAME cluster<br> | |
| b = average distance to points in NEAREST cluster<br> | |
| <br> | |
| Silhouette = (b - a) / max(a, b)<br> | |
| <br> | |
| <strong>Interpretation:</strong><br> | |
| +0.7 to +1.0: Excellent clustering<br> | |
| +0.5 to +0.7: Good clustering<br> | |
| +0.25 to +0.5: Weak clustering<br> | |
| < +0.25: Poor or no clustering | |
| </div> | |
| <div class="step"> | |
| <div class="step-title">Example Calculation</div> | |
| <div class="step-calculation"> | |
| <strong>Point A in Cluster 1:</strong> | |
| Distance to other points in Cluster 1: [0.1, 0.2] | |
| a = average = <strong>0.15</strong> | |
| Distance to nearest points in Cluster 2: [1.5, 1.8] | |
| b = average = <strong>1.65</strong> | |
| Silhouette(A) = (1.65 - 0.15) / 1.65 | |
| = 1.5 / 1.65 | |
| = <strong style="color: #7ef0d4;">0.909</strong> ✓ Excellent! | |
| </div> | |
| </div> | |
| <h3>Calinski-Harabasz Index</h3> | |
| <div class="formula"> | |
| <strong>Formula:</strong><br> | |
| CH = [Between-cluster variance / (k-1)] / [Within-cluster variance / (n-k)]<br> | |
| (n = number of points, k = number of clusters)<br> | |
| <br> | |
| <strong>Interpretation (rough rules of thumb; raw values depend on the dataset):</strong><br> | |
| 0-20: Poor clustering<br> | |
| 20-50: Okay clustering<br> | |
| 50-150: Good clustering<br> | |
| 150-500: Very good clustering<br> | |
| > 500: Excellent clustering | |
| </div> | |
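| <p>Both metrics are one-liners in scikit-learn; a minimal sketch on two well-separated toy clusters:</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
| from sklearn.metrics import silhouette_score, calinski_harabasz_score | |
|  | |
| X = np.array([[1.0, 1.0], [1.1, 1.2], [0.9, 1.1], | |
|               [8.0, 8.0], [8.1, 8.2], [7.9, 8.1]]) | |
| labels = np.array([0, 0, 0, 1, 1, 1]) | |
|  | |
| print(silhouette_score(X, labels))          # near +1 → excellent separation | |
| print(calinski_harabasz_score(X, labels))   # large → tight, distant clusters</pre> | |
| </div> | |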
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="silhouette-plot-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 1:</strong> Silhouette plot showing score per cluster | |
| </p> | |
| </div> | |
| <div class="figure"> | |
| <div class="figure-placeholder" style="height: 400px"> | |
| <canvas id="ch-index-canvas" style="width: 100%; height: 100%;"></canvas> | |
| </div> | |
| <p class="figure-caption"><strong>Figure 2:</strong> Calinski-Harabasz index vs number of | |
| clusters</p> | |
| </div> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Choosing the Right Metric</div> | |
| <div class="callout-content"> | |
| <strong>Silhouette:</strong> Best for interpretability, shows per-point quality<br> | |
| <strong>CH Index:</strong> Fast to compute, good for finding optimal k<br> | |
| <strong>Both together:</strong> Most reliable assessment! | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Section: PCA (NEW) --> | |
| <div class="section" id="pca"> | |
| <div class="section-header"> | |
| <h2><span class="badge" style="background: rgba(106, 169, 255, 0.3); color: #6aa9ff;">🔍 | |
| Unsupervised | |
| - Dimensionality Reduction</span> Principal Component Analysis (PCA)</h2> | |
| <button class="section-toggle">▼</button> | |
| </div> | |
| <div class="section-body"> | |
| <p>PCA is the most popular technique for reducing the number of features while preserving as much | |
| information as possible. It transforms your data into a new coordinate system where the axes | |
| (principal components) are ordered by importance.</p> | |
| <div class="info-card"> | |
| <div class="info-card-title">Key Concepts</div> | |
| <ul class="info-card-list"> | |
| <li><strong>Dimensionality Reduction:</strong> Reduce features while keeping important info | |
| </li> | |
| <li><strong>Principal Components:</strong> New features ordered by variance explained</li> | |
| <li><strong>Eigenvalues:</strong> Tell how much variance each component captures</li> | |
| <li><strong>Eigenvectors:</strong> Tell the direction of each component</li> | |
| </ul> | |
| </div> | |
| <h3>Why Use PCA?</h3> | |
| <ul> | |
| <li><strong>Curse of Dimensionality:</strong> Many features → sparse data → poor models</li> | |
| <li><strong>Visualization:</strong> Reduce to 2-3D for plotting</li> | |
| <li><strong>Speed:</strong> Fewer features = faster training</li> | |
| <li><strong>Noise Reduction:</strong> Lower components often capture noise</li> | |
| </ul> | |
| <!-- COMPREHENSIVE MATH SECTION --> | |
| <div class="info-card" | |
| style="background: linear-gradient(135deg, rgba(106, 169, 255, 0.1), rgba(126, 240, 212, 0.1)); border: 2px solid #6aa9ff; margin-top: 32px;"> | |
| <h3 style="color: #6aa9ff; margin-bottom: 20px;">📐 Complete Mathematical Derivation: PCA | |
| Step-by-Step</h3> | |
| <p style="color: #7ef0d4; font-weight: bold;">Let's reduce 2D data to 1D step-by-step!</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Original Data (5 points, 2 features)</strong><br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Point</th> | |
| <th style="padding: 8px;">x₁</th> | |
| <th style="padding: 8px;">x₂</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">A</td> | |
| <td style="text-align: center;">2.5</td> | |
| <td style="text-align: center;">2.4</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">B</td> | |
| <td style="text-align: center;">0.5</td> | |
| <td style="text-align: center;">0.7</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">C</td> | |
| <td style="text-align: center;">2.2</td> | |
| <td style="text-align: center;">2.9</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">D</td> | |
| <td style="text-align: center;">1.9</td> | |
| <td style="text-align: center;">2.2</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">E</td> | |
| <td style="text-align: center;">3.1</td> | |
| <td style="text-align: center;">3.0</td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 1: Center the Data (subtract mean)</strong><br><br> | |
| <strong>Calculate means:</strong><br> | |
| x̄₁ = (2.5 + 0.5 + 2.2 + 1.9 + 3.1) / 5 = <strong style="color: #7ef0d4;">2.04</strong><br> | |
| x̄₂ = (2.4 + 0.7 + 2.9 + 2.2 + 3.0) / 5 = <strong | |
| style="color: #7ef0d4;">2.24</strong><br><br> | |
| <strong>Centered data:</strong><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Point</th> | |
| <th style="padding: 8px;">x₁ - x̄₁</th> | |
| <th style="padding: 8px;">x₂ - x̄₂</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">A</td> | |
| <td style="text-align: center;">0.46</td> | |
| <td style="text-align: center;">0.16</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">B</td> | |
| <td style="text-align: center;">-1.54</td> | |
| <td style="text-align: center;">-1.54</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">C</td> | |
| <td style="text-align: center;">0.16</td> | |
| <td style="text-align: center;">0.66</td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">D</td> | |
| <td style="text-align: center;">-0.14</td> | |
| <td style="text-align: center;">-0.04</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">E</td> | |
| <td style="text-align: center;">1.06</td> | |
| <td style="text-align: center;">0.76</td> | |
| </tr> | |
| </table> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 2: Compute Covariance Matrix</strong><br><br> | |
| <strong>Covariance Formula:</strong> Cov(X,Y) = Σ(xᵢ-x̄)(yᵢ-ȳ) / (n-1)<br><br> | |
| <strong>Calculations:</strong><br> | |
| Var(x₁) = [(0.46)² + (-1.54)² + (0.16)² + (-0.14)² + (1.06)²] / 4 = <strong style="color: #7ef0d4;">0.94</strong><br> | |
| Var(x₂) = [(0.16)² + (-1.54)² + (0.66)² + (-0.04)² + (0.76)²] / 4 = <strong style="color: #7ef0d4;">0.85</strong><br> | |
| Cov(x₁,x₂) = [(0.46)(0.16) + (-1.54)(-1.54) + ...] / 4 = <strong style="color: #7ef0d4;">0.84</strong><br><br> | |
| <strong>Covariance Matrix:</strong><br> | |
| <pre style="color: #e8eef6; background: none; border: none; padding: 0;"> | |
| C = [0.94 0.84] | |
| [0.84 0.85]</pre> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 3: Find Eigenvalues and Eigenvectors</strong><br><br> | |
| <strong>Solve:</strong> det(C - λI) = 0<br><br> | |
| <strong>Eigenvalues (variance captured):</strong><br> | |
| λ₁ = <strong style="color: #7ef0d4;">1.74</strong> (first principal component)<br> | |
| λ₂ = <strong style="color: #ff8c6a;">0.05</strong> (second principal component)<br><br> | |
| <strong>Eigenvectors (directions):</strong><br> | |
| v₁ = [0.72, 0.69] (direction of max variance)<br> | |
| v₂ = [-0.69, 0.72] (perpendicular direction) | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 4: Calculate Variance Explained</strong><br><br> | |
| <strong>Total variance:</strong> λ₁ + λ₂ = 1.74 + 0.05 = 1.79<br><br> | |
| <strong>PC1 explains:</strong> 1.74 / 1.79 = <strong | |
| style="color: #7ef0d4; font-size: 18px;">97.2%</strong> of variance!<br> | |
| <strong>PC2 explains:</strong> 0.05 / 1.79 = 2.8% of variance<br><br> | |
| <em style="color: #a9b4c2;">Amazing! We can keep just PC1 and retain 97.2% of the | |
| information!</em> | |
| </div> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.9); padding: 20px; margin: 16px 0;"> | |
| <strong style="color: #ff8c6a;">Step 5: Transform Data (Project onto PC1)</strong><br><br> | |
| <strong>Formula:</strong> z = centered_data × eigenvector<br><br> | |
| <table style="width: 100%; color: #e8eef6; margin: 10px 0; border-collapse: collapse;"> | |
| <tr style="border-bottom: 2px solid #6aa9ff;"> | |
| <th style="padding: 8px;">Point</th> | |
| <th style="padding: 8px;">Original (x₁, x₂)</th> | |
| <th style="padding: 8px;">PC1 Score</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">A</td> | |
| <td style="text-align: center;">(2.5, 2.4)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.44</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">B</td> | |
| <td style="text-align: center;">(0.5, 0.7)</td> | |
| <td style="text-align: center; color: #ff8c6a;"><strong>-2.17</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">C</td> | |
| <td style="text-align: center;">(2.2, 2.9)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>0.57</strong></td> | |
| </tr> | |
| <tr style="background: rgba(106, 169, 255, 0.05);"> | |
| <td style="padding: 6px; text-align: center;">D</td> | |
| <td style="text-align: center;">(1.9, 2.2)</td> | |
| <td style="text-align: center;"><strong>-0.13</strong></td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 6px; text-align: center;">E</td> | |
| <td style="text-align: center;">(3.1, 3.0)</td> | |
| <td style="text-align: center; color: #7ef0d4;"><strong>1.29</strong></td> | |
| </tr> | |
| </table> | |
| <strong style="color: #7ef0d4; font-size: 16px;">We reduced 2D → 1D while keeping 97.2% of the | |
| information!</strong> | |
| </div> | |
| <div class="callout success" style="margin-top: 20px;"> | |
| <div class="callout-title">✓ PCA Summary</div> | |
| <div class="callout-content"> | |
| <strong>The PCA Algorithm:</strong><br> | |
| 1. <strong>Center</strong> the data (subtract mean)<br> | |
| 2. Compute <strong>covariance matrix</strong><br> | |
| 3. Find <strong>eigenvalues</strong> (importance) and <strong>eigenvectors</strong> | |
| (directions)<br> | |
| 4. Sort by eigenvalue, keep top k components<br> | |
| 5. <strong>Project</strong> data onto chosen components | |
| </div> | |
| </div> | |
| </div> | |
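| <p>You can verify the whole derivation with NumPy; this sketch repeats Steps 1-5 on the same five points (the PC1 scores may come out with flipped signs, since an eigenvector's direction is arbitrary):</p> | |
| <div class="formula" style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| import numpy as np | |
|  | |
| X = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]]) | |
|  | |
| Xc = X - X.mean(axis=0)            # Step 1: center the data | |
| C = np.cov(Xc, rowvar=False)       # Step 2: covariance matrix (n-1 divisor) | |
| vals, vecs = np.linalg.eigh(C)     # Step 3: eigenvalues come out ascending | |
| order = np.argsort(vals)[::-1]     # sort components by variance, descending | |
| vals, vecs = vals[order], vecs[:, order] | |
|  | |
| print(vals)                        # ≈ [1.74, 0.05] | |
| print(vals / vals.sum())           # Step 4: ≈ [0.97, 0.03] | |
| print(Xc @ vecs[:, 0])             # Step 5: PC1 scores for points A-E</pre> | |
| </div> | |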
| <h3>How Many Components to Keep?</h3> | |
| <div class="callout info"> | |
| <div class="callout-title">💡 Choosing k</div> | |
| <div class="callout-content"> | |
| <strong>Rule of thumb:</strong> Keep components that explain 90-95% of variance<br> | |
| <strong>Elbow method:</strong> Plot cumulative variance, look for the "elbow"<br> | |
| <strong>Domain knowledge:</strong> Sometimes you know you need 2D for visualization | |
| </div> | |
| </div> | |
| <h3>Python Code</h3> | |
| <div class="formula" | |
| style="background: rgba(26, 35, 50, 0.95); padding: 20px; margin: 16px 0; font-family: monospace;"> | |
| <pre style="color: #e8eef6; margin: 0;"> | |
| <span style="color: #ff8c6a;">from</span> sklearn.decomposition <span style="color: #ff8c6a;">import</span> PCA | |
| <span style="color: #ff8c6a;">from</span> sklearn.preprocessing <span style="color: #ff8c6a;">import</span> StandardScaler | |
| <span style="color: #6aa9ff;"># Step 1: Standardize data (important!)</span> | |
| scaler = StandardScaler() | |
| X_scaled = scaler.fit_transform(X) | |
| <span style="color: #6aa9ff;"># Step 2: Apply PCA</span> | |
| pca = PCA(n_components=<span style="color: #7ef0d4;">2</span>) <span style="color: #6aa9ff;"># Keep 2 components</span> | |
| X_pca = pca.fit_transform(X_scaled) | |
| <span style="color: #6aa9ff;"># Check variance explained</span> | |
| <span style="color: #ff8c6a;">print</span>(pca.explained_variance_ratio_) | |
| <span style="color: #6aa9ff;"># Output: [0.72, 0.18] → 90% with 2 components</span></pre> | |
| </div> | |
| </div> | |
| </div> | |
| </main> | |
| </div> | |
| <script src="app.js?v=4"></script> | |
| </body> | |
| </html> |