Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,34 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
|
| 3 |
-
# Define pipeline stages
|
| 4 |
-
development_stages = [
|
| 5 |
-
{"label": "Data Collection", "icon": "📥", "questions": [
|
| 6 |
-
"Where will you source the data from?",
|
| 7 |
-
"How will you ensure data quality?",
|
| 8 |
-
"Will you balance classes?"
|
| 9 |
-
]},
|
| 10 |
-
{"label": "Preprocessing", "icon": "🛠️", "questions": [
|
| 11 |
-
"What features will you select?",
|
| 12 |
-
"Will you impute missing values or remove them?",
|
| 13 |
-
"How will you handle outliers?"
|
| 14 |
-
]},
|
| 15 |
-
{"label": "Model Selection", "icon": "🤖", "questions": [
|
| 16 |
-
"Which algorithms will you consider?",
|
| 17 |
-
"Will you use pre-trained models?",
|
| 18 |
-
"How will you handle hyperparameters?"
|
| 19 |
-
]},
|
| 20 |
-
{"label": "Training", "icon": "🏋️", "questions": [
|
| 21 |
-
"What loss function will you use?",
|
| 22 |
-
"How will you split train/validation?",
|
| 23 |
-
"Will you use early stopping?"
|
| 24 |
-
]},
|
| 25 |
-
{"label": "Evaluation", "icon": "📊", "questions": [
|
| 26 |
-
"What metrics will you use?",
|
| 27 |
-
"Will you test on unseen data?",
|
| 28 |
-
"Will you consider fairness metrics?"
|
| 29 |
-
]}
|
| 30 |
-
]
|
| 31 |
-
|
| 32 |
def go_to(page_name, from_callback=False):
|
| 33 |
"""
|
| 34 |
Updates the session_state page and optionally triggers a rerun.
|
|
@@ -37,3 +8,196 @@ def go_to(page_name, from_callback=False):
|
|
| 37 |
st.session_state.page = page_name
|
| 38 |
if not from_callback:
|
| 39 |
st.rerun()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
def go_to(page_name, from_callback=False):
    """Switch the app to another page.

    Stores *page_name* in ``st.session_state.page`` and, unless the caller
    is a Streamlit widget callback (``from_callback=True``), forces an
    immediate rerun so the new page renders right away. Widget callbacks
    must skip the explicit rerun because Streamlit already reruns the
    script after a callback finishes.
    """
    st.session_state.page = page_name
    # Callbacks get an automatic rerun from Streamlit; only trigger one
    # ourselves when called from normal script flow.
    if from_callback:
        return
    st.rerun()
|
| 11 |
+
|
| 12 |
+
# Define pipeline stages.
# Three-level nested mapping used to drive the planning UI:
#   stage name -> category name -> task name -> list of guiding questions.
# Each task carries exactly three short prompt questions shown to the user.
pipeline_data = {
    "Data Collection": {
        "Data Sources": {
            "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
            "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
            "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
            "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
            "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
        },
        "Data Licensing & Permissions": {
            "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
            "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
            "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
            "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
            "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
        },
        "Data Quality Checks": {
            "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
            "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
            "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
            "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
            "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
        },
        "Data Volume Assessment": {
            "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
            "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
            "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
            "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
            "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
        },
        "Data Storage Setup": {
            "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
            "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
            "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
            "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
            "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
        },
    },

    "Preprocessing": {
        "Data Cleaning": {
            "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
            "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
            "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
            "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
            "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
        },
        "Feature Selection": {
            "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
            "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
            "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
            "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
            "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
        },
        "Feature Engineering": {
            "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
            "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
            "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
            "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
            "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
        },
        "Outlier Handling": {
            "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
            "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
            "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
            "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
            "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
        },
        "Scaling & Transformation": {
            "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
            "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
            "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
            "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
            "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
        },
    },

    "Model Selection": {
        "Algorithm Research": {
            "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
            "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
            "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
            "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
            "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
        },
        "Baseline Model Creation": {
            "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
            "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
            "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
            "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
            "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
        },
        "Pre-trained Model Exploration": {
            "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
            "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
            "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
            "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
            "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
        },
        "Hyperparameter Strategy": {
            "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
            "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
            "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
            "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
            "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
        },
        "Model Complexity Assessment": {
            "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
            "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
            "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
            "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
            "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
        },
    },

    "Training": {
        "Data Splitting": {
            "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
            "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
            "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
            "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
            "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
        },
        "Loss Function Choice": {
            "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
            "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
            "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
            "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
            "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
        },
        "Optimization Method": {
            "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
            "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
            "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
            "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
            "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
        },
        "Regularization": {
            "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
            "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
            "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
            "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
            "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
        },
        "Training Monitoring": {
            "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
            "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
            "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
            "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
            "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
        },
    },

    "Evaluation": {
        "Metric Selection": {
            "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
            "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
            "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
            "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
            "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
        },
        "Test Data Strategy": {
            "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
            "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
            "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
            "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
            "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
        },
        "Fairness Checks": {
            "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
            "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
            "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
            "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
            "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
        },
        "Robustness Testing": {
            "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
            "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
            "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
            "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
            "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
        },
        "Model Interpretability": {
            "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
            "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
            "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
            "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
            "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
        },
    },
}
|