prakharg24 committed on
Commit
1d745dd
·
verified ·
1 Parent(s): 82ff6e3

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +193 -29
utils.py CHANGED
@@ -1,34 +1,5 @@
1
  import streamlit as st
2
 
3
- # Define pipeline stages
4
- development_stages = [
5
- {"label": "Data Collection", "icon": "📥", "questions": [
6
- "Where will you source the data from?",
7
- "How will you ensure data quality?",
8
- "Will you balance classes?"
9
- ]},
10
- {"label": "Preprocessing", "icon": "🛠️", "questions": [
11
- "What features will you select?",
12
- "Will you impute missing values or remove them?",
13
- "How will you handle outliers?"
14
- ]},
15
- {"label": "Model Selection", "icon": "🤖", "questions": [
16
- "Which algorithms will you consider?",
17
- "Will you use pre-trained models?",
18
- "How will you handle hyperparameters?"
19
- ]},
20
- {"label": "Training", "icon": "🏋️", "questions": [
21
- "What loss function will you use?",
22
- "How will you split train/validation?",
23
- "Will you use early stopping?"
24
- ]},
25
- {"label": "Evaluation", "icon": "📊", "questions": [
26
- "What metrics will you use?",
27
- "Will you test on unseen data?",
28
- "Will you consider fairness metrics?"
29
- ]}
30
- ]
31
-
32
  def go_to(page_name, from_callback=False):
33
  """
34
  Updates the session_state page and optionally triggers a rerun.
@@ -37,3 +8,196 @@ def go_to(page_name, from_callback=False):
37
  st.session_state.page = page_name
38
  if not from_callback:
39
  st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def go_to(page_name, from_callback=False):
4
  """
5
  Updates the session_state page and optionally triggers a rerun.
 
8
  st.session_state.page = page_name
9
  if not from_callback:
10
  st.rerun()
11
+
12
+ # Define pipeline stages
13
+ pipeline_data = {
14
+ "Data Collection": {
15
+ "Data Sources": {
16
+ "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
17
+ "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
18
+ "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
19
+ "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
20
+ "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
21
+ },
22
+ "Data Licensing & Permissions": {
23
+ "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
24
+ "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
25
+ "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
26
+ "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
27
+ "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
28
+ },
29
+ "Data Quality Checks": {
30
+ "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
31
+ "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
32
+ "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
33
+ "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
34
+ "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
35
+ },
36
+ "Data Volume Assessment": {
37
+ "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
38
+ "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
39
+ "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
40
+ "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
41
+ "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
42
+ },
43
+ "Data Storage Setup": {
44
+ "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
45
+ "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
46
+ "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
47
+ "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
48
+ "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
49
+ },
50
+ },
51
+
52
+ "Preprocessing": {
53
+ "Data Cleaning": {
54
+ "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
55
+ "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
56
+ "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
57
+ "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
58
+ "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
59
+ },
60
+ "Feature Selection": {
61
+ "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
62
+ "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
63
+ "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
64
+ "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
65
+ "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
66
+ },
67
+ "Feature Engineering": {
68
+ "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
69
+ "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
70
+ "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
71
+ "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
72
+ "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
73
+ },
74
+ "Outlier Handling": {
75
+ "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
76
+ "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
77
+ "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
78
+ "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
79
+ "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
80
+ },
81
+ "Scaling & Transformation": {
82
+ "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
83
+ "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
84
+ "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
85
+ "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
86
+ "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
87
+ },
88
+ },
89
+
90
+ "Model Selection": {
91
+ "Algorithm Research": {
92
+ "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
93
+ "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
94
+ "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
95
+ "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
96
+ "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
97
+ },
98
+ "Baseline Model Creation": {
99
+ "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
100
+ "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
101
+ "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
102
+ "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
103
+ "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
104
+ },
105
+ "Pre-trained Model Exploration": {
106
+ "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
107
+ "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
108
+ "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
109
+ "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
110
+ "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
111
+ },
112
+ "Hyperparameter Strategy": {
113
+ "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
114
+ "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
115
+ "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
116
+ "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
117
+ "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
118
+ },
119
+ "Model Complexity Assessment": {
120
+ "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
121
+ "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
122
+ "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
123
+ "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
124
+ "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
125
+ },
126
+ },
127
+
128
+ "Training": {
129
+ "Data Splitting": {
130
+ "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
131
+ "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
132
+ "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
133
+ "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
134
+ "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
135
+ },
136
+ "Loss Function Choice": {
137
+ "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
138
+ "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
139
+ "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
140
+ "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
141
+ "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
142
+ },
143
+ "Optimization Method": {
144
+ "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
145
+ "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
146
+ "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
147
+ "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
148
+ "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
149
+ },
150
+ "Regularization": {
151
+ "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
152
+ "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
153
+ "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
154
+ "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
155
+ "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
156
+ },
157
+ "Training Monitoring": {
158
+ "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
159
+ "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
160
+ "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
161
+ "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
162
+ "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
163
+ },
164
+ },
165
+
166
+ "Evaluation": {
167
+ "Metric Selection": {
168
+ "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
169
+ "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
170
+ "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
171
+ "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
172
+ "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
173
+ },
174
+ "Test Data Strategy": {
175
+ "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
176
+ "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
177
+ "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
178
+ "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
179
+ "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
180
+ },
181
+ "Fairness Checks": {
182
+ "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
183
+ "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
184
+ "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
185
+ "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
186
+ "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
187
+ },
188
+ "Robustness Testing": {
189
+ "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
190
+ "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
191
+ "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
192
+ "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
193
+ "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
194
+ },
195
+ "Model Interpretability": {
196
+ "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
197
+ "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
198
+ "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
199
+ "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
200
+ "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
201
+ },
202
+ },
203
+ }