shaikhsalman committed on
Commit
9d4d5c7
·
verified ·
1 Parent(s): 36df1e5

refactor: merged structure - model at center, DevSecOps wrapped around it

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +60 -129
  2. ai-ml/hf-finetuning/TRAINING_RECIPE.md +0 -58
  3. ai-ml/hf-finetuning/run_finetune.py +0 -67
  4. ci-cd/gitlab-ci/.gitlab-ci.yml +0 -113
  5. ci-cd/jenkins/Jenkinsfile +0 -136
  6. compliance/{cis-benchmarks/cis-eks-k8s.yaml → cis-eks-k8s.yaml} +0 -0
  7. compliance/{soc2/controls-mapping.yaml → controls-mapping.yaml} +0 -0
  8. compliance/{nist/nist-800-53-mapping.yaml → nist-800-53-mapping.yaml} +0 -0
  9. {docker/base-images → deployment}/Dockerfile.ml-inference +0 -0
  10. {k8s/workloads/ml-pipeline → deployment}/deployment.yaml +0 -0
  11. {ai-ml/mlflow → deployment}/mlflow-deployment.yaml +0 -0
  12. docker/base-images/Dockerfile.backend +0 -51
  13. docker/base-images/Dockerfile.frontend +0 -33
  14. finops/cost-optimization.yaml +0 -73
  15. finops/finops-cronjob.yaml +0 -23
  16. finops/finops-policy.yaml +0 -73
  17. incident-response/auto-remediation/auto-remediate.sh +0 -50
  18. platform/adr/template.md → infrastructure/adr-template.md +0 -0
  19. {ci-cd/github-actions → infrastructure/ci-cd}/devsecops-pipeline.yml +0 -0
  20. {finops → infrastructure}/finops_scanner.py +0 -0
  21. {scripts/bash → infrastructure}/incident-response.sh +0 -0
  22. {monitoring → infrastructure/monitoring}/alertmanager/alertmanager-config.yaml +0 -0
  23. monitoring/grafana/dashboards/platform-overview.json → infrastructure/monitoring/grafana-platform-overview.json +0 -0
  24. {monitoring → infrastructure/monitoring}/otel/otel-collector.yaml +0 -0
  25. {monitoring → infrastructure/monitoring}/prometheus/alerts.yaml +0 -0
  26. incident-response/postmortem/template.md → infrastructure/postmortem-template.md +0 -0
  27. {terraform → infrastructure/terraform}/environments/prod/main.tf +0 -0
  28. {terraform → infrastructure/terraform}/modules/eks/main.tf +0 -0
  29. {terraform → infrastructure/terraform}/modules/eks/outputs.tf +0 -0
  30. {terraform → infrastructure/terraform}/modules/eks/variables.tf +0 -0
  31. {terraform → infrastructure/terraform}/modules/guardduty/main.tf +0 -0
  32. {terraform → infrastructure/terraform}/modules/guardduty/variables.tf +0 -0
  33. {terraform → infrastructure/terraform}/modules/iam/main.tf +0 -0
  34. {terraform → infrastructure/terraform}/modules/kms/main.tf +0 -0
  35. {terraform → infrastructure/terraform}/modules/macie/main.tf +0 -0
  36. {terraform → infrastructure/terraform}/modules/rds/main.tf +0 -0
  37. {terraform → infrastructure/terraform}/modules/rds/variables.tf +0 -0
  38. {terraform → infrastructure/terraform}/modules/s3/main.tf +0 -0
  39. {terraform → infrastructure/terraform}/modules/s3/variables.tf +0 -0
  40. {terraform → infrastructure/terraform}/modules/vpc/main.tf +0 -0
  41. {terraform → infrastructure/terraform}/modules/vpc/outputs.tf +0 -0
  42. {terraform → infrastructure/terraform}/modules/vpc/variables.tf +0 -0
  43. k8s/base/limit-ranges/limit-ranges.yaml +0 -74
  44. k8s/base/namespaces/namespaces.yaml +0 -69
  45. k8s/base/network-policies/network-policies.yaml +0 -124
  46. k8s/base/pdbs/pdbs.yaml +0 -62
  47. k8s/base/rbac/rbac.yaml +0 -78
  48. k8s/base/resource-quotas/resource-quotas.yaml +0 -50
  49. k8s/base/slos/slos.yaml +0 -68
  50. k8s/kustomize/base/kustomization.yaml +0 -18
README.md CHANGED
@@ -1,144 +1,75 @@
1
- # DevSecOps Platform OMEGA — Enterprise AI Operating System
 
 
 
 
 
 
 
 
 
 
2
 
3
- > Production-grade, security-first, automation-first platform covering the full DevOps, Cloud, Kubernetes, Security, AI/ML, FinOps, and Governance lifecycle.
4
 
5
- **156 files | 182KB | 13 domains | All production-ready**
6
 
7
- ## Architecture
8
 
9
- ```
10
- ENGINEERING COMMAND CENTER
11
- |
12
- +------------------+------------------+
13
- | | | | |
14
- RELIABILITY SECURITY FINOPS PLATFORM AI/ML
15
- (SLO/PDB) (GuardDuty) (Cost) (Golden (RAG/SFT)
16
- | | | Path) |
17
- +---------+--------+--------+---------+--+
18
- | |
19
- KUBERNETES TERRAFORM
20
- (Kustomize) (IaC Modules)
21
- | |
22
- AWS CLOUD INFRASTRUCTURE
23
- ```
24
-
25
- ## OMEGA 10-Dimension Scorecard
26
-
27
- | # | Dimension | Score | Assets |
28
- |---|-----------|-------|--------|
29
- | 1 | **Reliability** | 8/10 | PDBs, SLOs, HPA, multi-AZ, Istio |
30
- | 2 | **Security** | 9/10 | GuardDuty, Macie, Falco, Kyverno, Trivy, mTLS |
31
- | 3 | **Dev Velocity** | 7/10 | Golden paths, self-service envs, Kustomize |
32
- | 4 | **Cost Efficiency** | 7/10 | FinOps scanner, spot instances, scheduling policy |
33
- | 5 | **Governance** | 8/10 | SOC2, NIST 800-53, CIS, OPA, ADR template |
34
- | 6 | **Automation** | 7/10 | Bootstrap, auto-remediation, GitOps (ArgoCD) |
35
- | 7 | **Incident Recovery** | 8/10 | Runbook, postmortem template, war-room |
36
- | 8 | **Standardization** | 8/10 | Kustomize overlays, golden path templates |
37
- | 9 | **AI Enablement** | 8/10 | RAG, LoRA v2, MLflow, Trackio, GPU scheduling |
38
- | 10 | **Engineering Excellence** | 7/10 | ADR template, checklists, SRE standards |
39
-
40
- ## Platform Modules
41
-
42
- ### Infrastructure (Terraform)
43
- | Module | Purpose | Key Feature |
44
- |--------|---------|-------------|
45
- | VPC | Network isolation | Flow logs, default deny SG/NACL |
46
- | EKS | Kubernetes cluster | Private API, KMS encryption, IRSA |
47
- | RDS | Database | Multi-AZ, encrypted, performance insights |
48
- | S3 | Storage | SSE-KMS, versioning, lifecycle |
49
- | IAM | Access control | MFA, least privilege, access analyzer |
50
- | KMS | Key management | Auto-rotation, multi-key |
51
- | GuardDuty | Threat detection | EBS malware scan, K8s audit, S3 |
52
- | Macie | PII detection | Automated data classification |
53
-
54
- ### Kubernetes
55
- | Layer | Components |
56
- |-------|-----------|
57
- | **Base** | Namespaces, RBAC, NetPols, Quotas, Limits, PDBs, SLOs |
58
- | **Platform** | ArgoCD, Istio (mTLS), ExternalSecrets, CertManager |
59
- | **Security** | Trivy Operator, Falco (eBPF), Kyverno (7 policies), OPA |
60
- | **Observability** | Prometheus, Grafana, Loki, Alertmanager, OTEL |
61
- | **Workloads** | Frontend, Backend (HPA), ML Pipeline (GPU) |
62
-
63
- ### FinOps Engine
64
- | Asset | Purpose |
65
- |-------|---------|
66
- | finops-policy.yaml | 11 cost optimization rules |
67
- | finops_scanner.py | Automated waste detection |
68
- | cost-optimization.yaml | Spot instance strategy + KEDA |
69
- | finops-cronjob.yaml | Daily cost scan CronJob |
70
-
71
- ### Platform Engineering
72
- | Asset | Purpose |
73
- |-------|---------|
74
- | golden-paths/microservice/ | Production-ready service template + checklist |
75
- | self-service/ | Ephemeral environment provisioning config |
76
- | adr/template.md | Architecture Decision Record template |
77
- | kustomize/ | Base + dev/staging/prod overlays |
78
-
79
- ### Incident Response
80
- | Asset | Purpose |
81
- |-------|---------|
82
- | auto-remediate.sh | OOM fix, pod restart, security escalation |
83
- | postmortem/template.md | Full postmortem with 5 Whys + action items |
84
- | incident-response.sh | Diagnostic runbook (5 incident types) |
85
-
86
- ### AI/ML Hub
87
- | Asset | Purpose |
88
- |-------|---------|
89
- | finetune.py | LoRA Without Regret (r=256, all-linear) |
90
- | run_finetune.py | CLI entry point with dataset selection |
91
- | TRAINING_RECIPE.md | v1→v2 upgrade documentation |
92
- | rag_pipeline.py | LangChain + HF + ChromaDB RAG |
93
- | mlflow/ | MLflow tracking deployment |
94
-
95
- ### Compliance
96
- | Framework | Coverage |
97
- |-----------|---------|
98
- | SOC2 Type II | CC6-CC9 controls mapped |
99
- | NIST 800-53 | 12 controls mapped |
100
- | CIS Benchmarks | EKS + K8s automated |
101
- | OPA Gatekeeper | Admission policies |
102
-
103
- ### CI/CD Pipelines
104
- | System | Features |
105
- |--------|----------|
106
- | GitHub Actions | 6-stage DevSecOps (SAST→Build→Scan→Test→Sign→Deploy) |
107
- | Jenkins | Parallel SAST + production deployment |
108
- | GitLab CI | Full scan + sign + deploy pipeline |
109
-
110
- ## Quick Start
111
-
112
- ```bash
113
- # Bootstrap full platform
114
- ./scripts/bash/bootstrap.sh prod
115
 
116
- # Security audit
117
- python3 scripts/python/security_audit.py
118
 
119
- # FinOps cost scan
120
- python3 finops/finops_scanner.py
121
 
122
- # Incident response
123
- ./scripts/bash/incident-response.sh security
124
 
125
- # Auto-remediate
126
- ./incident-response/auto-remediation/auto-remediate.sh PodCrashLooping backend <pod-name>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ```
128
 
129
- ## Self-Improvement Checklist
130
 
131
- After every deployment, ask:
 
 
132
 
133
- - [ ] Can this be automated?
134
- - [ ] Can this be templated?
135
- - [ ] Can this be secured further?
136
- - [ ] Can this be cheaper?
137
- - [ ] Can this scale better?
138
- - [ ] Can this reduce human toil?
139
 
140
- If yes, enhance and push.
 
141
 
142
- ## Hub
 
143
 
144
- **[huggingface.co/shaikhsalman/devsecops-platform](https://huggingface.co/shaikhsalman/devsecops-platform)**
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - devsecops
5
+ - llm
6
+ - sft
7
+ - lora
8
+ - tulu-3
9
+ - kubernetes
10
+ - terraform
11
+ ---
12
 
13
+ # DevSecOps Model Platform
14
 
15
+ > Train a secure model on the best data, then deploy it securely.
16
 
17
+ ## Start Here: Train Your Model
18
 
19
+ | Dataset | Size | What It Gives You | Command |
20
+ |---------|------|-------------------|---------|
21
+ | **tulu-3-sft-mixture** | 940K | Math, code, safety, chat (BEST) | python model/train_tulu3.py |
22
+ | **OpenThoughts-114k** | 114K | Reasoning, chain-of-thought | python model/train_openthoughts.py |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ **allenai/tulu-3-sft-mixture** is from Allen AI Tulu 3 - current SOTA open instruction-tuned model. Proven on Llama-3.1-8B: MMLU 53.5, GSM8K 79.9, HumanEval 76.8.
 
25
 
26
+ LoRA config from LoRA Without Regret (Schulman 2025): r=256, alpha=16, all-linear = matches full fine-tuning at 67% compute.
 
27
 
28
+ ## Repository Structure
 
29
 
30
+ ```
31
+ model/ THE MODEL - train, serve, enhance
32
+ train_tulu3.py Primary: 940K best data (zero preprocessing)
33
+ train_openthoughts.py Reasoning: 114K CoT traces
34
+ finetune_configurable.py Multi-dataset configurable trainer
35
+ rag_pipeline.py RAG for DevSecOps knowledge
36
+ DATASETS.md Why these datasets, proven recipes
37
+
38
+ deployment/ SERVE IT - Kubernetes + Docker + vLLM
39
+ deployment.yaml ML inference K8s manifest
40
+ mlflow-deployment.yaml Experiment tracking
41
+ Dockerfile.ml-inference Hardened multi-stage image
42
+
43
+ security/ PROTECT IT - scanning + policies
44
+ scanning/ Trivy, Semgrep, Checkov, SBOM
45
+ policies/ Kyverno, OPA Gatekeeper
46
+
47
+ infrastructure/ RUN IT - Terraform + monitoring + CI/CD
48
+ terraform/ VPC, EKS, RDS, S3, IAM, KMS, GuardDuty, Macie
49
+ monitoring/ Prometheus, Alertmanager, OTEL, Grafana
50
+ ci-cd/ GitHub Actions DevSecOps pipeline
51
+
52
+ compliance/ CERTIFY IT - SOC2, NIST, CIS
53
+ controls-mapping.yaml SOC2 Type II
54
+ nist-800-53-mapping.yaml NIST 800-53 Rev5
55
+ cis-eks-k8s.yaml CIS Benchmarks
56
  ```
57
 
58
+ ## Quick Commands
59
 
60
+ ```bash
61
+ # Train on best data (A100, ~6h)
62
+ python model/train_tulu3.py
63
 
64
+ # Quick test (any GPU)
65
+ python model/train_tulu3.py --max_steps 100 --no_push
 
 
 
 
66
 
67
+ # Security scan
68
+ python security/scanning/security_audit.py
69
 
70
+ # Deploy model to K8s
71
+ kubectl apply -f deployment/deployment.yaml
72
 
73
+ # Infrastructure (Terraform)
74
+ cd infrastructure/terraform/environments/prod && terraform apply
75
+ ```
ai-ml/hf-finetuning/TRAINING_RECIPE.md DELETED
@@ -1,58 +0,0 @@
1
- # Model Enhancement — Dataset & Training Recipe vNext
2
-
3
- ## What Changed (v1 → v2)
4
-
5
- | Parameter | v1 (Old) | v2 (LoRA Without Regret) | Why |
6
- |-----------|----------|--------------------------|-----|
7
- | **Dataset** | ultrachat_200k (5K subset) | **tulu-3-sft-mixture** (940K) | 19 curated sources > single source |
8
- | **LoRA r** | 16 | **256** | SFT-scale datasets need r=256 to match full FT |
9
- | **LoRA alpha** | 32 | **16** | Stable scaling with high rank |
10
- | **Target modules** | q/k/v/o_proj only | **all-linear** | Attention-only underperforms even at higher rank |
11
- | **Effective batch** | 32 | **16** | LoRA less tolerant of large batches |
12
- | **Learning rate** | 2e-4 | **2e-4** (same) | 10x full FT rate — correct in v1 |
13
- | **Packing** | False | **True (bfd_split)** | Preserves all tokens, 2-3x throughput |
14
- | **assistant_only_loss** | False | **True** | Loss only on assistant tokens |
15
- | **EOS token** | Not set | **<\|eot_id\|>** | Llama 3.1 chat template |
16
- | **LR scheduler** | linear | **cosine** | Better convergence for LoRA |
17
- | **Epochs** | 3 | **1** | 940K examples = 1 epoch sufficient |
18
-
19
- ## Dataset Comparison
20
-
21
- | Dataset | Size | Format | Best For | Quality |
22
- |---------|------|--------|----------|---------|
23
- | **tulu-3-sft-mixture** | 940K | messages ✅ | General SFT (code, math, IF, safety, science) | ⭐⭐⭐⭐⭐ |
24
- | **OpenThoughts-114k** | 114K | conversations (needs conversion) | Reasoning, CoT traces | ⭐⭐⭐⭐ |
25
- | ultrachat_200k | 200K | messages ✅ | Multi-turn chat baseline | ⭐⭐⭐ |
26
-
27
- ## Key Research: "LoRA Without Regret" (Schulman et al., 2025)
28
-
29
- Four findings that change how we fine-tune:
30
-
31
- 1. **Target ALL linear layers** — not just attention. Increasing rank does NOT compensate for skipping layers.
32
- 2. **Use r=256 for SFT** — sufficient capacity for post-training scale datasets.
33
- 3. **Use 10x higher LR** (2e-4 vs 2e-5 for full FT) — 1/r scaling makes optimal LR rank-independent.
34
- 4. **Keep batch size < 32** — LoRA is less tolerant of large batches. Cannot be mitigated by increasing rank.
35
-
36
- ## Recommended Training Matrix
37
-
38
- ### SFT (Supervised Fine-Tuning)
39
-
40
- | Model | Dataset | Hardware | Time | Cost |
41
- |-------|---------|----------|------|------|
42
- | Llama-3.1-8B-Instruct | tulu-3-sft (940K) | A100 (80GB) | ~6h | ~$24 |
43
- | Llama-3.1-8B-Instruct | OpenThoughts-114k | A100 (80GB) | ~2h | ~$8 |
44
- | Llama-3.1-8B-Instruct | tulu-3-sft (940K) | A10G (24GB) + QLoRA | ~12h | ~$24 |
45
-
46
- ### GRPO (Reinforcement Learning)
47
-
48
- | Model | Dataset | LoRA r | Hardware |
49
- |-------|---------|--------|----------|
50
- | Qwen3-0.6B | OpenR1-Math-220k | 1 | A100 |
51
- | Llama-3.1-8B-Base | GSM8k | 1-32 | A100 |
52
-
53
- ## Source Attribution
54
-
55
- - LoRA Without Regret: Schulman et al., 2025, Thinking Machines Lab
56
- - tulu-3-sft-mixture: Allen AI, used by Tulu 3 (SOTA open instruction-tuned)
57
- - OpenThoughts-114k: Open community, reasoning-heavy CoT data
58
- - LoRA Land: Predibase 2024, 224/310 LoRA models surpassed GPT-4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ai-ml/hf-finetuning/run_finetune.py DELETED
@@ -1,67 +0,0 @@
1
- # =============================================================================
2
- # SFT Fine-Tuning — CLI Entry Point (LoRA Without Regret config)
3
- # =============================================================================
4
- # Usage:
5
- # # Default: tulu-3-sft + Llama-3.1-8B
6
- # python run_finetune.py
7
- #
8
- # # OpenThoughts reasoning dataset
9
- # python run_finetune.py --dataset_key openthoughts-114k
10
- #
11
- # # Ultrachat fallback
12
- # python run_finetune.py --dataset_key ultrachat-200k
13
- #
14
- # # Custom hub model ID
15
- # python run_finetune.py --hub_model_id my-org/my-model-v2
16
- # =============================================================================
17
-
18
- import argparse
19
- import sys
20
- from finetune import FinetuneConfig, finetune, DATASET_REGISTRY
21
-
22
-
23
- def main():
24
- parser = argparse.ArgumentParser(description="SFT Fine-Tuning (LoRA Without Regret)")
25
- parser.add_argument("--dataset_key", default="tulu-3-sft",
26
- choices=list(DATASET_REGISTRY.keys()),
27
- help="Dataset to train on")
28
- parser.add_argument("--hub_model_id", default=None,
29
- help="HuggingFace Hub model ID for push")
30
- parser.add_argument("--num_train_epochs", type=int, default=None)
31
- parser.add_argument("--learning_rate", type=float, default=None)
32
- parser.add_argument("--lora_r", type=int, default=None)
33
- parser.add_argument("--per_device_train_batch_size", type=int, default=None)
34
- parser.add_argument("--max_seq_length", type=int, default=None)
35
-
36
- args = parser.parse_args()
37
-
38
- config = FinetuneConfig()
39
- if args.dataset_key:
40
- config.dataset_key = args.dataset_key
41
- if args.hub_model_id:
42
- config.hub_model_id = args.hub_model_id
43
- if args.num_train_epochs:
44
- config.num_train_epochs = args.num_train_epochs
45
- if args.learning_rate:
46
- config.learning_rate = args.learning_rate
47
- if args.lora_r:
48
- config.lora_r = args.lora_r
49
- if args.per_device_train_batch_size:
50
- config.per_device_train_batch_size = args.per_device_train_batch_size
51
- if args.max_seq_length:
52
- config.max_seq_length = args.max_seq_length
53
-
54
- print(f"Config: model={config.model_name}")
55
- print(f" dataset={config.dataset_key}")
56
- print(f" lora_r={config.lora_r}, lora_alpha={config.lora_alpha}")
57
- print(f" target_modules={config.target_modules}")
58
- print(f" lr={config.learning_rate}, epochs={config.num_train_epochs}")
59
- print(f" effective_batch={config.per_device_train_batch_size * config.gradient_accumulation_steps}")
60
- print(f" packing={config.packing}, strategy={config.packing_strategy}")
61
- print(f" assistant_only_loss={config.assistant_only_loss}")
62
-
63
- finetune(config)
64
-
65
-
66
- if __name__ == "__main__":
67
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ci-cd/gitlab-ci/.gitlab-ci.yml DELETED
@@ -1,113 +0,0 @@
1
- # =============================================================================
2
- # GitLab CI — DevSecOps Pipeline
3
- # =============================================================================
4
-
5
- stages:
6
- - sast
7
- - build
8
- - scan
9
- - test
10
- - sign
11
- - deploy
12
-
13
- variables:
14
- REGISTRY: ecr.aws/devsecops
15
- TRIVY_SEVERITY: "CRITICAL,HIGH"
16
-
17
- # --- SAST Stage ---
18
- semgrep:
19
- stage: sast
20
- image: semgrep/semgrep:latest
21
- script:
22
- - semgrep --config auto --json --output semgrep.json .
23
- artifacts:
24
- paths:
25
- - semgrep.json
26
-
27
- secret-scan:
28
- stage: sast
29
- image: aquasec/trivy:latest
30
- script:
31
- - trivy fs --scanners secret --exit-code 1 .
32
-
33
- checkov:
34
- stage: sast
35
- image: bridgecrew/checkov:latest
36
- script:
37
- - checkov -d terraform/ --output cli
38
-
39
- # --- Build Stage ---
40
- build:
41
- stage: build
42
- image: docker:24
43
- services:
44
- - docker:24-dind
45
- before_script:
46
- - aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY
47
- script:
48
- - |
49
- docker build \
50
- --build-arg BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ) \
51
- -t $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA \
52
- -t $REGISTRY/$CI_PROJECT_NAME:latest .
53
- - docker push $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
54
-
55
- # --- Scan Stage ---
56
- trivy-scan:
57
- stage: scan
58
- image: aquasec/trivy:latest
59
- needs: [build]
60
- script:
61
- - trivy image --severity $TRIVY_SEVERITY --exit-code 1 --ignore-unfixed $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
62
-
63
- generate-sbom:
64
- stage: scan
65
- image: anchore/syft:latest
66
- needs: [build]
67
- script:
68
- - syft $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -o spdx-json > sbom.spdx.json
69
- artifacts:
70
- paths:
71
- - sbom.spdx.json
72
-
73
- # --- Test Stage ---
74
- integration-test:
75
- stage: test
76
- image: docker:24
77
- services:
78
- - docker:24-dind
79
- script:
80
- - docker compose -f docker-compose.test.yml up --abort-on-container-exit
81
-
82
- # --- Sign Stage ---
83
- sign:
84
- stage: sign
85
- image: bitnami/cosign:latest
86
- needs: [build, trivy-scan, generate-sbom]
87
- variables:
88
- COSIGN_EXPERIMENTAL: "1"
89
- script:
90
- - cosign sign --yes $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
91
- - cosign attest --yes --predicate sbom.spdx.json --type spdxjson $REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA
92
-
93
- # --- Deploy Stage ---
94
- deploy-staging:
95
- stage: deploy
96
- image: bitnami/kubectl:latest
97
- needs: [sign, integration-test]
98
- environment:
99
- name: staging
100
- script:
101
- - kubectl set image deployment/$CI_PROJECT_NAME $CI_PROJECT_NAME=$REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -n staging
102
- - kubectl rollout status deployment/$CI_PROJECT_NAME -n staging --timeout=300s
103
-
104
- deploy-prod:
105
- stage: deploy
106
- image: bitnami/kubectl:latest
107
- needs: [deploy-staging]
108
- environment:
109
- name: production
110
- when: manual
111
- script:
112
- - kubectl set image deployment/$CI_PROJECT_NAME $CI_PROJECT_NAME=$REGISTRY/$CI_PROJECT_NAME:$CI_COMMIT_SHORT_SHA -n production
113
- - kubectl rollout status deployment/$CI_PROJECT_NAME -n production --timeout=300s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ci-cd/jenkins/Jenkinsfile DELETED
@@ -1,136 +0,0 @@
1
- // =============================================================================
2
- // Jenkinsfile — Shared DevSecOps Pipeline
3
- // =============================================================================
4
-
5
- pipeline {
6
- agent { label 'docker' }
7
-
8
- environment {
9
- REGISTRY = 'ecr.aws/devsecops'
10
- IMAGE_NAME = "${env.JOB_NAME.split('/').last()}"
11
- IMAGE_TAG = "${env.GIT_COMMIT.take(12)}"
12
- TRIVY_SEVERITY = 'CRITICAL,HIGH'
13
- }
14
-
15
- stages {
16
- // ----- Stage 1: SAST -----
17
- stage('SAST') {
18
- parallel {
19
- stage('Semgrep') {
20
- steps {
21
- sh 'semgrep --config auto --json --output semgrep.json .'
22
- }
23
- }
24
- stage('Secret Scan') {
25
- steps {
26
- sh 'trivy fs --scanners secret --exit-code 1 .'
27
- }
28
- }
29
- stage('IaC Scan') {
30
- steps {
31
- sh 'checkov -d terraform/ --output cli --soft-fail false'
32
- }
33
- }
34
- }
35
- }
36
-
37
- // ----- Stage 2: Build -----
38
- stage('Build') {
39
- steps {
40
- script {
41
- docker.withRegistry("https://${REGISTRY}", 'ecr:us-east-1') {
42
- def app = docker.build(
43
- "${IMAGE_NAME}:${IMAGE_TAG}",
44
- '--build-arg BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ) .'
45
- )
46
- app.push()
47
- app.push('latest')
48
- }
49
- }
50
- }
51
- }
52
-
53
- // ----- Stage 3: Container Scan -----
54
- stage('Security Scan') {
55
- steps {
56
- sh """
57
- trivy image \
58
- --severity ${TRIVY_SEVERITY} \
59
- --exit-code 1 \
60
- --ignore-unfixed \
61
- ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
62
- """
63
- // Generate SBOM
64
- sh """
65
- syft ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
66
- -o cyclonedx-json > sbom.cyclonedx.json
67
- """
68
- }
69
- }
70
-
71
- // ----- Stage 4: Test -----
72
- stage('Integration Test') {
73
- steps {
74
- sh 'docker compose -f docker-compose.test.yml up --abort-on-container-exit'
75
- }
76
- }
77
-
78
- // ----- Stage 5: Sign -----
79
- stage('Sign & Attest') {
80
- steps {
81
- sh """
82
- cosign sign --yes \
83
- ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
84
- cosign attest --yes \
85
- --predicate sbom.cyclonedx.json \
86
- --type cyclonedx \
87
- ${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}
88
- """
89
- }
90
- }
91
-
92
- // ----- Stage 6: Deploy -----
93
- stage('Deploy Staging') {
94
- steps {
95
- sh """
96
- kubectl set image deployment/${IMAGE_NAME} \
97
- ${IMAGE_NAME}=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
98
- -n staging
99
- """
100
- // Wait for rollout
101
- sh 'kubectl rollout status deployment/${IMAGE_NAME} -n staging --timeout=300s'
102
- }
103
- }
104
-
105
- stage('Deploy Production') {
106
- when {
107
- branch 'main'
108
- }
109
- input {
110
- message "Deploy ${IMAGE_NAME}:${IMAGE_TAG} to production?"
111
- }
112
- steps {
113
- sh """
114
- kubectl set image deployment/${IMAGE_NAME} \
115
- ${IMAGE_NAME}=${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} \
116
- -n production
117
- """
118
- sh 'kubectl rollout status deployment/${IMAGE_NAME} -n production --timeout=300s'
119
- }
120
- }
121
- }
122
-
123
- post {
124
- always {
125
- archiveArtifacts artifacts: 'semgrep.json, sbom.cyclonedx.json', allowEmptyArchive: true
126
- recordIssues(tools: [semgrep(pattern: 'semgrep.json')])
127
- }
128
- failure {
129
- slackSend(
130
- channel: '#platform-alerts',
131
- color: 'danger',
132
- message: "FAILED: ${env.JOB_NAME} #${env.BUILD_NUMBER}"
133
- )
134
- }
135
- }
136
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
compliance/{cis-benchmarks/cis-eks-k8s.yaml → cis-eks-k8s.yaml} RENAMED
File without changes
compliance/{soc2/controls-mapping.yaml → controls-mapping.yaml} RENAMED
File without changes
compliance/{nist/nist-800-53-mapping.yaml → nist-800-53-mapping.yaml} RENAMED
File without changes
{docker/base-images → deployment}/Dockerfile.ml-inference RENAMED
File without changes
{k8s/workloads/ml-pipeline → deployment}/deployment.yaml RENAMED
File without changes
{ai-ml/mlflow → deployment}/mlflow-deployment.yaml RENAMED
File without changes
docker/base-images/Dockerfile.backend DELETED
@@ -1,51 +0,0 @@
1
- # =============================================================================
2
- # Multi-Stage Hardened Dockerfile — Python Backend
3
- # =============================================================================
4
- # Security Features:
5
- # - Multi-stage build (build → runtime)
6
- # - Non-root user
7
- # - Minimal base (distroless)
8
- # - Pinned versions
9
- # - No shell in runtime image
10
- # - Health check
11
- # =============================================================================
12
-
13
- # --- Build Stage ---
14
- FROM python:3.12-slim AS builder
15
-
16
- WORKDIR /build
17
-
18
- # Pin pip and install dependencies
19
- COPY requirements.txt .
20
- RUN pip install --no-cache-dir --require-hashes -r requirements.txt
21
-
22
- # Copy application
23
- COPY src/ /build/src/
24
- COPY pyproject.toml /build/
25
-
26
- # Build wheel
27
- RUN pip wheel --no-cache-dir --no-deps -w /build/wheels .
28
-
29
- # --- Runtime Stage ---
30
- FROM gcr.io/distroless/python3-debian12:nonroot AS runtime
31
-
32
- # Copy wheels from builder
33
- COPY --from=builder /build/wheels /app/wheels/
34
- COPY --from=builder /build/src/ /app/src/
35
-
36
- # Set environment
37
- ENV PYTHONUNBUFFERED=1 \
38
- PYTHONDONTWRITEBYTECODE=1 \
39
- PATH="/app/.local/bin:${PATH}"
40
-
41
- WORKDIR /app
42
-
43
- # Run as non-root (distroless nonroot image UID 65532)
44
- USER 65532:65532
45
-
46
- EXPOSE 8080
47
-
48
- HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
49
- CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/healthz')"]
50
-
51
- ENTRYPOINT ["python", "-m", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8080"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker/base-images/Dockerfile.frontend DELETED
@@ -1,33 +0,0 @@
1
- # =============================================================================
2
- # Multi-Stage Hardened Dockerfile — React Frontend
3
- # =============================================================================
4
-
5
- # --- Build Stage ---
6
- FROM node:20-alpine AS builder
7
-
8
- WORKDIR /app
9
-
10
- # Pin package versions with lockfile
11
- COPY package.json package-lock.json ./
12
- RUN npm ci --ignore-scripts
13
-
14
- COPY . .
15
- RUN npm run build
16
-
17
- # --- Runtime Stage ---
18
- FROM nginxinc/nginx-unprivileged:1.25-alpine AS runtime
19
-
20
- # Remove default nginx configs
21
- RUN rm -f /etc/nginx/conf.d/default.conf
22
-
23
- # Copy custom nginx config (security headers)
24
- COPY docker/nginx.conf /etc/nginx/conf.d/
25
- COPY --from=builder /app/dist /usr/share/nginx/html
26
-
27
- # Security headers are in nginx.conf
28
- EXPOSE 8080
29
-
30
- USER 101:101
31
-
32
- HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
33
- CMD ["curl", "-f", "http://localhost:8080/healthz"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
finops/cost-optimization.yaml DELETED
@@ -1,73 +0,0 @@
1
- # =============================================================================
2
- # FinOps Engine — Cloud Cost Governance
3
- # =============================================================================
4
- # Addresses: cost waste, rightsizing, scheduling, unit economics
5
- # =============================================================================
6
-
7
- # --- Spot Instance Strategy ---
8
- # Use SPOT for ML training workloads (70-90% cost savings)
9
- # Use ON_DEMAND for production services (no interruption risk)
10
-
11
- apiVersion: apps/v1
12
- kind: Deployment
13
- metadata:
14
- name: ml-training-spot
15
- namespace: ml-pipeline
16
- labels:
17
- app: ml-training-spot
18
- finops: spot-instance
19
- spec:
20
- replicas: 0 # Scale up on demand via KEDA
21
- selector:
22
- matchLabels:
23
- app: ml-training-spot
24
- template:
25
- metadata:
26
- labels:
27
- app: ml-training-spot
28
- finops: spot-instance
29
- spec:
30
- containers:
31
- - name: trainer
32
- image: "ecr.aws/devsecops/ml-train:v1.0.0"
33
- resources:
34
- requests:
35
- cpu: "4"
36
- memory: 16Gi
37
- nvidia.com/gpu: "1"
38
- limits:
39
- cpu: "8"
40
- memory: 32Gi
41
- nvidia.com/gpu: "1"
42
- tolerations:
43
- - key: nvidia.com/gpu
44
- operator: Exists
45
- effect: NoSchedule
46
- nodeSelector:
47
- workload: ml-spot
48
- # Allow eviction for spot reclamation
49
- terminationGracePeriodSeconds: 120
50
- ---
51
- # --- KEDA Scaler — Scale ML training on queue depth ---
52
- apiVersion: keda.sh/v1alpha1
53
- kind: ScaledJob
54
- metadata:
55
- name: ml-training-scaler
56
- namespace: ml-pipeline
57
- spec:
58
- minReplicaCount: 0
59
- maxReplicaCount: 4
60
- pollingInterval: 30
61
- triggers:
62
- - type: aws-sqs
63
- metadata:
64
- queueURL: https://sqs.us-east-1.amazonaws.com/123456789012/ml-training-queue
65
- queueLength: "1"
66
- jobTemplate:
67
- spec:
68
- template:
69
- spec:
70
- restartPolicy: Never
71
- containers:
72
- - name: trainer
73
- image: "ecr.aws/devsecops/ml-train:v1.0.0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
finops/finops-cronjob.yaml DELETED
@@ -1,23 +0,0 @@
1
- # FinOps Daily Cost Scanner
2
- apiVersion: batch/v1
3
- kind: CronJob
4
- metadata:
5
- name: finops-scanner
6
- namespace: platform-system
7
- spec:
8
- schedule: "0 6 * * 1-5" # 6am weekdays
9
- jobTemplate:
10
- spec:
11
- template:
12
- spec:
13
- serviceAccountName: finops-scanner
14
- containers:
15
- - name: scanner
16
- image: "ecr.aws/devsecops/finops-scanner:latest"
17
- command: ["python3", "finops_scanner.py"]
18
- env:
19
- - name: AWS_REGION
20
- value: "us-east-1"
21
- restartPolicy: Never
22
- concurrencyPolicy: Forbid
23
- successfulJobsHistoryLimit: 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
finops/finops-policy.yaml DELETED
@@ -1,73 +0,0 @@
1
- # =============================================================================
2
- # FinOps Policy — Cloud Waste Detection & Rightsizing Rules
3
- # =============================================================================
4
-
5
- policies:
6
- # --- Unused Resources ---
7
- - id: FINOPS-001
8
- name: "Detect unused EBS volumes"
9
- severity: medium
10
- check: "aws ec2 describe-volumes --filters Name=status,Values=available"
11
- action: "Create snapshot, delete volume after 7 days"
12
- estimated_savings: "$50-200/month per volume"
13
-
14
- - id: FINOPS-002
15
- name: "Detect idle RDS instances"
16
- severity: medium
17
- check: "Connection count < 5 for 7 days"
18
- action: "Downsize instance class or stop non-prod"
19
- estimated_savings: "30-50% of RDS cost"
20
-
21
- - id: FINOPS-003
22
- name: "Detect unattached EIPs"
23
- severity: low
24
- check: "aws ec2 describe-addresses --filter Name=association-id,Values=''"
25
- action: "Release EIP"
26
- estimated_savings: "$3.60/month per EIP"
27
-
28
- # --- Rightsizing ---
29
- - id: FINOPS-010
30
- name: "EC2 rightsizing recommendations"
31
- severity: medium
32
- check: "CPU < 15% for 14 days OR Memory < 25% for 14 days"
33
- action: "Downsize to next tier (e.g., m6i.xlarge -> m6i.large)"
34
- estimated_savings: "20-40% per instance"
35
-
36
- - id: FINOPS-011
37
- name: "Over-provisioned K8s workloads"
38
- severity: medium
39
- check: "Container CPU request > 2x actual P95 usage"
40
- action: "Reduce requests to P95 + 20% headroom"
41
- estimated_savings: "30-50% of cluster cost"
42
-
43
- # --- Scheduling ---
44
- - id: FINOPS-020
45
- name: "Non-prod environment scheduling"
46
- severity: high
47
- check: "Dev/staging workloads running 24/7"
48
- action: "Scale to 0 outside business hours (Mon-Fri 8am-8pm)"
49
- estimated_savings: "65% of non-prod compute"
50
-
51
- # --- Reserved Instances Coverage ---
52
- - id: FINOPS-030
53
- name: "RI coverage below 70%"
54
- severity: high
55
- check: "RI coverage < 70% for consistent workloads"
56
- action: "Purchase RIs for EKS node groups + RDS"
57
- estimated_savings: "30-40% vs on-demand"
58
-
59
- # --- Storage Tiering ---
60
- - id: FINOPS-040
61
- name: "S3 intelligent tiering"
62
- severity: medium
63
- check: "S3 objects > 90 days in STANDARD"
64
- action: "Enable S3 Intelligent-Tiering on all buckets"
65
- estimated_savings: "40-60% on infrequent access data"
66
-
67
- # --- GPU Utilization ---
68
- - id: FINOPS-050
69
- name: "Underutilized GPU nodes"
70
- severity: critical
71
- check: "GPU utilization < 30% for 4 hours"
72
- action: "Scale down GPU node group or use KEDA for demand-based scaling"
73
- estimated_savings: "$2-6/hour per GPU"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
incident-response/auto-remediation/auto-remediate.sh DELETED
@@ -1,50 +0,0 @@
1
- #!/usr/bin/env bash
2
- # =============================================================================
3
- # Autonomous Incident Remediation Engine
4
- # =============================================================================
5
- # Triggered by Alertmanager webhook. Auto-remediates known patterns.
6
- # =============================================================================
7
-
8
- set -euo pipefail
9
-
10
- ALERT_NAME="${1:-unknown}"
11
- NAMESPACE="${2:-default}"
12
- POD_NAME="${3:-}"
13
-
14
- log() { echo "[$(date +%H:%M:%S)] [REMEDIATE] $*"; }
15
-
16
- case "${ALERT_NAME}" in
17
- PodCrashLooping)
18
- log "Remediating crash-looping pod: ${NAMESPACE}/${POD_NAME}"
19
- # Check if OOM killed
20
- OOM_COUNT=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o json | jq -r '.status.containerStatuses[0].lastState.terminated.reason // empty' | grep -c OOMKilled || true)
21
- if [[ "${OOM_COUNT}" -gt 0 ]]; then
22
- log "OOM detected - increasing memory limit"
23
- kubectl patch deployment "${POD_NAME%-*}" -n "${NAMESPACE}" -p '{"spec":{"template":{"spec":{"containers":[{"name":"app","resources":{"limits":{"memory":"2Gi"}}}]}}}}'
24
- log "Memory limit increased to 2Gi"
25
- else
26
- log "Non-OOM crash - restarting pod"
27
- kubectl delete pod "${POD_NAME}" -n "${NAMESPACE}" --grace-period=30
28
- fi
29
- ;;
30
-
31
- HighMemoryUsage)
32
- log "Node memory pressure detected"
33
- # Evict lowest-priority pods
34
- kubectl get pods -A --sort-by=.spec.priority --field-selector=status.phase=Running | tail -5 | while read ns pod rest; do
35
- log "Considering eviction: ${ns}/${pod}"
36
- done
37
- ;;
38
-
39
- FalcoRuntimeAlert)
40
- log "Runtime security alert - do NOT auto-remediate"
41
- log "Escalate to security team: #security-alerts"
42
- # Only notify - never auto-remediate security alerts
43
- ;;
44
-
45
- *)
46
- log "Unknown alert pattern: ${ALERT_NAME}"
47
- log "Manual investigation required"
48
- exit 1
49
- ;;
50
- esac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
platform/adr/template.md → infrastructure/adr-template.md RENAMED
File without changes
{ci-cd/github-actions → infrastructure/ci-cd}/devsecops-pipeline.yml RENAMED
File without changes
{finops → infrastructure}/finops_scanner.py RENAMED
File without changes
{scripts/bash → infrastructure}/incident-response.sh RENAMED
File without changes
{monitoring → infrastructure/monitoring}/alertmanager/alertmanager-config.yaml RENAMED
File without changes
monitoring/grafana/dashboards/platform-overview.json → infrastructure/monitoring/grafana-platform-overview.json RENAMED
File without changes
{monitoring → infrastructure/monitoring}/otel/otel-collector.yaml RENAMED
File without changes
{monitoring → infrastructure/monitoring}/prometheus/alerts.yaml RENAMED
File without changes
incident-response/postmortem/template.md → infrastructure/postmortem-template.md RENAMED
File without changes
{terraform → infrastructure/terraform}/environments/prod/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/outputs.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/eks/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/guardduty/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/guardduty/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/iam/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/kms/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/macie/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/rds/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/rds/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/s3/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/s3/variables.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/main.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/outputs.tf RENAMED
File without changes
{terraform → infrastructure/terraform}/modules/vpc/variables.tf RENAMED
File without changes
k8s/base/limit-ranges/limit-ranges.yaml DELETED
@@ -1,74 +0,0 @@
1
- # =============================================================================
2
- # Limit Ranges — Default Resource Requests/Limits Per Container
3
- # =============================================================================
4
-
5
- apiVersion: v1
6
- kind: LimitRange
7
- metadata:
8
- name: default-limits
9
- namespace: frontend
10
- spec:
11
- limits:
12
- - type: Container
13
- default:
14
- cpu: 500m
15
- memory: 256Mi
16
- defaultRequest:
17
- cpu: 100m
18
- memory: 128Mi
19
- max:
20
- cpu: "2"
21
- memory: 2Gi
22
- min:
23
- cpu: 50m
24
- memory: 64Mi
25
- maxLimitRequestRatio:
26
- cpu: "4"
27
- memory: "4"
28
- ---
29
- apiVersion: v1
30
- kind: LimitRange
31
- metadata:
32
- name: default-limits
33
- namespace: backend
34
- spec:
35
- limits:
36
- - type: Container
37
- default:
38
- cpu: "1"
39
- memory: 512Mi
40
- defaultRequest:
41
- cpu: 200m
42
- memory: 256Mi
43
- max:
44
- cpu: "4"
45
- memory: 4Gi
46
- min:
47
- cpu: 100m
48
- memory: 128Mi
49
- maxLimitRequestRatio:
50
- cpu: "4"
51
- memory: "4"
52
- ---
53
- apiVersion: v1
54
- kind: LimitRange
55
- metadata:
56
- name: default-limits
57
- namespace: ml-pipeline
58
- spec:
59
- limits:
60
- - type: Container
61
- default:
62
- cpu: "2"
63
- memory: 4Gi
64
- nvidia.com/gpu: "1"
65
- defaultRequest:
66
- cpu: 500m
67
- memory: 1Gi
68
- max:
69
- cpu: "8"
70
- memory: 16Gi
71
- nvidia.com/gpu: "2"
72
- min:
73
- cpu: 200m
74
- memory: 512Mi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/namespaces/namespaces.yaml DELETED
@@ -1,69 +0,0 @@
1
- # =============================================================================
2
- # Namespace Definitions — Security-First Multi-Tenant Layout
3
- # =============================================================================
4
- # Each namespace gets:
5
- # - Labels for network policy targeting
6
- # - Resource quotas
7
- # - Limit ranges
8
- # - Pod security standards via labels (Kyverno enforces)
9
-
10
- apiVersion: v1
11
- kind: Namespace
12
- metadata:
13
- name: platform-system
14
- labels:
15
- pod-security.kubernetes.io/enforce: "privileged"
16
- pod-security.kubernetes.io/audit: "privileged"
17
- pod-security.kubernetes.io/warn: "privileged"
18
- platform: "true"
19
- ---
20
- apiVersion: v1
21
- kind: Namespace
22
- metadata:
23
- name: monitoring
24
- labels:
25
- pod-security.kubernetes.io/enforce: "restricted"
26
- pod-security.kubernetes.io/audit: "restricted"
27
- pod-security.kubernetes.io/warn: "restricted"
28
- platform: "true"
29
- ---
30
- apiVersion: v1
31
- kind: Namespace
32
- metadata:
33
- name: security
34
- labels:
35
- pod-security.kubernetes.io/enforce: "restricted"
36
- pod-security.kubernetes.io/audit: "restricted"
37
- pod-security.kubernetes.io/warn: "restricted"
38
- platform: "true"
39
- ---
40
- apiVersion: v1
41
- kind: Namespace
42
- metadata:
43
- name: frontend
44
- labels:
45
- pod-security.kubernetes.io/enforce: "restricted"
46
- pod-security.kubernetes.io/audit: "restricted"
47
- pod-security.kubernetes.io/warn: "restricted"
48
- app-team: "frontend"
49
- ---
50
- apiVersion: v1
51
- kind: Namespace
52
- metadata:
53
- name: backend
54
- labels:
55
- pod-security.kubernetes.io/enforce: "restricted"
56
- pod-security.kubernetes.io/audit: "restricted"
57
- pod-security.kubernetes.io/warn: "restricted"
58
- app-team: "backend"
59
- ---
60
- apiVersion: v1
61
- kind: Namespace
62
- metadata:
63
- name: ml-pipeline
64
- labels:
65
- pod-security.kubernetes.io/enforce: "baseline"
66
- pod-security.kubernetes.io/audit: "restricted"
67
- pod-security.kubernetes.io/warn: "restricted"
68
- app-team: "ml"
69
- nvidia.com/gpu: "true"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/network-policies/network-policies.yaml DELETED
@@ -1,124 +0,0 @@
1
- # =============================================================================
2
- # Network Policies — Zero Trust Default Deny + Selective Allow
3
- # =============================================================================
4
- # Strategy: Default deny all ingress/egress, then allow only known traffic
5
-
6
- # --- Default Deny All Ingress in Every Namespace ---
7
- apiVersion: networking.k8s.io/v1
8
- kind: NetworkPolicy
9
- metadata:
10
- name: default-deny-ingress
11
- namespace: frontend
12
- spec:
13
- podSelector: {} # Matches all pods
14
- policyTypes:
15
- - Ingress
16
- ---
17
- apiVersion: networking.k8s.io/v1
18
- kind: NetworkPolicy
19
- metadata:
20
- name: default-deny-ingress
21
- namespace: backend
22
- spec:
23
- podSelector: {}
24
- policyTypes:
25
- - Ingress
26
- ---
27
- apiVersion: networking.k8s.io/v1
28
- kind: NetworkPolicy
29
- metadata:
30
- name: default-deny-ingress
31
- namespace: ml-pipeline
32
- spec:
33
- podSelector: {}
34
- policyTypes:
35
- - Ingress
36
- ---
37
- # --- Frontend: Allow ingress from Istio ingress gateway only ---
38
- apiVersion: networking.k8s.io/v1
39
- kind: NetworkPolicy
40
- metadata:
41
- name: allow-istio-ingress
42
- namespace: frontend
43
- spec:
44
- podSelector:
45
- matchLabels:
46
- app: frontend
47
- policyTypes:
48
- - Ingress
49
- ingress:
50
- - from:
51
- - namespaceSelector:
52
- matchLabels:
53
- name: istio-system
54
- - podSelector:
55
- matchLabels:
56
- istio: ingressgateway
57
- ports:
58
- - port: 8080
59
- protocol: TCP
60
- ---
61
- # --- Backend: Allow ingress from frontend namespace only ---
62
- apiVersion: networking.k8s.io/v1
63
- kind: NetworkPolicy
64
- metadata:
65
- name: allow-from-frontend
66
- namespace: backend
67
- spec:
68
- podSelector:
69
- matchLabels:
70
- app: backend
71
- policyTypes:
72
- - Ingress
73
- - Egress
74
- ingress:
75
- - from:
76
- - namespaceSelector:
77
- matchLabels:
78
- app-team: frontend
79
- ports:
80
- - port: 8080
81
- protocol: TCP
82
- egress:
83
- # Allow DNS
84
- - to: []
85
- ports:
86
- - port: 53
87
- protocol: UDP
88
- - port: 53
89
- protocol: TCP
90
- # Allow RDS
91
- - to: []
92
- ports:
93
- - port: 5432
94
- protocol: TCP
95
- ---
96
- # --- ML Pipeline: Allow from backend + Istio ---
97
- apiVersion: networking.k8s.io/v1
98
- kind: NetworkPolicy
99
- metadata:
100
- name: allow-ml-traffic
101
- namespace: ml-pipeline
102
- spec:
103
- podSelector: {}
104
- policyTypes:
105
- - Ingress
106
- - Egress
107
- ingress:
108
- - from:
109
- - namespaceSelector:
110
- matchLabels:
111
- app-team: backend
112
- - from:
113
- - namespaceSelector:
114
- matchLabels:
115
- name: istio-system
116
- egress:
117
- - to: []
118
- ports:
119
- - port: 53
120
- protocol: UDP
121
- - to: []
122
- ports:
123
- - port: 443
124
- protocol: TCP # HuggingFace Hub, S3, etc.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/pdbs/pdbs.yaml DELETED
@@ -1,62 +0,0 @@
1
- # =============================================================================
2
- # Pod Disruption Budgets — Availability Guarantees Per Workload
3
- # =============================================================================
4
- # PDBs prevent voluntary evictions (upgrades, drain) from taking down too many pods.
5
- # Without PDBs: kubectl drain or cluster-autoscaler can cause unplanned outages.
6
- # =============================================================================
7
-
8
- apiVersion: policy/v1
9
- kind: PodDisruptionBudget
10
- metadata:
11
- name: frontend-pdb
12
- namespace: frontend
13
- spec:
14
- minAvailable: 2 # At least 2 pods always running (3 replicas total)
15
- selector:
16
- matchLabels:
17
- app: frontend
18
- ---
19
- apiVersion: policy/v1
20
- kind: PodDisruptionBudget
21
- metadata:
22
- name: backend-pdb
23
- namespace: backend
24
- spec:
25
- minAvailable: 2
26
- selector:
27
- matchLabels:
28
- app: backend
29
- ---
30
- apiVersion: policy/v1
31
- kind: PodDisruptionBudget
32
- metadata:
33
- name: ml-inference-pdb
34
- namespace: ml-pipeline
35
- spec:
36
- maxUnavailable: 1 # At most 1 pod disrupted at a time
37
- selector:
38
- matchLabels:
39
- app: ml-inference
40
- ---
41
- # Platform services — always keep 1 available
42
- apiVersion: policy/v1
43
- kind: PodDisruptionBudget
44
- metadata:
45
- name: prometheus-pdb
46
- namespace: monitoring
47
- spec:
48
- minAvailable: 1
49
- selector:
50
- matchLabels:
51
- app: kube-prometheus-stack-prometheus
52
- ---
53
- apiVersion: policy/v1
54
- kind: PodDisruptionBudget
55
- metadata:
56
- name: argocd-pdb
57
- namespace: platform-system
58
- spec:
59
- minAvailable: 1
60
- selector:
61
- matchLabels:
62
- app.kubernetes.io/name: argocd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/rbac/rbac.yaml DELETED
@@ -1,78 +0,0 @@
1
- # =============================================================================
2
- # RBAC — Least-Privilege Access Control
3
- # =============================================================================
4
-
5
- # Platform Admins — Full cluster access
6
- apiVersion: rbac.authorization.k8s.io/v1
7
- kind: ClusterRole
8
- metadata:
9
- name: platform-admin
10
- rules:
11
- - apiGroups: ["*"]
12
- resources: ["*"]
13
- verbs: ["*"]
14
- # Exclude secrets CRUD for audit trail — use ExternalSecrets instead
15
- - apiGroups: [""]
16
- resources: ["secrets"]
17
- verbs: ["get", "list", "watch"] # No create/update/delete
18
- ---
19
- apiVersion: rbac.authorization.k8s.io/v1
20
- kind: ClusterRoleBinding
21
- metadata:
22
- name: platform-admin
23
- roleRef:
24
- apiGroup: rbac.authorization.k8s.io
25
- kind: ClusterRole
26
- name: platform-admin
27
- subjects:
28
- - kind: Group
29
- name: platform-admins
30
- apiGroup: rbac.authorization.k8s.io
31
- ---
32
- # Developer — Read + Pod Exec + Logs within their namespaces
33
- apiVersion: rbac.authorization.k8s.io/v1
34
- kind: ClusterRole
35
- metadata:
36
- name: developer
37
- rules:
38
- - apiGroups: ["", "apps", "batch", "extensions"]
39
- resources: ["pods", "pods/log", "pods/exec", "deployments", "statefulsets", "jobs", "cronjobs"]
40
- verbs: ["get", "list", "watch"]
41
- - apiGroups: [""]
42
- resources: ["pods/exec"]
43
- verbs: ["create"]
44
- - apiGroups: ["", "apps"]
45
- resources: ["deployments", "statefulsets"]
46
- verbs: ["patch"] # For restart rollout only
47
- - apiGroups: ["metrics.k8s.io"]
48
- resources: ["pods", "nodes"]
49
- verbs: ["get", "list"]
50
- ---
51
- # Viewer — Read-only cluster-wide
52
- apiVersion: rbac.authorization.k8s.io/v1
53
- kind: ClusterRole
54
- metadata:
55
- name: viewer
56
- rules:
57
- - apiGroups: ["", "apps", "batch", "extensions", "networking.k8s.io"]
58
- resources: ["*"]
59
- verbs: ["get", "list", "watch"]
60
- - nonResourceURLs: ["*"]
61
- verbs: ["get"]
62
- ---
63
- # ML Engineer — Access to ml-pipeline namespace only
64
- apiVersion: rbac.authorization.k8s.io/v1
65
- kind: Role
66
- metadata:
67
- name: ml-engineer
68
- namespace: ml-pipeline
69
- rules:
70
- - apiGroups: ["", "apps", "batch", "kubeflow.org", "serving.kubeflow.org"]
71
- resources: ["pods", "pods/log", "pods/exec", "deployments", "jobs", "notebooks", "inferenceservices"]
72
- verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
73
- - apiGroups: [""]
74
- resources: ["secrets"]
75
- verbs: ["get", "list"] # No create/update
76
- - apiGroups: [""]
77
- resources: ["configmaps"]
78
- verbs: ["get", "list", "create", "update"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/resource-quotas/resource-quotas.yaml DELETED
@@ -1,50 +0,0 @@
1
- # =============================================================================
2
- # Resource Quotas — Prevent Resource Exhaustion Per Namespace
3
- # =============================================================================
4
-
5
- apiVersion: v1
6
- kind: ResourceQuota
7
- metadata:
8
- name: frontend-quota
9
- namespace: frontend
10
- spec:
11
- hard:
12
- requests.cpu: "4"
13
- requests.memory: 8Gi
14
- limits.cpu: "8"
15
- limits.memory: 16Gi
16
- pods: "20"
17
- services: "5"
18
- persistentvolumeclaims: "10"
19
- requests.nvidia.com/gpu: "0" # No GPUs for frontend
20
- ---
21
- apiVersion: v1
22
- kind: ResourceQuota
23
- metadata:
24
- name: backend-quota
25
- namespace: backend
26
- spec:
27
- hard:
28
- requests.cpu: "8"
29
- requests.memory: 16Gi
30
- limits.cpu: "16"
31
- limits.memory: 32Gi
32
- pods: "30"
33
- services: "10"
34
- persistentvolumeclaims: "20"
35
- ---
36
- apiVersion: v1
37
- kind: ResourceQuota
38
- metadata:
39
- name: ml-quota
40
- namespace: ml-pipeline
41
- spec:
42
- hard:
43
- requests.cpu: "16"
44
- requests.memory: 64Gi
45
- limits.cpu: "32"
46
- limits.memory: 128Gi
47
- pods: "15"
48
- services: "5"
49
- persistentvolumeclaims: "30"
50
- requests.nvidia.com/gpu: "4"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/base/slos/slos.yaml DELETED
@@ -1,68 +0,0 @@
1
- # =============================================================================
2
- # Service Level Objectives — Platform SLOs
3
- # =============================================================================
4
- # SLOs define reliability targets. Error budgets = 100% - SLO.
5
- # Burn rate alerts fire when error budget is consumed too fast.
6
- # =============================================================================
7
-
8
- # --- API Availability SLO: 99.95% (21.9 min/month error budget) ---
9
- apiVersion: monitoring.coreos.com/v1
10
- kind: PrometheusRule
11
- metadata:
12
- name: slo-api-availability
13
- namespace: monitoring
14
- labels:
15
- release: kube-prometheus-stack
16
- slo: "true"
17
- spec:
18
- groups:
19
- - name: slo.api.availability
20
- rules:
21
- # SLO metric: 5m success rate
22
- - record: slo:api_availability:rate5m
23
- expr: |
24
- sum(rate(http_requests_total{code!~"5.."}[5m]))
25
- /
26
- sum(rate(http_requests_total[5m]))
27
-
28
- # 1h error budget burn rate (14.4x = consume 30d budget in 2d)
29
- - alert: SLOAPIAvailabilityBurnRateHigh
30
- expr: |
31
- (
32
- (1 - slo:api_availability:rate5m) > (14.4 * 0.001)
33
- )
34
- for: 5m
35
- labels:
36
- severity: critical
37
- slo: api-availability
38
- annotations:
39
- summary: "API availability SLO budget burning too fast"
40
- runbook: "https://runbook.platform.internal/slo-api-burn"
41
-
42
- - name: slo.api.latency
43
- rules:
44
- # Latency SLO: P99 < 2s, 99.9% of requests
45
- - record: slo:api_latency_p99:rate5m
46
- expr: |
47
- histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
48
-
49
- - alert: SLOAPILatencyBurnRateHigh
50
- expr: |
51
- slo:api_latency_p99:rate5m > 2
52
- for: 10m
53
- labels:
54
- severity: warning
55
- slo: api-latency
56
- annotations:
57
- summary: "API P99 latency exceeding 2s SLO"
58
-
59
- - name: slo.error_budget
60
- rules:
61
- # Remaining error budget (percentage)
62
- - record: slo:error_budget_remaining:ratio
63
- expr: |
64
- 1 - (
65
- (1 - slo:api_availability:rate5m)
66
- /
67
- 0.0005
68
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
k8s/kustomize/base/kustomization.yaml DELETED
@@ -1,18 +0,0 @@
1
- apiVersion: kustomize.config.k8s.io/v1beta1
2
- kind: Kustomization
3
- resources:
4
- - ../../base/namespaces/
5
- - ../../base/rbac/
6
- - ../../base/network-policies/
7
- - ../../base/resource-quotas/
8
- - ../../base/limit-ranges/
9
- - ../../base/pdbs/
10
- - ../../base/slos/
11
- - ../../manifests/cert-manager/
12
- - ../../manifests/external-secrets/
13
- - ../../manifests/istio/
14
- - ../../manifests/argo-cd/
15
- - ../../manifests/trivy-operator/
16
- - ../../manifests/falco/
17
- - ../../manifests/kyverno/
18
- - ../../manifests/prometheus-stack/