# devsecops-platform/deployment/deployment.yaml
# refactor: merged structure - model at center, DevSecOps wrapped around it
# commit: 9d4d5c7 (verified) — author: shaikhsalman
# =============================================================================
# ML Pipeline — Training Job + Inference Service
# =============================================================================
---
# Inference Deployment: serves the model over HTTP on port 8000.
# Runs as a non-root user, pulls the model from a PVC, and keeps the
# Hugging Face cache in a RAM-backed emptyDir. Scheduled onto GPU nodes
# via nodeSelector + toleration.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  namespace: ml-pipeline
  labels:
    app: ml-inference
    version: v1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
        version: v1
      annotations:
        # Opt this pod into Istio sidecar injection (mTLS / traffic policy).
        sidecar.istio.io/inject: "true"
    spec:
      serviceAccountName: ml-inference
      # Pod-level hardening: refuse to run as root; fsGroup lets the
      # non-root user write to the mounted PVC.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      containers:
        - name: inference
          image: "ecr.aws/devsecops/ml-inference:v1.0.0"
          ports:
            - containerPort: 8000
              protocol: TCP
          env:
            - name: MODEL_PATH
              value: "/models/latest"
            - name: HF_HOME
              value: "/cache/huggingface"
          # requests == limits for GPU (GPUs are not overcommittable);
          # CPU/memory leave headroom between request and limit.
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: 8Gi
              nvidia.com/gpu: "1"
          # Liveness restarts a wedged server; readiness gates traffic
          # until the model is loaded (/ready).
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
          volumeMounts:
            - name: model-storage
              mountPath: /models
            - name: huggingface-cache
              mountPath: /cache/huggingface
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
        # RAM-backed cache, capped at 1Gi (counts against pod memory).
        - name: huggingface-cache
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
      # Allow scheduling onto tainted GPU nodes, and pin to the ML pool.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      nodeSelector:
        workload: ml
---
# PVC holding the served model artifacts; mounted read-write by the
# inference pod at /models. gp3-encrypted storage class (EBS, encrypted).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-pvc
  namespace: ml-pipeline
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: gp3-encrypted
  resources:
    requests:
      storage: 50Gi
---
# ClusterIP Service fronting the inference pods on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
  namespace: ml-pipeline
spec:
  selector:
    app: ml-inference
  ports:
    - port: 8000
      targetPort: 8000
---
# Dedicated identity for the inference pods (referenced by the
# Deployment's serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ml-inference
  namespace: ml-pipeline
---
# ML Training Job Template
# Training Job template: metadata.name is Go-templated ({{ .JobID }}),
# so this manifest is rendered before being applied. Reads training data
# from one PVC, writes the trained model to another, and reports to the
# tracking service via TRACKIO_URL.
apiVersion: batch/v1
kind: Job
metadata:
  # Quoted: templated scalars must be quoted so the rendered value can
  # never change how YAML parses the line.
  name: "ml-train-{{ .JobID }}"
  namespace: ml-pipeline
spec:
  backoffLimit: 2
  ttlSecondsAfterFinished: 86400  # Clean up after 24h
  template:
    spec:
      serviceAccountName: ml-train
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: trainer
          image: "ecr.aws/devsecops/ml-train:v1.0.0"
          command: ["python", "train.py"]
          env:
            # HF token comes from a Secret, never inlined in the manifest.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-credentials
                  key: token
            - name: TRACKIO_URL
              value: "https://trackio.platform.internal"
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              nvidia.com/gpu: "1"
            limits:
              cpu: "8"
              memory: 32Gi
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: training-data
              mountPath: /data
            - name: model-output
              mountPath: /output
      volumes:
        - name: training-data
          persistentVolumeClaim:
            claimName: training-data-pvc
        - name: model-output
          persistentVolumeClaim:
            claimName: model-output-pvc
      # Failed pods are not restarted in place; the Job controller
      # retries up to backoffLimit with fresh pods.
      restartPolicy: Never
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule