test / helm /h2ogpt-chart /templates /deployment.yaml
iblfe's picture
Upload folder using huggingface_hub
b585c7f verified
raw
history blame
22.2 kB
{{- /*
  Render-time sanity checks on the selected backends.
  Rendering is aborted with `fail` when:
    1. both the TGI and the vLLM backends are enabled, or
    2. the h2oGPT stack is enabled without vLLM and h2oGPT both being enabled.
*/}}
{{- $tgiOn := .Values.tgi.enabled }}
{{- $vllmOn := .Values.vllm.enabled }}
{{- if and $tgiOn $vllmOn }}
  {{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- if and .Values.h2ogpt.stack.enabled (or (not $vllmOn) (not .Values.h2ogpt.enabled)) }}
  {{- fail "If h2oGPT stack is enabled, both vLLM and h2oGPT should be enabled" }}
{{- end }}
---
{{- if .Values.h2ogpt.enabled }}
{{- /*
  Deployment of the h2oGPT application.

  Two layouts are rendered from this template:
    - stack mode (.Values.h2ogpt.stack.enabled): a vLLM container is added to
      this pod and h2oGPT talks to it on localhost:5000;
    - otherwise: an init container blocks until the separate TGI / vLLM
      inference Service answers, and h2oGPT is pointed at that Service.
*/}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}
  {{- if .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml .Values.h2ogpt.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}
    spec:
      {{- with .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      {{- /*
        Soft preference: keep pods carrying this Deployment's app label in
        different zones. Uses the stable zone label; the former
        failure-domain.beta.kubernetes.io/zone label is deprecated.
      */}}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: topology.kubernetes.io/zone
      {{- /*
        At most one of the two initContainers sections below is rendered:
        enabling TGI and vLLM together is rejected by the chart guards.
        The scripts use a folded scalar, so every statement ends with ';'.
      */}}
      {{- if and (.Values.tgi.enabled) (not .Values.h2ogpt.stack.enabled ) }}
      initContainers:
        - name: tgi-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.initImage.repository | default "busybox" }}:{{ .Values.h2ogpt.initImage.tag | default "1.36" }}"
          imagePullPolicy: {{ .Values.h2ogpt.initImage.pullPolicy | default "IfNotPresent"}}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1;
              do
              echo "Waiting for inference service to become ready...";
              sleep 5;
              done
      {{- end }}
      {{- if and (.Values.vllm.enabled) (not .Values.h2ogpt.stack.enabled ) }}
      initContainers:
        - name: vllm-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.initImage.repository | default "busybox" }}:{{ .Values.h2ogpt.initImage.tag | default "1.36" }}"
          imagePullPolicy: {{ .Values.h2ogpt.initImage.pullPolicy | default "IfNotPresent"}}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1;
              do
              echo "Waiting for inference service to become ready...";
              sleep 5;
              done
      {{- end }}
      {{- with .Values.h2ogpt.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        {{- if .Values.h2ogpt.stack.enabled }}
        {{- /* Stack mode only: vLLM OpenAI-compatible server inside this pod. */}}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- /* `quote` escapes embedded quotes/backslashes in user-supplied args. */}}
            {{- range $arg := .Values.vllm.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- /* Probe timing fields from values are merged below the fixed httpGet. */}}
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- range $key, $value := .Values.vllm.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
        {{- end }}
        - name: {{ include "h2ogpt.fullname" . }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          command: ["/bin/bash", "-c"]
          args:
            {{- if .Values.h2ogpt.stack.enabled }}
            {{- /*
              Literal block scalar (|): the script relies on newlines as
              statement separators, which a folded scalar would not reliably
              preserve. The curl call is kept on one line for the same reason.
            */}}
            - |
              while [[ "$(curl --insecure -s -o /dev/null -w '%{http_code}' http://localhost:5000/v1/models)" != "200" ]]; do
                echo "Waiting for inference service to become ready... (2sec)"
                sleep 2
              done
              python3 /workspace/generate.py
            {{- else }}
            - >
              python3 /workspace/generate.py
            {{- end }}
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
          {{- if .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
          env:
            {{- /*
              h2ogpt_inference_server is set from the in-cluster backend
              unless an external LLM is configured.
            */}}
            {{- if and .Values.tgi.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled ) }}
            - name: h2ogpt_inference_server
              value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
            {{- end }}
            {{- if and .Values.vllm.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled ) }}
            - name: h2ogpt_inference_server
              value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
            {{- end }}
            {{- if and .Values.h2ogpt.stack.enabled (not .Values.h2ogpt.externalLLM.enabled) }}
            - name: h2ogpt_inference_server
              value: "vllm:localhost:5000"
            {{- end }}
            {{- range $key, $value := .Values.h2ogpt.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
            {{- /* External LLM credentials are read from the configured Secret. */}}
            {{- if and .Values.h2ogpt.externalLLM.openAIAzure.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: OPENAI_AZURE_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_AZURE_KEY
            - name: OPENAI_AZURE_API_BASE
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_AZURE_API_BASE
            {{- end }}
            {{- if and .Values.h2ogpt.externalLLM.openAI.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_API_KEY
            {{- end }}
            {{- if and .Values.h2ogpt.externalLLM.replicate.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: REPLICATE_API_TOKEN
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: REPLICATE_API_TOKEN
            {{- end }}
            {{- if .Values.h2ogpt.externalLLM.enabled }}
            - name: H2OGPT_MODEL_LOCK
              value: {{ toJson .Values.h2ogpt.externalLLM.modelLock | quote }}
            - name: H2OGPT_SCORE_MODEL
              value: "None"
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/save
              subPath: save
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-volume
          {{- if not .Values.h2ogpt.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
                storageClassName: {{ .Values.h2ogpt.storage.class }}
          {{- end }}
        {{- if .Values.h2ogpt.stack.enabled }}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if not .Values.vllm.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- end }}
        {{- end }}
{{- end }}
---
{{- if and (.Values.h2ogpt.enabled) (not .Values.h2ogpt.storage.useEphemeral) }}
{{- /*
  PersistentVolumeClaim backing the h2oGPT container's cache/save mounts.
  Rendered only when h2oGPT is enabled and ephemeral storage is not selected.
*/}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-volume
spec:
  accessModes:
    - ReadWriteOnce
  {{- /*
    storageClassName is left unquoted on purpose (NOTE(review): presumably so
    that an unset value renders as null rather than the empty string "" -
    confirm). This note is a template comment because template actions inside
    a YAML "#" comment are still evaluated and rendered by Helm.
  */}}
  storageClassName: {{ .Values.h2ogpt.storage.class }}
  resources:
    requests:
      storage: {{ .Values.h2ogpt.storage.size | quote }}
{{- end }}
---
{{- if and (.Values.tgi.enabled) (not .Values.h2ogpt.stack.enabled ) }}
{{- /*
  Deployment of the TGI inference server.
  Rendered only when TGI is enabled and the h2oGPT stack mode is off.
*/}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-tgi-inference
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-tgi-inference
  {{- if .Values.tgi.updateStrategy }}
  strategy: {{- toYaml .Values.tgi.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-tgi-inference
    spec:
      {{- with .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      {{- /*
        Soft preference: keep pods of THIS Deployment in different zones.
        The selector targets the -tgi-inference app label set on these pods
        (it previously matched the h2oGPT app label), and the stable zone
        label replaces the deprecated failure-domain.beta one.
      */}}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-tgi-inference
                topologyKey: topology.kubernetes.io/zone
      {{- with .Values.tgi.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
          args:
            {{- /* `quote` escapes embedded quotes/backslashes in user-supplied args. */}}
            {{- range $arg := .Values.tgi.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- /* Probe timing fields from values are merged below the fixed httpGet. */}}
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          env:
            {{- range $key, $value := .Values.tgi.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            - secretRef:
                name: {{ .Values.tgi.hfSecret }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      {{- /*
        Only the TGI volume is declared. The former stack-mode vLLM volume
        branch could never render here, because this whole resource is
        guarded by "not stack.enabled".
      */}}
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- if not .Values.tgi.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class }}
          {{- end }}
{{- end }}
---
{{- if and (.Values.tgi.enabled) (not .Values.tgi.storage.useEphemeral)}}
{{- /*
  PersistentVolumeClaim backing the TGI container's cache/data/shm mounts.
  Rendered only when TGI is enabled and ephemeral storage is not selected.
*/}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  {{- /*
    storageClassName is left unquoted on purpose (NOTE(review): presumably so
    that an unset value renders as null rather than the empty string "" -
    confirm). This note is a template comment because template actions inside
    a YAML "#" comment are still evaluated and rendered by Helm.
  */}}
  storageClassName: {{ .Values.tgi.storage.class }}
  resources:
    requests:
      storage: {{ .Values.tgi.storage.size | quote }}
{{- end }}
---
{{- if and (.Values.vllm.enabled) (not .Values.h2ogpt.stack.enabled )}}
{{- /*
  Standalone Deployment of the vLLM OpenAI-compatible inference server.
  Rendered only when vLLM is enabled and the h2oGPT stack mode is off
  (in stack mode the vLLM container is part of the h2oGPT pod instead).
*/}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-vllm-inference
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-vllm-inference
  {{- if .Values.vllm.updateStrategy }}
  strategy: {{- toYaml .Values.vllm.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-vllm-inference
    spec:
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      {{- /*
        Soft preference: keep pods of THIS Deployment in different zones.
        The selector targets the -vllm-inference app label set on these pods
        (it previously matched the h2oGPT app label), and the stable zone
        label replaces the deprecated failure-domain.beta one.
      */}}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-vllm-inference
                topologyKey: topology.kubernetes.io/zone
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- /* `quote` escapes embedded quotes/backslashes in user-supplied args. */}}
            {{- range $arg := .Values.vllm.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- /* Probe timing fields from values are merged below the fixed httpGet. */}}
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- range $key, $value := .Values.vllm.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if not .Values.vllm.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- end }}
{{- end }}
---
{{- if and (.Values.vllm.enabled) (not .Values.vllm.storage.useEphemeral) }}
{{- /*
  PersistentVolumeClaim backing the vLLM container's cache/shm mounts.
  Rendered whenever vLLM is enabled and ephemeral storage is not selected;
  both the standalone vLLM Deployment and the stack-mode h2oGPT pod
  reference this claim name.
*/}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  {{- /*
    storageClassName is left unquoted on purpose (NOTE(review): presumably so
    that an unset value renders as null rather than the empty string "" -
    confirm). This note is a template comment because template actions inside
    a YAML "#" comment are still evaluated and rendered by Helm.
  */}}
  storageClassName: {{ .Values.vllm.storage.class }}
  resources:
    requests:
      storage: {{ .Values.vllm.storage.size | quote }}
{{- end }}