{{- if and .Values.vllm.enabled .Values.tgi.enabled }}
{{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- if .Values.h2ogpt.stack.enabled }}
{{- if not (and .Values.vllm.enabled .Values.h2ogpt.enabled) }}
{{- fail "If h2oGPT stack is enabled, both vLLM and h2oGPT should be enabled" }}
{{- end }}
{{- end }}
---
{{- if .Values.h2ogpt.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}
  labels:
    app: {{ include "h2ogpt.fullname" . }}
spec:
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}
  {{- if .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml .Values.h2ogpt.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}
    spec:
      {{- with .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: failure-domain.beta.kubernetes.io/zone
      {{- if and .Values.tgi.enabled (not .Values.h2ogpt.stack.enabled) }}
      initContainers:
        - name: tgi-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.initImage.repository | default "busybox" }}:{{ .Values.h2ogpt.initImage.tag | default "1.36" }}"
          imagePullPolicy: {{ .Values.h2ogpt.initImage.pullPolicy | default "IfNotPresent" }}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1; do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- if and .Values.vllm.enabled (not .Values.h2ogpt.stack.enabled) }}
      initContainers:
        - name: vllm-check
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.initImage.repository | default "busybox" }}:{{ .Values.h2ogpt.initImage.tag | default "1.36" }}"
          imagePullPolicy: {{ .Values.h2ogpt.initImage.pullPolicy | default "IfNotPresent" }}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1; do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
      {{- end }}
      {{- with .Values.h2ogpt.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
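        {{- /*
        Stack mode (h2ogpt.stack.enabled) runs vLLM as a second container in this same
        pod, so h2oGPT reaches it at localhost:5000 and the init-container wait used in
        split mode is not needed; readiness is handled by the curl loop in the h2oGPT
        container's args instead. A minimal values sketch for stack mode -- the key
        names are taken from this template, the model id is hypothetical:

          h2ogpt:
            enabled: true
            stack:
              enabled: true
          vllm:
            enabled: true
            containerArgs:
              - "--model"
              - "some-org/some-model"
        */}}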
        {{- if .Values.h2ogpt.stack.enabled }}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- range $arg := .Values.vllm.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- range $key, $value := .Values.vllm.env }}
            - name: "{{ $key }}"
              value: "{{ $value }}"
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
        {{- end }}
        - name: {{ include "h2ogpt.fullname" . }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          command: ["/bin/bash", "-c"]
          {{- if .Values.h2ogpt.stack.enabled }}
          args:
            - >
              while [[ "$(curl --insecure -s -o /dev/null -w '%{http_code}' http://localhost:5000/v1/models)" != "200" ]]; do
                echo "Waiting for inference service to become ready... (2sec)";
                sleep 2;
              done;
              python3 /workspace/generate.py
          {{- end }}
          {{- if not .Values.h2ogpt.stack.enabled }}
          args:
            - >
              python3 /workspace/generate.py
          {{- end }}
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
          {{- if .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.h2ogpt.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
          env:
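            {{- /*
            h2ogpt_inference_server tells h2oGPT which backend to call: a plain
            "http://host:port" value points at a TGI server, while the "vllm:host:port"
            form marks an OpenAI-compatible vLLM server (stack mode uses
            "vllm:localhost:5000" since the server shares the pod). Hypothetical
            rendered values for a release named "my-release":
              value: "http://my-release-tgi-inference:8080"
              value: "vllm:my-release-vllm-inference:5000"
            */}}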
            {{- if and .Values.tgi.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled) }}
            - name: h2ogpt_inference_server
              value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
            {{- end }}
            {{- if and .Values.vllm.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled) }}
            - name: h2ogpt_inference_server
              value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
            {{- end }}
            {{- if and .Values.h2ogpt.stack.enabled (not .Values.h2ogpt.externalLLM.enabled) }}
            - name: h2ogpt_inference_server
              value: "vllm:localhost:5000"
            {{- end }}
            {{- range $key, $value := .Values.h2ogpt.env }}
            - name: "{{ $key }}"
              value: "{{ $value }}"
            {{- end }}
            {{- if and .Values.h2ogpt.externalLLM.openAIAzure.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: OPENAI_AZURE_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_AZURE_KEY
            - name: OPENAI_AZURE_API_BASE
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_AZURE_API_BASE
            {{- end }}
            {{- if and .Values.h2ogpt.externalLLM.openAI.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: OPENAI_API_KEY
            {{- end }}
            {{- if and .Values.h2ogpt.externalLLM.replicate.enabled .Values.h2ogpt.externalLLM.enabled }}
            - name: REPLICATE_API_TOKEN
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.h2ogpt.externalLLM.secret }}
                  key: REPLICATE_API_TOKEN
            {{- end }}
            {{- if .Values.h2ogpt.externalLLM.enabled }}
            - name: H2OGPT_MODEL_LOCK
              value: {{ toJson .Values.h2ogpt.externalLLM.modelLock | quote }}
            - name: H2OGPT_SCORE_MODEL
              value: None
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-volume
              mountPath: /workspace/save
              subPath: save
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-volume
          {{- if not .Values.h2ogpt.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
                storageClassName: {{ .Values.h2ogpt.storage.class }}
          {{- end }}
        {{- if .Values.h2ogpt.stack.enabled }}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if not .Values.vllm.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- end }}
        {{- end }}
{{- end }}
---
{{- if and .Values.h2ogpt.enabled (not .Values.h2ogpt.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-volume
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: {{ .Values.h2ogpt.storage.class }}
  resources:
    requests:
      storage: {{ .Values.h2ogpt.storage.size | quote }}
{{- end }}
---
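{{- /*
Split mode: TGI runs as its own Deployment behind the <fullname>-tgi-inference
Service; the guard at the top of this file already prevents enabling TGI and vLLM
together. A values sketch, assuming the upstream TGI image -- the tag, model id,
and secret name are hypothetical; the secret is expected to carry the Hugging Face
token consumed via envFrom below:

  tgi:
    enabled: true
    image:
      repository: ghcr.io/huggingface/text-generation-inference
      tag: "1.4"
    containerArgs:
      - "--model-id"
      - "some-org/some-model"
    hfSecret: hf-token-secret
*/}}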
{{- if and .Values.tgi.enabled (not .Values.h2ogpt.stack.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-tgi-inference
spec:
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-tgi-inference
  {{- if .Values.tgi.updateStrategy }}
  strategy: {{- toYaml .Values.tgi.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-tgi-inference
    spec:
      {{- with .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: failure-domain.beta.kubernetes.io/zone
      {{- with .Values.tgi.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
          args:
            {{- range $arg := .Values.tgi.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          env:
            {{- range $key, $value := .Values.tgi.env }}
            - name: "{{ $key }}"
              value: "{{ $value }}"
            {{- end }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            - secretRef:
                name: {{ .Values.tgi.hfSecret }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- if not .Values.tgi.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class }}
          {{- end }}
{{- end }}
---
{{- if and .Values.tgi.enabled (not .Values.tgi.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: {{ .Values.tgi.storage.class }}
  resources:
    requests:
      storage: {{ .Values.tgi.storage.size | quote }}
{{- end }}
---
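{{- /*
Split mode: vLLM runs as its own Deployment with the same container spec as the
stack-mode sidecar above, listening on port 5000 behind the <fullname>-vllm-inference
Service. Engine flags are appended through vllm.containerArgs; a sketch with
hypothetical values:

  vllm:
    enabled: true
    containerArgs:
      - "--model"
      - "some-org/some-model"
      - "--tensor-parallel-size"
      - "2"
*/}}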
{{- if and .Values.vllm.enabled (not .Values.h2ogpt.stack.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference
  labels:
    app: {{ include "h2ogpt.fullname" . }}-vllm-inference
spec:
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ include "h2ogpt.fullname" . }}-vllm-inference
  {{- if .Values.vllm.updateStrategy }}
  strategy: {{- toYaml .Values.vllm.updateStrategy | nindent 4 }}
  {{- end }}
  template:
    metadata:
      {{- with .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        app: {{ include "h2ogpt.fullname" . }}-vllm-inference
    spec:
      {{- with .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                topologyKey: failure-domain.beta.kubernetes.io/zone
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- range $arg := .Values.vllm.containerArgs }}
            - "{{ $arg }}"
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            {{- range $key, $value := .Values.vllm.env }}
            - name: "{{ $key }}"
              value: "{{ $value }}"
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- if not .Values.vllm.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
                storageClassName: {{ .Values.vllm.storage.class }}
          {{- end }}
{{- end }}
---
{{- if and .Values.vllm.enabled (not .Values.vllm.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: {{ .Values.vllm.storage.class }}
  resources:
    requests:
      storage: {{ .Values.vllm.storage.size | quote }}
{{- end }}
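{{- /*
Each workload picks its storage through <component>.storage: useEphemeral false claims
the chart-managed PersistentVolumeClaim declared above, while true uses a generic
ephemeral volume that is created and deleted with the pod (model caches are then
re-downloaded on restart). A sketch with hypothetical size and class values:

  vllm:
    storage:
      useEphemeral: true
      size: 512Gi
      class: gp3
*/}}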