Spaces:

iblfe
/

test

Runtime error

File size: 22,213 Bytes

b585c7f

{{- /*
Values sanity checks. They render no output; an invalid combination aborts
rendering through `fail`.
*/}}
{{- /* TGI and vLLM must not be enabled together. */}}
{{- if and .Values.tgi.enabled .Values.vllm.enabled }}
{{- fail "Both TGI and vLLM cannot be enabled at the same time. Enable only one and try again" }}
{{- end }}
{{- /* Stack mode requires both vLLM and h2oGPT to be enabled. */}}
{{- if and .Values.h2ogpt.stack.enabled (or (not .Values.vllm.enabled) (not .Values.h2ogpt.enabled)) }}
{{- fail "If h2oGPT stack is enabled, both vLLM and h2oGPT should be enabled" }}
{{- end }}
---
{{- if .Values.h2ogpt.enabled }}
{{- /*
h2oGPT application Deployment, rendered only when .Values.h2ogpt.enabled is set.
$appName caches the output of the "h2ogpt.fullname" helper, which serves as the
resource name and as the value of the "app" label and selector.
*/}}
{{- $appName := include "h2ogpt.fullname" . }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ $appName }}
  labels:
    app: {{ $appName }}
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.h2ogpt.autoscaling.enabled }}
  replicas: {{ .Values.h2ogpt.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ $appName }}
  {{- with .Values.h2ogpt.updateStrategy }}
  strategy: {{- toYaml . | nindent 4 }}
  {{- end }}
  template:
    metadata:
      labels:
        app: {{ $appName }}
      {{- if .Values.h2ogpt.podAnnotations }}
      annotations:
        {{- toYaml .Values.h2ogpt.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- if .Values.h2ogpt.nodeSelector }}
      nodeSelector:
        {{- toYaml .Values.h2ogpt.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.h2ogpt.tolerations }}
      tolerations:
        {{- toYaml .Values.h2ogpt.tolerations | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.h2ogpt.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          # Soft rule: prefer a zone that does not already run a pod carrying
          # this Deployment's "app" label.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}
                # topology.kubernetes.io/zone replaces the deprecated
                # failure-domain.beta.kubernetes.io/zone node label.
                topologyKey: topology.kubernetes.io/zone
{{- /*
Init container that polls the separately deployed inference Service with wget
every 5 seconds until a request succeeds. Nothing is rendered in stack mode.
*/}}
{{- if not .Values.h2ogpt.stack.enabled }}
{{- if .Values.tgi.enabled }}
      initContainers:
        - name: tgi-check
          image: "{{ default "busybox" .Values.h2ogpt.initImage.repository }}:{{ default "1.36" .Values.h2ogpt.initImage.tag }}"
          imagePullPolicy: {{ default "IfNotPresent" .Values.h2ogpt.initImage.pullPolicy }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}/ >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
{{- end }}
{{- if .Values.vllm.enabled }}
      initContainers:
        - name: vllm-check
          image: "{{ default "busybox" .Values.h2ogpt.initImage.repository }}:{{ default "1.36" .Values.h2ogpt.initImage.tag }}"
          imagePullPolicy: {{ default "IfNotPresent" .Values.h2ogpt.initImage.pullPolicy }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          command: ["/bin/sh", "-c"]
          args:
            - >
              until wget -O- http://{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}/v1/models >/dev/null 2>&1;
              do
                echo "Waiting for inference service to become ready...";
                sleep 5;
              done
{{- end }}
{{- end }}
      {{- with .Values.h2ogpt.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        {{- if .Values.h2ogpt.stack.enabled }}
        {{- /*
        Stack mode only: run the vLLM OpenAI-compatible API server as an extra
        container in the h2oGPT pod, listening on port 5000.
        */}}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- /* `quote` escapes embedded quotes and backslashes in user-supplied arguments. */}}
            {{- range $arg := .Values.vllm.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- /* Probe timing and thresholds come from values; the HTTP target is fixed here. */}}
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
          - name: NCCL_IGNORE_DISABLED_P2P
            value: "1"
          {{- /* Names and values are emitted via `quote`, so they always render as valid YAML strings. */}}
          {{- range $key, $value := .Values.vllm.env }}
          - name: {{ $key | quote }}
            value: {{ $value | quote }}
          {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
          {{- end }}
        {{- /* Main h2oGPT container: serves port 7860 ("http") and port 8888 ("gpt"). */}}
        - name: {{ include "h2ogpt.fullname" . }}
          image: "{{ .Values.h2ogpt.image.repository }}:{{ .Values.h2ogpt.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.h2ogpt.image.pullPolicy }}
          securityContext:
            {{- toYaml .Values.h2ogpt.securityContext | nindent 12 }}
          command: ["/bin/bash", "-c"]
          {{- if .Values.h2ogpt.stack.enabled }}
          {{- /* Stack mode: wait until localhost:5000/v1/models returns 200 before starting generate.py. */}}
          args:
          - >
            while [[ "$(curl --insecure -s -o /dev/null -w ''%{http_code}''
            http://localhost:5000/v1/models)" != "200" ]]; do
              echo "Waiting for inference service to become ready... (2sec)"
              sleep 2
            done

            python3 /workspace/generate.py
          {{- else }}
          args:
          - >
            python3 /workspace/generate.py
          {{- end }}
          ports:
            - name: http
              containerPort: 7860
              protocol: TCP
            - name: gpt
              containerPort: 8888
              protocol: TCP
          {{- /* Each probe is rendered only when configured; values supply everything except the HTTP target. */}}
          {{- with .Values.h2ogpt.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.h2ogpt.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml . | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.h2ogpt.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-config
          env:
          {{- /*
          h2ogpt_inference_server selects the inference back end. It is set only
          when no external LLM is configured: the TGI Service, the vLLM Service,
          or the in-pod vLLM container on localhost:5000 in stack mode.
          */}}
          {{- if and .Values.tgi.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled ) }}
          - name: h2ogpt_inference_server
            value: "http://{{ include "h2ogpt.fullname" . }}-tgi-inference:{{ .Values.tgi.service.port }}"
          {{- end }}
          {{- if and .Values.vllm.enabled (not .Values.h2ogpt.externalLLM.enabled) (not .Values.h2ogpt.stack.enabled ) }}
          - name: h2ogpt_inference_server
            value: "vllm:{{ include "h2ogpt.fullname" . }}-vllm-inference:{{ .Values.vllm.service.port }}"
          {{- end }}
          {{- if and .Values.h2ogpt.stack.enabled (not .Values.h2ogpt.externalLLM.enabled)  }}
          - name: h2ogpt_inference_server
            value: "vllm:localhost:5000"
          {{- end }}
          {{- /* User-supplied variables; `quote` escapes embedded quotes and backslashes. */}}
          {{- range $key, $value := .Values.h2ogpt.env }}
          - name: {{ $key | quote }}
            value: {{ $value | quote }}
          {{- end }}
          {{- /* External LLM credentials are read from the Secret named by .Values.h2ogpt.externalLLM.secret. */}}
          {{- if and .Values.h2ogpt.externalLLM.openAIAzure.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: OPENAI_AZURE_KEY
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_AZURE_KEY
          - name: OPENAI_AZURE_API_BASE
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_AZURE_API_BASE
          {{- end }}
          {{- if and .Values.h2ogpt.externalLLM.openAI.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: OPENAI_API_KEY
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: OPENAI_API_KEY
          {{- end }}
          {{- if and .Values.h2ogpt.externalLLM.replicate.enabled .Values.h2ogpt.externalLLM.enabled }}
          - name: REPLICATE_API_TOKEN
            valueFrom:
              secretKeyRef:
                name: {{ .Values.h2ogpt.externalLLM.secret }}
                key: REPLICATE_API_TOKEN
          {{- end }}
          {{- if .Values.h2ogpt.externalLLM.enabled }}
          - name: H2OGPT_MODEL_LOCK
            value: {{ toJson .Values.h2ogpt.externalLLM.modelLock | quote }}
          # The literal string "None", quoted so it is unambiguously a string.
          - name: H2OGPT_SCORE_MODEL
            value: "None"
          {{- end }}
          {{- /* One volume backs both mounts of the h2oGPT container, separated by subPath. */}}
          {{- $dataVolume := printf "%s-volume" (include "h2ogpt.fullname" .) }}
          volumeMounts:
            - name: {{ $dataVolume }}
              subPath: cache
              mountPath: /workspace/.cache
            - name: {{ $dataVolume }}
              subPath: save
              mountPath: /workspace/save
      volumes:
        {{- /* Each volume is a generic ephemeral volume or a reference to the PVC of the same name. */}}
        - name: {{ $dataVolume }}
          {{- if .Values.h2ogpt.storage.useEphemeral }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                storageClassName: {{ .Values.h2ogpt.storage.class }}
                resources:
                  requests:
                    storage: {{ .Values.h2ogpt.storage.size | quote }}
          {{- else }}
          persistentVolumeClaim:
            claimName: {{ $dataVolume }}
          {{- end }}
        {{- if .Values.h2ogpt.stack.enabled }}
        {{- /* Stack mode: the in-pod vLLM container needs its own volume. */}}
        {{- $vllmVolume := printf "%s-vllm-inference-volume" (include "h2ogpt.fullname" .) }}
        - name: {{ $vllmVolume }}
          {{- if .Values.vllm.storage.useEphemeral }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                storageClassName: {{ .Values.vllm.storage.class }}
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
          {{- else }}
          persistentVolumeClaim:
            claimName: {{ $vllmVolume }}
          {{- end }}
        {{- end }}
{{- end }}
---
{{- /* PVC for the h2oGPT container; rendered only when ephemeral storage is not used. */}}
{{- if and (.Values.h2ogpt.enabled) (not .Values.h2ogpt.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-volume
spec:
  accessModes:
    - ReadWriteOnce
  # Keep storageClassName unquoted: an unset class then renders as null, which
  # selects the cluster's default StorageClass. A quoted empty string ("")
  # would instead request a volume with no class.
  storageClassName: {{ .Values.h2ogpt.storage.class }}
  resources:
    requests:
      storage: {{ .Values.h2ogpt.storage.size | quote }}
{{- end }}

---
{{- if and .Values.tgi.enabled (not .Values.h2ogpt.stack.enabled) }}
{{- /*
TGI inference Deployment, rendered when TGI is enabled and stack mode is off.
$tgiName is the "h2ogpt.fullname" helper output plus "-tgi-inference"; it is
the resource name and the value of the "app" label and selector.
*/}}
{{- $tgiName := printf "%s-tgi-inference" (include "h2ogpt.fullname" .) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ $tgiName }}
  labels:
    app: {{ $tgiName }}
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.tgi.autoscaling.enabled }}
  replicas: {{ .Values.tgi.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ $tgiName }}
  {{- with .Values.tgi.updateStrategy }}
  strategy: {{- toYaml . | nindent 4 }}
  {{- end }}
  template:
    metadata:
      labels:
        app: {{ $tgiName }}
      {{- if .Values.tgi.podAnnotations }}
      annotations:
        {{- toYaml .Values.tgi.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- if .Values.tgi.nodeSelector }}
      nodeSelector:
        {{- toYaml .Values.tgi.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.tgi.tolerations }}
      tolerations:
        {{- toYaml .Values.tgi.tolerations | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          # Soft rule: prefer a zone that does not already run another pod of
          # this Deployment. The selector matches this Deployment's own "app"
          # label (fullname plus "-tgi-inference"), not the h2oGPT pods' label.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-tgi-inference
                # topology.kubernetes.io/zone replaces the deprecated
                # failure-domain.beta.kubernetes.io/zone node label.
                topologyKey: topology.kubernetes.io/zone
      {{- with .Values.tgi.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        {{- /* TGI server container; listens on port 80 ("http"). */}}
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference
          securityContext:
            {{- toYaml .Values.tgi.securityContext | nindent 12 }}
          image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag }}"
          imagePullPolicy: {{ .Values.tgi.image.pullPolicy }}
          command: []
          args:
            {{- /* `quote` escapes embedded quotes and backslashes in user-supplied arguments. */}}
            {{- range $arg := .Values.tgi.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          {{- /* Probe timing and thresholds come from values; the HTTP target is fixed here. */}}
          {{- if .Values.tgi.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.tgi.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.tgi.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.tgi.resources | nindent 12 }}
          env:
          {{- /* Names and values are emitted via `quote`, so they always render as valid YAML strings. */}}
          {{- range $key, $value := .Values.tgi.env }}
          - name: {{ $key | quote }}
            value: {{ $value | quote }}
          {{- end }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-tgi-inference-config
            {{- /* Reference the Secret only when a name is configured; a secretRef needs a non-empty name. */}}
            {{- with .Values.tgi.hfSecret }}
            - secretRef:
                name: {{ . }}
            {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /app/cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /data
              subPath: data
            - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        {{- /*
        Single volume backing all TGI mounts: a reference to the PVC of the same
        name, or a generic ephemeral volume when tgi.storage.useEphemeral is set.
        No vLLM volume is declared here because this Deployment is rendered only
        when stack mode is off and none of its containers mount one.
        */}}
        - name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- if not .Values.tgi.storage.useEphemeral }}
          persistentVolumeClaim:
            claimName: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
          {{- else }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: {{ .Values.tgi.storage.size | quote }}
                storageClassName: {{ .Values.tgi.storage.class }}
          {{- end }}
{{- end }}
---
{{- /* PVC for the TGI Deployment; rendered only when ephemeral storage is not used. */}}
{{- if and (.Values.tgi.enabled) (not .Values.tgi.storage.useEphemeral)}}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-tgi-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  # Keep storageClassName unquoted: an unset class then renders as null, which
  # selects the cluster's default StorageClass. A quoted empty string ("")
  # would instead request a volume with no class.
  storageClassName: {{ .Values.tgi.storage.class }}
  resources:
    requests:
      storage: {{ .Values.tgi.storage.size | quote }}
{{- end }}
---
{{- if and .Values.vllm.enabled (not .Values.h2ogpt.stack.enabled) }}
{{- /*
Standalone vLLM inference Deployment, rendered when vLLM is enabled and stack
mode is off. $vllmName is the "h2ogpt.fullname" helper output plus
"-vllm-inference"; it is the resource name and the value of the "app" label
and selector.
*/}}
{{- $vllmName := printf "%s-vllm-inference" (include "h2ogpt.fullname" .) }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ $vllmName }}
  labels:
    app: {{ $vllmName }}
spec:
  {{- /* replicas is omitted when autoscaling is enabled. */}}
  {{- if not .Values.vllm.autoscaling.enabled }}
  replicas: {{ .Values.vllm.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      app: {{ $vllmName }}
  {{- with .Values.vllm.updateStrategy }}
  strategy: {{- toYaml . | nindent 4 }}
  {{- end }}
  template:
    metadata:
      labels:
        app: {{ $vllmName }}
      {{- if .Values.vllm.podAnnotations }}
      annotations:
        {{- toYaml .Values.vllm.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- if .Values.vllm.nodeSelector }}
      nodeSelector:
        {{- toYaml .Values.vllm.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.vllm.tolerations }}
      tolerations:
        {{- toYaml .Values.vllm.tolerations | nindent 8 }}
      {{- end }}
      securityContext:
        {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }}
      affinity:
        podAntiAffinity:
          # Soft rule: prefer a zone that does not already run another pod of
          # this Deployment. The selector matches this Deployment's own "app"
          # label (fullname plus "-vllm-inference"), not the h2oGPT pods' label.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - {{ include "h2ogpt.fullname" . }}-vllm-inference
                # topology.kubernetes.io/zone replaces the deprecated
                # failure-domain.beta.kubernetes.io/zone node label.
                topologyKey: topology.kubernetes.io/zone
      {{- with .Values.vllm.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        {{- /* vLLM OpenAI-compatible API server; listens on port 5000 ("http"). */}}
        - name: {{ include "h2ogpt.fullname" . }}-vllm-inference
          securityContext:
            {{- toYaml .Values.vllm.securityContext | nindent 12 }}
          image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.vllm.image.pullPolicy }}
          command: ["/h2ogpt_conda/vllm_env/bin/python3.10"]
          args:
            - "-m"
            - "vllm.entrypoints.openai.api_server"
            - "--port"
            - "5000"
            - "--host"
            - "0.0.0.0"
            - "--download-dir"
            - "/workspace/.cache/huggingface/hub"
            {{- /* `quote` escapes embedded quotes and backslashes in user-supplied arguments. */}}
            {{- range $arg := .Values.vllm.containerArgs }}
            - {{ $arg | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: 5000
              protocol: TCP
          {{- /* Probe timing and thresholds come from values; the HTTP target is fixed here. */}}
          {{- if .Values.vllm.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.livenessProbe | nindent 12 }}
          {{- end }}
          {{- if .Values.vllm.readinessProbe }}
          readinessProbe:
            httpGet:
              path: /
              scheme: HTTP
              port: http
            {{- toYaml .Values.vllm.readinessProbe | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.vllm.resources | nindent 12 }}
          envFrom:
            - configMapRef:
                name: {{ include "h2ogpt.fullname" . }}-vllm-inference-config
          env:
          - name: NCCL_IGNORE_DISABLED_P2P
            value: "1"
          {{- /* Names and values are emitted via `quote`, so they always render as valid YAML strings. */}}
          {{- range $key, $value := .Values.vllm.env }}
          - name: {{ $key | quote }}
            value: {{ $value | quote }}
          {{- end }}
          volumeMounts:
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /workspace/.cache
              subPath: cache
            - name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
              mountPath: /dev/shm
              subPath: shm
      volumes:
        {{- /* Single volume for the vLLM container: a generic ephemeral volume or the PVC of the same name. */}}
        {{- $vllmVolume := printf "%s-vllm-inference-volume" (include "h2ogpt.fullname" .) }}
        - name: {{ $vllmVolume }}
          {{- if .Values.vllm.storage.useEphemeral }}
          ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                storageClassName: {{ .Values.vllm.storage.class }}
                resources:
                  requests:
                    storage: {{ .Values.vllm.storage.size | quote }}
          {{- else }}
          persistentVolumeClaim:
            claimName: {{ $vllmVolume }}
          {{- end }}
{{- end }}
---
{{- /*
PVC for vLLM storage; rendered only when ephemeral storage is not used. The
condition does not test stack mode, so the claim is also rendered when vLLM
runs inside the h2oGPT pod.
*/}}
{{- if and (.Values.vllm.enabled) (not .Values.vllm.storage.useEphemeral) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "h2ogpt.fullname" . }}-vllm-inference-volume
spec:
  accessModes:
    - ReadWriteOnce
  # Keep storageClassName unquoted: an unset class then renders as null, which
  # selects the cluster's default StorageClass. A quoted empty string ("")
  # would instead request a volume with no class.
  storageClassName: {{ .Values.vllm.storage.class }}
  resources:
    requests:
      storage: {{ .Values.vllm.storage.size | quote }}
{{- end }}