$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: bloom-deployment
endpoint_name: bloom-inference
model: azureml:bloom-safetensors:1
model_mount_path: /var/azureml-model
environment_variables:
  WEIGHTS_CACHE_OVERRIDE: /var/azureml-model/bloom-safetensors
  MODEL_ID: bigscience/bloom
  NUM_SHARD: 8
environment:
  image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.2.0
  inference_config:
    liveness_route:
      port: 80
      path: /health
    readiness_route:
      port: 80
      path: /health
    scoring_route:
      port: 80
      path: /generate
instance_type: Standard_ND96amsr_A100_v4
request_settings:
  request_timeout_ms: 90000
  max_concurrent_requests_per_instance: 256
liveness_probe:
  initial_delay: 600
  timeout: 90
  period: 120
  success_threshold: 1
  failure_threshold: 5
readiness_probe:
  initial_delay: 600
  timeout: 90
  period: 120
  success_threshold: 1
  failure_threshold: 5
instance_count: 1
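
# A minimal sketch of how this deployment could be created with the Azure ML CLI v2,
# assuming the endpoint "bloom-inference" and the model asset azureml:bloom-safetensors:1
# referenced above already exist, and that this file is saved as deployment.yaml
# (the filename is an assumption, not part of the original config):
#
#   az ml online-deployment create --file deployment.yaml --all-traffic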