Help! Unable to deploy Falcon 7B Instruct on AWS SageMaker

#59
by np05 - opened

Hello,

  1. I have been trying to deploy the Falcon 7B Instruct model on AWS SageMaker. I followed the steps listed in this blog post: https://www.philschmid.de/sagemaker-falcon-llm.

  2. My configuration is as follows:
    import json
    from sagemaker.huggingface import HuggingFaceModel

    # sagemaker config
    instance_type = "ml.g5.12xlarge"
    number_of_gpu = 4
    health_check_timeout = 300

    # TGI config
    config = {
        'HF_MODEL_ID': "tiiuae/falcon-7b-instruct",  # model_id from hf.co/models
        'SM_NUM_GPUS': json.dumps(number_of_gpu),  # Number of GPUs used per replica
        'MAX_INPUT_LENGTH': json.dumps(500),  # Max length of input text
        'MAX_TOTAL_TOKENS': json.dumps(1000),  # Max length of the generation (including input text)
        # 'HF_MODEL_QUANTIZE': "bitsandbytes",  # comment in to quantize
    }

    # create HuggingFaceModel
    llm_model = HuggingFaceModel(
        role=role,
        image_uri=llm_image,
        env=config,
    )
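
For context, `role` and `llm_image` are not defined in the snippet above; in the blog post I followed they are created roughly as below. Treat this as a sketch of my setup rather than the exact code: the TGI container version pinned here is only illustrative (the post pins its own release).

    import sagemaker
    from sagemaker.huggingface import get_huggingface_llm_image_uri

    sess = sagemaker.Session()
    role = sagemaker.get_execution_role()  # IAM execution role (resolvable from inside a SageMaker notebook)

    # Retrieve the Hugging Face LLM (TGI) inference container image URI for the current region.
    # The version pin below is an assumption for illustration; the blog post pins a specific release.
    llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")
    print(f"llm image uri: {llm_image}")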

  3. Deploy the model to an endpoint

# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    # volume_size=400,  # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
    container_startup_health_check_timeout=health_check_timeout,  # 10 minutes to be able to load the model
)
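
If deployment succeeded, the blog post then invokes the endpoint roughly like the sketch below (TGI generation parameters; the prompt and parameter values are just placeholders, since my deployment never gets that far):

    # Sketch of a request once the endpoint is InService (never reached in my case)
    payload = {
        "inputs": "What is the capital of France?",
        "parameters": {
            "do_sample": True,
            "max_new_tokens": 100,
            "temperature": 0.7,
            "stop": ["<|endoftext|>"],
        },
    }
    response = llm.predict(payload)
    print(response[0]["generated_text"])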

  4. Please see the error I get below:

UnexpectedStatusException Traceback (most recent call last)
Cell In[6], line 3
1 # Deploy model to an endpoint
2 # https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
----> 3 llm = llm_model.deploy(
4 initial_instance_count=1,
5 instance_type=instance_type,
6 # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
7 container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
8 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/huggingface/model.py:311, in HuggingFaceModel.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, **kwargs)
305 if not self.image_uri and instance_type is not None and instance_type.startswith("ml.inf"):
306 self.image_uri = self.serving_image_uri(
307 region_name=self.sagemaker_session.boto_session.region_name,
308 instance_type=instance_type,
309 )
--> 311 return super(HuggingFaceModel, self).deploy(
312 initial_instance_count,
313 instance_type,
314 serializer,
315 deserializer,
316 accelerator_type,
317 endpoint_name,
318 tags,
319 kms_key,
320 wait,
321 data_capture_config,
322 async_inference_config,
323 serverless_inference_config,
324 volume_size=volume_size,
325 model_data_download_timeout=model_data_download_timeout,
326 container_startup_health_check_timeout=container_startup_health_check_timeout,
327 inference_recommendation_id=inference_recommendation_id,
328 explainer_config=explainer_config,
329 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:1328, in Model.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, **kwargs)
1325 if is_explainer_enabled:
1326 explainer_config_dict = explainer_config._to_request_dict()
-> 1328 self.sagemaker_session.endpoint_from_production_variants(
1329 name=self.endpoint_name,
1330 production_variants=[production_variant],
1331 tags=tags,
1332 kms_key=kms_key,
1333 wait=wait,
1334 data_capture_config_dict=data_capture_config_dict,
1335 explainer_config_dict=explainer_config_dict,
1336 async_inference_config_dict=async_inference_config_dict,
1337 )
1339 if self.predictor_cls:
1340 predictor = self.predictor_cls(self.endpoint_name, self.sagemaker_session)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:4577, in Session.endpoint_from_production_variants(self, name, production_variants, tags, kms_key, wait, data_capture_config_dict, async_inference_config_dict, explainer_config_dict)
4574 LOGGER.info("Creating endpoint-config with name %s", name)
4575 self.sagemaker_client.create_endpoint_config(**config_options)
-> 4577 return self.create_endpoint(endpoint_name=name, config_name=name, tags=tags, wait=wait)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:3970, in Session.create_endpoint(self, endpoint_name, config_name, tags, wait)
3966 self.sagemaker_client.create_endpoint(
3967 EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
3968 )
3969 if wait:
-> 3970 self.wait_for_endpoint(endpoint_name)
3971 return endpoint_name

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:4322, in Session.wait_for_endpoint(self, endpoint, poll)
4316 if "CapacityError" in str(reason):
4317 raise exceptions.CapacityError(
4318 message=message,
4319 allowed_statuses=["InService"],
4320 actual_status=status,
4321 )
-> 4322 raise exceptions.UnexpectedStatusException(
4323 message=message,
4324 allowed_statuses=["InService"],
4325 actual_status=status,
4326 )
4327 return desc

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-07-11-00-28-36-511: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..
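
The error only says to check CloudWatch. A minimal way to pull the container logs from the same notebook, assuming the default /aws/sagemaker/Endpoints/<endpoint-name> log group that SageMaker creates for endpoints:

    import boto3

    logs = boto3.client("logs")
    endpoint_name = "huggingface-pytorch-tgi-inference-2023-07-11-00-28-36-511"  # name from the error above
    log_group = f"/aws/sagemaker/Endpoints/{endpoint_name}"

    # Print the most recent events from the failing container's log streams
    for event in logs.filter_log_events(logGroupName=log_group, limit=50)["events"]:
        print(event["message"])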

Hi, I am getting the same error.
Did you solve it?

@pranavnerurkar: Nope, it is not solved.
