File size: 16,354 Bytes
4071f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d08fefa9-5733-47d7-80e1-9535b7102e90",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "------------------------------------------------*"
     ]
    },
    {
     "ename": "UnexpectedStatusException",
     "evalue": "Error hosting endpoint huggingface-pytorch-tgi-inference-2023-08-07-22-35-09-932: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[5], line 29\u001b[0m\n\u001b[1;32m     22\u001b[0m huggingface_model \u001b[38;5;241m=\u001b[39m HuggingFaceModel(\n\u001b[1;32m     23\u001b[0m     image_uri\u001b[38;5;241m=\u001b[39mget_huggingface_llm_image_uri(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhuggingface\u001b[39m\u001b[38;5;124m\"\u001b[39m,version\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0.8.2\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m     24\u001b[0m     env\u001b[38;5;241m=\u001b[39mhub,\n\u001b[1;32m     25\u001b[0m     role\u001b[38;5;241m=\u001b[39mrole, \n\u001b[1;32m     26\u001b[0m )\n\u001b[1;32m     28\u001b[0m \u001b[38;5;66;03m# deploy model to SageMaker Inference\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m predictor \u001b[38;5;241m=\u001b[39m \u001b[43mhuggingface_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     30\u001b[0m \u001b[43m    \u001b[49m\u001b[43minitial_instance_count\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     31\u001b[0m \u001b[43m    \u001b[49m\u001b[43minstance_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mml.g5.4xlarge\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     32\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcontainer_startup_health_check_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1200\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     33\u001b[0m \u001b[43m  \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/huggingface/model.py:313\u001b[0m, in \u001b[0;36mHuggingFaceModel.deploy\u001b[0;34m(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, **kwargs)\u001b[0m\n\u001b[1;32m    306\u001b[0m     inference_tool \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mneuron\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m instance_type\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mml.inf1\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mneuronx\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    307\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimage_uri \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mserving_image_uri(\n\u001b[1;32m    308\u001b[0m         region_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msagemaker_session\u001b[38;5;241m.\u001b[39mboto_session\u001b[38;5;241m.\u001b[39mregion_name,\n\u001b[1;32m    309\u001b[0m         instance_type\u001b[38;5;241m=\u001b[39minstance_type,\n\u001b[1;32m    310\u001b[0m         inference_tool\u001b[38;5;241m=\u001b[39minference_tool,\n\u001b[1;32m    311\u001b[0m     )\n\u001b[0;32m--> 313\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mHuggingFaceModel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    314\u001b[0m \u001b[43m    \u001b[49m\u001b[43minitial_instance_count\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    315\u001b[0m \u001b[43m    \u001b[49m\u001b[43minstance_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    316\u001b[0m \u001b[43m    \u001b[49m\u001b[43mserializer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    317\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdeserializer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    318\u001b[0m \u001b[43m    \u001b[49m\u001b[43maccelerator_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    319\u001b[0m \u001b[43m    \u001b[49m\u001b[43mendpoint_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    320\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtags\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    321\u001b[0m \u001b[43m    \u001b[49m\u001b[43mkms_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    322\u001b[0m \u001b[43m    \u001b[49m\u001b[43mwait\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    323\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_capture_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    324\u001b[0m \u001b[43m    \u001b[49m\u001b[43masync_inference_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    325\u001b[0m \u001b[43m    \u001b[49m\u001b[43mserverless_inference_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    326\u001b[0m \u001b[43m    \u001b[49m\u001b[43mvolume_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvolume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    327\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmodel_data_download_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_data_download_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    328\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcontainer_startup_health_check_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontainer_startup_health_check_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    329\u001b[0m \u001b[43m    \u001b[49m\u001b[43minference_recommendation_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_recommendation_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    330\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexplainer_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexplainer_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    331\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/model.py:1406\u001b[0m, in \u001b[0;36mModel.deploy\u001b[0;34m(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, **kwargs)\u001b[0m\n\u001b[1;32m   1403\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_explainer_enabled:\n\u001b[1;32m   1404\u001b[0m     explainer_config_dict \u001b[38;5;241m=\u001b[39m explainer_config\u001b[38;5;241m.\u001b[39m_to_request_dict()\n\u001b[0;32m-> 1406\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msagemaker_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendpoint_from_production_variants\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1407\u001b[0m \u001b[43m    \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendpoint_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1408\u001b[0m \u001b[43m    \u001b[49m\u001b[43mproduction_variants\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mproduction_variant\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1409\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtags\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1410\u001b[0m \u001b[43m    \u001b[49m\u001b[43mkms_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkms_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1411\u001b[0m \u001b[43m    \u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1412\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_capture_config_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_capture_config_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1413\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexplainer_config_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexplainer_config_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1414\u001b[0m \u001b[43m    \u001b[49m\u001b[43masync_inference_config_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masync_inference_config_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1415\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1417\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpredictor_cls:\n\u001b[1;32m   1418\u001b[0m     predictor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpredictor_cls(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mendpoint_name, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msagemaker_session)\n",
      "File \u001b[0;32m~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:4686\u001b[0m, in \u001b[0;36mSession.endpoint_from_production_variants\u001b[0;34m(self, name, production_variants, tags, kms_key, wait, data_capture_config_dict, async_inference_config_dict, explainer_config_dict)\u001b[0m\n\u001b[1;32m   4683\u001b[0m LOGGER\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCreating endpoint-config with name \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, name)\n\u001b[1;32m   4684\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msagemaker_client\u001b[38;5;241m.\u001b[39mcreate_endpoint_config(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig_options)\n\u001b[0;32m-> 4686\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_endpoint\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   4687\u001b[0m \u001b[43m    \u001b[49m\u001b[43mendpoint_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mendpoint_tags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwait\u001b[49m\n\u001b[1;32m   4688\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:4066\u001b[0m, in \u001b[0;36mSession.create_endpoint\u001b[0;34m(self, endpoint_name, config_name, tags, wait)\u001b[0m\n\u001b[1;32m   4062\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msagemaker_client\u001b[38;5;241m.\u001b[39mcreate_endpoint(\n\u001b[1;32m   4063\u001b[0m     EndpointName\u001b[38;5;241m=\u001b[39mendpoint_name, EndpointConfigName\u001b[38;5;241m=\u001b[39mconfig_name, Tags\u001b[38;5;241m=\u001b[39mtags\n\u001b[1;32m   4064\u001b[0m )\n\u001b[1;32m   4065\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wait:\n\u001b[0;32m-> 4066\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_endpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mendpoint_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4067\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m endpoint_name\n",
      "File \u001b[0;32m~/anaconda3/envs/python3/lib/python3.10/site-packages/sagemaker/session.py:4418\u001b[0m, in \u001b[0;36mSession.wait_for_endpoint\u001b[0;34m(self, endpoint, poll)\u001b[0m\n\u001b[1;32m   4412\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCapacityError\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(reason):\n\u001b[1;32m   4413\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mCapacityError(\n\u001b[1;32m   4414\u001b[0m             message\u001b[38;5;241m=\u001b[39mmessage,\n\u001b[1;32m   4415\u001b[0m             allowed_statuses\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInService\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m   4416\u001b[0m             actual_status\u001b[38;5;241m=\u001b[39mstatus,\n\u001b[1;32m   4417\u001b[0m         )\n\u001b[0;32m-> 4418\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mUnexpectedStatusException(\n\u001b[1;32m   4419\u001b[0m         message\u001b[38;5;241m=\u001b[39mmessage,\n\u001b[1;32m   4420\u001b[0m         allowed_statuses\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInService\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m   4421\u001b[0m         actual_status\u001b[38;5;241m=\u001b[39mstatus,\n\u001b[1;32m   4422\u001b[0m     )\n\u001b[1;32m   4423\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m desc\n",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-08-07-22-35-09-932: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.."
     ]
    }
   ],
   "source": [
    "import json\n",
    "import sagemaker\n",
    "import boto3\n",
    "from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri\n",
    "\n",
    "try:\n",
    "    role = sagemaker.get_execution_role()\n",
    "except ValueError:\n",
    "    iam = boto3.client('iam')\n",
    "    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n",
    "\n",
    "# Hub Model configuration. https://huggingface.co/models\n",
    "hub = {\n",
    "    'HF_MODEL_ID':'meta-llama/Llama-2-13b-chat-hf',\n",
    "    'SM_NUM_GPUS': json.dumps(1),\n",
    "    'HUGGING_FACE_HUB_TOKEN': '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'\n",
    "}\n",
    "\n",
    "assert hub['HUGGING_FACE_HUB_TOKEN'] != '<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>', \"You have to provide a token.\"\n",
    "\n",
    "# create Hugging Face Model Class\n",
    "huggingface_model = HuggingFaceModel(\n",
    "    image_uri=get_huggingface_llm_image_uri(\"huggingface\",version=\"0.8.2\"),\n",
    "    env=hub,\n",
    "    role=role, \n",
    ")\n",
    "\n",
    "# deploy model to SageMaker Inference\n",
    "predictor = huggingface_model.deploy(\n",
    "    initial_instance_count=1,\n",
    "    instance_type=\"ml.g5.4xlarge\",\n",
    "    container_startup_health_check_timeout=1200,\n",
    "  )\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e35b51f-6195-415a-a52f-3ac6d744bd01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# send request\n",
    "predictor.predict({\n",
    "    \"inputs\": \"My name is Julien and I like to\",\n",
    "})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}