meg-huggingface committed · Commit a9f6487 · 1 Parent(s): e79b5e9
Trying to handle endpoint errors
src/backend/inference_endpoint.py
CHANGED
@@ -5,6 +5,7 @@ import logging
 from huggingface_hub import create_inference_endpoint, get_inference_endpoint
 from src.backend.run_toxicity_eval import get_generation
 from src.logging import setup_logger
+import requests
 logging.basicConfig(level=logging.DEBUG)
 logger = setup_logger(__name__)
 TIMEOUT=20
@@ -17,11 +18,53 @@ def create_endpoint(endpoint_name, repository, framework="pytorch", task="text-g
         endpoint = create_inference_endpoint(endpoint_name, repository=repository, framework=framework, task=task, accelerator=accelerator, vendor=vendor, region=region, type=type, instance_size=instance_size, instance_type=instance_type
         )
     except huggingface_hub.utils._errors.HfHubHTTPError as e:
-        logger.debug("Hit the following exception:")
+        endpoint = update_endpoint_exception(accelerator, e, endpoint,
+                                             endpoint_name, framework,
+                                             instance_size, instance_type,
+                                             repository, task)
+    except requests.exceptions.HTTPError as e:
+        endpoint = update_endpoint_exception(accelerator, e, endpoint,
+                                             endpoint_name, framework,
+                                             instance_size, instance_type,
+                                             repository, task)
+    except Exception as e:
+        logger.debug("Hit error")
         logger.debug(e)
-        logger.debug("Attempting to continue.")
+        sys.exit()
+
+    endpoint.fetch()
+    logger.info("Endpoint status: %s." % (endpoint.status))
+    if endpoint.status == "scaledToZero":
+        # Send a request to wake it up.
+        get_generation(endpoint.url, "Wake up")
+        sleep(TIMEOUT)
+    i = 0
+    while endpoint.status in ["pending", "initializing"]:# aka, not in ["failed", "running"]
+        if i >= 20:
+            logger.info("Model failed to respond. Exiting.")
+            sys.exit()
+        logger.debug("Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
+        sleep(TIMEOUT)
+        endpoint.fetch()
+        logger.debug("Endpoint status: %s." % (endpoint.status))
+        i += 1
+    logger.info("Endpoint created:")
+    logger.info(endpoint)
+    generation_url = endpoint.url
+    return generation_url
+
+
+def update_endpoint_exception(accelerator, e, endpoint, endpoint_name,
+                              framework, instance_size, instance_type,
+                              repository, task):
+    logger.debug("Hit the following exception:")
+    logger.debug(e)
+    logger.debug("Attempting to continue.")
+    try:
         endpoint = get_inference_endpoint(endpoint_name)
-        endpoint.update(repository=repository, framework=framework, task=task, accelerator=accelerator, instance_size=instance_size, instance_type=instance_type)
+        endpoint.update(repository=repository, framework=framework, task=task,
+                        accelerator=accelerator, instance_size=instance_size,
+                        instance_type=instance_type)
     except huggingface_hub.utils._errors.BadRequestError as e:
         logger.debug("Hit the following exception:")
         logger.debug(e)
@@ -48,31 +91,7 @@ def create_endpoint(endpoint_name, repository, framework="pytorch", task="text-g
         else:
             logger.info("Getting expensive to try to run this model without human oversight. Exiting.")
             sys.exit()
-
-        logger.debug("Hit error")
-        logger.debug(e)
-        sys.exit()
-
-    endpoint.fetch()
-    logger.info("Endpoint status: %s." % (endpoint.status))
-    if endpoint.status == "scaledToZero":
-        # Send a request to wake it up.
-        get_generation(endpoint.url, "Wake up")
-        sleep(TIMEOUT)
-    i = 0
-    while endpoint.status in ["pending", "initializing"]:# aka, not in ["failed", "running"]
-        if i >= 20:
-            logger.info("Model failed to respond. Exiting.")
-            sys.exit()
-        logger.debug("Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
-        sleep(TIMEOUT)
-        endpoint.fetch()
-        logger.debug("Endpoint status: %s." % (endpoint.status))
-        i += 1
-    logger.info("Endpoint created:")
-    logger.info(endpoint)
-    generation_url = endpoint.url
-    return generation_url
+    return endpoint
 
 
 if __name__ == '__main__':
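For reference, the flow this commit converges on is "create the endpoint, and if that fails, reuse and reconfigure the existing one." Below is a minimal, self-contained sketch of that pattern using only public huggingface_hub APIs; the endpoint name, repository, and hardware settings are placeholders, not values from this repo:

import requests
from huggingface_hub import create_inference_endpoint, get_inference_endpoint
from huggingface_hub.utils import HfHubHTTPError

# Placeholder configuration, not taken from this repo.
ENDPOINT_NAME = "toxicity-eval-endpoint"
REPOSITORY = "some-org/some-model"

try:
    endpoint = create_inference_endpoint(
        ENDPOINT_NAME,
        repository=REPOSITORY,
        framework="pytorch",
        task="text-generation",
        accelerator="gpu",
        vendor="aws",
        region="us-east-1",
        type="protected",
        instance_size="x4",
        instance_type="nvidia-l4",
    )
except (HfHubHTTPError, requests.exceptions.HTTPError):
    # Creation commonly fails because an endpoint with this name already
    # exists; fall back to fetching it and updating its configuration.
    endpoint = get_inference_endpoint(ENDPOINT_NAME)
    endpoint.update(repository=REPOSITORY,
                    instance_size="x4", instance_type="nvidia-l4")

The diff imports the error class from huggingface_hub.utils._errors; that underscore module is private, so the public re-export used in the sketch is the more stable import path.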
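The fetch()/sleep() loop in the diff (up to 20 checks, TIMEOUT=20 seconds apart) re-implements what huggingface_hub already ships as InferenceEndpoint.wait(). A sketch of the same 400-second budget using the built-in helper; the endpoint name is again a placeholder, and note that wait() only polls, so a scaledToZero endpoint still needs a first request (or endpoint.resume()) to start it:

from huggingface_hub import get_inference_endpoint

endpoint = get_inference_endpoint("toxicity-eval-endpoint")  # placeholder name
try:
    # Blocks until the endpoint reports "running"; raises if the deployment
    # fails or the timeout elapses. 400s mirrors the loop's 20 checks x 20s.
    endpoint.wait(timeout=400, refresh_every=20)
except Exception as e:
    raise SystemExit("Model failed to respond: %s" % e)
print("Endpoint ready at %s" % endpoint.url)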
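The "Wake up" call in the diff goes through this repo's get_generation() helper, which is not shown here. As a rough stand-in, the request it needs to send looks like the sketch below; the header and payload shapes assume a standard text-generation Inference Endpoint, and HF_TOKEN is a placeholder environment variable:

import os
import requests

def wake_up(endpoint_url):
    # Any authenticated POST wakes a scaledToZero endpoint; while the
    # container is still cold-starting, the service typically answers 503.
    response = requests.post(
        endpoint_url,
        headers={"Authorization": "Bearer %s" % os.environ["HF_TOKEN"]},
        json={"inputs": "Wake up"},
        timeout=20,  # matches TIMEOUT=20 in the module above
    )
    return response.status_code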