meg-huggingface committed on
Commit
a9f6487
·
1 Parent(s): e79b5e9

Trying to handle endpoint errors

Browse files
Files changed (1) hide show
  1. src/backend/inference_endpoint.py +47 -28
src/backend/inference_endpoint.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  from huggingface_hub import create_inference_endpoint, get_inference_endpoint
6
  from src.backend.run_toxicity_eval import get_generation
7
  from src.logging import setup_logger
 
8
  logging.basicConfig(level=logging.DEBUG)
9
  logger = setup_logger(__name__)
10
  TIMEOUT=20
@@ -17,11 +18,53 @@ def create_endpoint(endpoint_name, repository, framework="pytorch", task="text-g
17
  endpoint = create_inference_endpoint(endpoint_name, repository=repository, framework=framework, task=task, accelerator=accelerator, vendor=vendor, region=region, type=type, instance_size=instance_size, instance_type=instance_type
18
  )
19
  except huggingface_hub.utils._errors.HfHubHTTPError as e:
20
- logger.debug("Hit the following exception:")
 
 
 
 
 
 
 
 
 
 
21
  logger.debug(e)
22
- logger.debug("Attempting to continue.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  endpoint = get_inference_endpoint(endpoint_name)
24
- endpoint.update(repository=repository, framework=framework, task=task, accelerator=accelerator, instance_size=instance_size, instance_type=instance_type)
 
 
25
  except huggingface_hub.utils._errors.BadRequestError as e:
26
  logger.debug("Hit the following exception:")
27
  logger.debug(e)
@@ -48,31 +91,7 @@ def create_endpoint(endpoint_name, repository, framework="pytorch", task="text-g
48
  else:
49
  logger.info("Getting expensive to try to run this model without human oversight. Exiting.")
50
  sys.exit()
51
- except Exception as e:
52
- logger.debug("Hit error")
53
- logger.debug(e)
54
- sys.exit()
55
-
56
- endpoint.fetch()
57
- logger.info("Endpoint status: %s." % (endpoint.status))
58
- if endpoint.status == "scaledToZero":
59
- # Send a request to wake it up.
60
- get_generation(endpoint.url, "Wake up")
61
- sleep(TIMEOUT)
62
- i = 0
63
- while endpoint.status in ["pending", "initializing"]:# aka, not in ["failed", "running"]
64
- if i >= 20:
65
- logger.info("Model failed to respond. Exiting.")
66
- sys.exit()
67
- logger.debug("Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
68
- sleep(TIMEOUT)
69
- endpoint.fetch()
70
- logger.debug("Endpoint status: %s." % (endpoint.status))
71
- i += 1
72
- logger.info("Endpoint created:")
73
- logger.info(endpoint)
74
- generation_url = endpoint.url
75
- return generation_url
76
 
77
 
78
  if __name__ == '__main__':
 
5
  from huggingface_hub import create_inference_endpoint, get_inference_endpoint
6
  from src.backend.run_toxicity_eval import get_generation
7
  from src.logging import setup_logger
8
+ import requests
9
  logging.basicConfig(level=logging.DEBUG)
10
  logger = setup_logger(__name__)
11
  TIMEOUT=20
 
18
  endpoint = create_inference_endpoint(endpoint_name, repository=repository, framework=framework, task=task, accelerator=accelerator, vendor=vendor, region=region, type=type, instance_size=instance_size, instance_type=instance_type
19
  )
20
  except huggingface_hub.utils._errors.HfHubHTTPError as e:
21
+ endpoint = update_endpoint_exception(accelerator, e, endpoint,
22
+ endpoint_name, framework,
23
+ instance_size, instance_type,
24
+ repository, task)
25
+ except requests.exceptions.HTTPError as e:
26
+ endpoint = update_endpoint_exception(accelerator, e, endpoint,
27
+ endpoint_name, framework,
28
+ instance_size, instance_type,
29
+ repository, task)
30
+ except Exception as e:
31
+ logger.debug("Hit error")
32
  logger.debug(e)
33
+ sys.exit()
34
+
35
+ endpoint.fetch()
36
+ logger.info("Endpoint status: %s." % (endpoint.status))
37
+ if endpoint.status == "scaledToZero":
38
+ # Send a request to wake it up.
39
+ get_generation(endpoint.url, "Wake up")
40
+ sleep(TIMEOUT)
41
+ i = 0
42
+ while endpoint.status in ["pending", "initializing"]:# aka, not in ["failed", "running"]
43
+ if i >= 20:
44
+ logger.info("Model failed to respond. Exiting.")
45
+ sys.exit()
46
+ logger.debug("Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
47
+ sleep(TIMEOUT)
48
+ endpoint.fetch()
49
+ logger.debug("Endpoint status: %s." % (endpoint.status))
50
+ i += 1
51
+ logger.info("Endpoint created:")
52
+ logger.info(endpoint)
53
+ generation_url = endpoint.url
54
+ return generation_url
55
+
56
+
57
+ def update_endpoint_exception(accelerator, e, endpoint, endpoint_name,
58
+ framework, instance_size, instance_type,
59
+ repository, task):
60
+ logger.debug("Hit the following exception:")
61
+ logger.debug(e)
62
+ logger.debug("Attempting to continue.")
63
+ try:
64
  endpoint = get_inference_endpoint(endpoint_name)
65
+ endpoint.update(repository=repository, framework=framework, task=task,
66
+ accelerator=accelerator, instance_size=instance_size,
67
+ instance_type=instance_type)
68
  except huggingface_hub.utils._errors.BadRequestError as e:
69
  logger.debug("Hit the following exception:")
70
  logger.debug(e)
 
91
  else:
92
  logger.info("Getting expensive to try to run this model without human oversight. Exiting.")
93
  sys.exit()
94
+ return endpoint
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
 
97
  if __name__ == '__main__':