davanstrien HF staff committed on
Commit
14c6f3f
1 Parent(s): 49ef0df

cache hub checks

Browse files
Files changed (1) hide show
  1. app.py +42 -22
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import contextlib
2
- from typing import Literal, Tuple, Dict, List
3
  import httpx
4
  import nbformat
5
  from nbformat import NotebookNode, ValidationError
@@ -14,26 +14,45 @@ import re
14
  from traitlets.config import Config
15
  from huggingface_hub import model_info, dataset_info
16
  from huggingface_hub.utils import RepositoryNotFoundError
 
17
 
18
  hub_id_regex = re.compile(r"[^\w]([a-zA-Z\d-]{3,32}\/[\w\-._]{3,64})[^\w/]")
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class HubIDCell(Preprocessor):
22
  def preprocess_cell(self, cell, resources, index):
23
  if cell["cell_type"] == "code":
24
- resources.setdefault("dataset_matches", [])
25
- resources.setdefault("model_matches", [])
26
- match = re.search(hub_id_regex, cell["source"])
27
- if match:
28
  hub_id_match = match.groups(0)[0]
29
- print(hub_id_match)
30
- try:
31
- model = model_info(hub_id_match)
32
- resources["model_matches"].append(model.modelId)
33
- except RepositoryNotFoundError:
34
- with contextlib.suppress(RepositoryNotFoundError):
35
- dataset = dataset_info(hub_id_match)
36
- resources["dataset_matches"].append(dataset.id)
37
  return cell, resources
38
 
39
 
@@ -52,6 +71,7 @@ async def healthz(_):
52
  return JSONResponse({"success": True})
53
 
54
 
 
55
  def convert(
56
  s: str, theme: Literal["light", "dark"], debug_info: str
57
  ) -> Tuple[str, List[str], List[str]]:
@@ -63,7 +83,7 @@ def convert(
63
  )
64
  except nbformat.reader.NotJSONError:
65
  print(400, f"Notebook is not JSON. {debug_info}")
66
- raise HTTPException(400, f"Notebook is not JSON.")
67
  except ValidationError as e:
68
  print(
69
  400,
@@ -117,14 +137,14 @@ async def convert_from_url(req: Request):
117
  html_text, model_matches, dataset_matches = convert(
118
  r.text, theme=theme, debug_info=f"url={url}"
119
  )
120
- return HTMLResponse(content=html_text)
121
- # return JSONResponse(
122
- # content={
123
- # "html": html_text,
124
- # "model_matches": model_matches,
125
- # "dataset_matches": dataset_matches,
126
- # }
127
- # )
128
 
129
 
130
  async def convert_from_upload(req: Request):
 
1
  import contextlib
2
+ from typing import Literal, Tuple, List
3
  import httpx
4
  import nbformat
5
  from nbformat import NotebookNode, ValidationError
 
14
  from traitlets.config import Config
15
  from huggingface_hub import model_info, dataset_info
16
  from huggingface_hub.utils import RepositoryNotFoundError
17
+ from functools import lru_cache
18
 
19
  hub_id_regex = re.compile(r"[^\w]([a-zA-Z\d-]{3,32}\/[\w\-._]{3,64})[^\w/]")
20
 
21
 
22
+ @lru_cache(
23
+ maxsize=4096
24
+ ) # TODO possibly make async but might be tricky to call inside PreProcessor
25
+ def check_hub_item(hub_id_match):
26
+ with contextlib.suppress(RepositoryNotFoundError):
27
+ model_info(hub_id_match)
28
+ return hub_id_match, "model"
29
+ with contextlib.suppress(RepositoryNotFoundError):
30
+ dataset_info(hub_id_match)
31
+ return hub_id_match, "dataset"
32
+
33
+
34
+ # async def check_repo_exists(regex_hub_id_match: str) -> Optional[Tuple[str, str]]:
35
+ # r = await client.get(f"https://huggingface.co/api/models/{regex_hub_id_match}")
36
+ # if r.status_code == 200:
37
+ # return regex_hub_id_match, 'model'
38
+ # r = await client.get(f"https://huggingface.co/api/datasets/{regex_hub_id_match}")
39
+ # if r.status_code == 200:
40
+ # return regex_hub_id_match, 'dataset'
41
+
42
+
43
  class HubIDCell(Preprocessor):
44
  def preprocess_cell(self, cell, resources, index):
45
  if cell["cell_type"] == "code":
46
+ resources.setdefault("dataset_matches", set())
47
+ resources.setdefault("model_matches", set())
48
+ if match := re.search(hub_id_regex, cell["source"]):
 
49
  hub_id_match = match.groups(0)[0]
50
+ if hub_check := check_hub_item(hub_id_match):
51
+ hub_id_match, repo_item_type = hub_check
52
+ if repo_item_type == "model":
53
+ resources["model_matches"].add(hub_id_match)
54
+ if repo_item_type == "dataset":
55
+ resources["dataset_matches"].add(hub_id_match)
 
 
56
  return cell, resources
57
 
58
 
 
71
  return JSONResponse({"success": True})
72
 
73
 
74
+ @lru_cache(maxsize=2048)
75
  def convert(
76
  s: str, theme: Literal["light", "dark"], debug_info: str
77
  ) -> Tuple[str, List[str], List[str]]:
 
83
  )
84
  except nbformat.reader.NotJSONError:
85
  print(400, f"Notebook is not JSON. {debug_info}")
86
+ raise HTTPException(400, "Notebook is not JSON.")
87
  except ValidationError as e:
88
  print(
89
  400,
 
137
  html_text, model_matches, dataset_matches = convert(
138
  r.text, theme=theme, debug_info=f"url={url}"
139
  )
140
+ # return HTMLResponse(content=html_text)
141
+ return JSONResponse(
142
+ content={
143
+ "html": html_text,
144
+ "model_matches": list(model_matches),
145
+ "dataset_matches": list(dataset_matches),
146
+ }
147
+ )
148
 
149
 
150
  async def convert_from_upload(req: Request):