Molbap HF Staff committed on
Commit
a12858e
·
1 Parent(s): 6d106b8
Files changed (2) hide show
  1. app.py +4 -8
  2. build_cache.py +24 -71
app.py CHANGED
@@ -28,21 +28,16 @@ def _escape_srcdoc(text: str) -> str:
28
  )
29
 
30
  def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
31
- """Fetch cached data from Molbap/hf_cached_embeds_log repo."""
32
-
33
  repo_id = "Molbap/hf_cached_embeds_log"
34
  try:
35
- latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json")
36
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
37
  sha = info.get("sha")
38
  key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
39
-
40
- html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html")
41
- json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json")
42
-
43
  raw_html = Path(html_fp).read_text(encoding="utf-8")
44
  json_text = Path(json_fp).read_text(encoding="utf-8")
45
-
46
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
47
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
48
  tmp.write_text(json_text, encoding="utf-8")
@@ -50,6 +45,7 @@ def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimo
50
  except Exception:
51
  return None
52
 
 
53
  HF_MAIN_REPO = "https://github.com/huggingface/transformers"
54
 
55
  # ───────────────────────────── cache repo once per 24 h ───────────────────────────
 
28
  )
29
 
30
  def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
 
 
31
  repo_id = "Molbap/hf_cached_embeds_log"
32
  try:
33
+ latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
34
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
35
  sha = info.get("sha")
36
  key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
37
+ html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
38
+ json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
 
 
39
  raw_html = Path(html_fp).read_text(encoding="utf-8")
40
  json_text = Path(json_fp).read_text(encoding="utf-8")
 
41
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
42
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
43
  tmp.write_text(json_text, encoding="utf-8")
 
45
  except Exception:
46
  return None
47
 
48
+
49
  HF_MAIN_REPO = "https://github.com/huggingface/transformers"
50
 
51
  # ───────────────────────────── cache repo once per 24 h ───────────────────────────
build_cache.py CHANGED
@@ -1,107 +1,60 @@
1
- import os
2
- import json
3
- import subprocess
4
- import tempfile
5
  from pathlib import Path
6
  from datetime import datetime, timezone
7
  from huggingface_hub import HfApi
8
 
9
  from modular_graph_and_candidates import (
10
- build_graph_json,
11
- generate_html,
12
- build_timeline_json,
13
- generate_timeline_html
14
  )
15
 
16
- REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
17
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
18
- THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
19
- MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
20
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
21
 
22
  def main():
23
- print(f"Building cache for {REPO_URL}")
24
- print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")
25
-
26
  tmp = Path(tempfile.mkdtemp())
27
- print(f"Working in {tmp}")
28
-
29
- print("Cloning repository...")
30
- subprocess.check_call([
31
- "git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")
32
- ])
33
-
34
- sha = subprocess.check_output([
35
- "git", "rev-parse", "HEAD"
36
- ], cwd=tmp / "repo", text=True).strip()
37
-
38
- print(f"Repository SHA: {sha}")
39
-
40
  repo_path = tmp / "repo"
41
 
42
- print("Building graph...")
43
- graph = build_graph_json(
44
- transformers_dir=repo_path,
45
- threshold=THRESH,
46
- multimodal=MULTIMODAL,
47
- sim_method=SIM_METHOD,
48
- )
49
-
50
- print("Building timeline...")
51
- timeline = build_timeline_json(
52
- transformers_dir=repo_path,
53
- threshold=THRESH,
54
- multimodal=MULTIMODAL,
55
- sim_method=SIM_METHOD,
56
- )
57
-
58
- print("Generating HTML...")
59
  graph_html = generate_html(graph)
60
  timeline_html = generate_timeline_html(timeline)
61
 
62
- print(f"Uploading to {CACHE_REPO}...")
63
-
64
  api = HfApi()
 
65
 
66
  key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
67
-
68
  latest = {
69
  "sha": sha,
70
  "updated_utc": datetime.now(timezone.utc).isoformat(),
71
- "defaults": {
72
- "sim_method": SIM_METHOD,
73
- "threshold": THRESH,
74
- "multimodal": MULTIMODAL
75
- },
76
  "paths": {
77
- "graph_json": f"graph/{key}.json",
78
- "graph_html": f"graph/{key}.html",
79
  "timeline_json": f"timeline/{key}.json",
80
  "timeline_html": f"timeline/{key}.html",
81
  },
82
  }
83
 
84
- files_to_upload = [
85
- (f"graph/{key}.json", json.dumps(graph, separators=(',', ':'))),
86
- (f"graph/{key}.html", graph_html),
87
- (f"timeline/{key}.json", json.dumps(timeline, separators=(',', ':'))),
88
- (f"timeline/{key}.html", timeline_html),
89
- ("latest.json", json.dumps(latest, separators=(',', ':'))),
90
- ]
91
-
92
- for path_in_repo, content in files_to_upload:
93
- temp_file = tmp / "upload_temp"
94
- temp_file.write_text(content, encoding="utf-8")
95
-
96
  api.upload_file(
97
- path_or_fileobj=str(temp_file),
98
  path_in_repo=path_in_repo,
99
  repo_id=CACHE_REPO,
100
- commit_message=f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
 
101
  )
102
- print(f"Uploaded {path_in_repo}")
103
 
104
- print(f"Successfully uploaded cache for {key}")
 
 
 
 
105
 
106
  if __name__ == "__main__":
107
- main()
 
1
+ import os, json, subprocess, tempfile, io
 
 
 
2
  from pathlib import Path
3
  from datetime import datetime, timezone
4
  from huggingface_hub import HfApi
5
 
6
  from modular_graph_and_candidates import (
7
+ build_graph_json, generate_html,
8
+ build_timeline_json, generate_timeline_html
 
 
9
  )
10
 
11
+ REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
12
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
13
+ THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
14
+ MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
15
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
16
 
17
  def main():
 
 
 
18
  tmp = Path(tempfile.mkdtemp())
19
+ subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")])
20
+ sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
 
 
 
 
 
 
 
 
 
 
 
21
  repo_path = tmp / "repo"
22
 
23
+ graph = build_graph_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
24
+ timeline = build_timeline_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  graph_html = generate_html(graph)
26
  timeline_html = generate_timeline_html(timeline)
27
 
 
 
28
  api = HfApi()
29
+ api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
30
 
31
  key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
 
32
  latest = {
33
  "sha": sha,
34
  "updated_utc": datetime.now(timezone.utc).isoformat(),
35
+ "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
 
 
 
 
36
  "paths": {
37
+ "graph_json": f"graph/{key}.json",
38
+ "graph_html": f"graph/{key}.html",
39
  "timeline_json": f"timeline/{key}.json",
40
  "timeline_html": f"timeline/{key}.html",
41
  },
42
  }
43
 
44
+ def put(path_in_repo: str, text: str):
 
 
 
 
 
 
 
 
 
 
 
45
  api.upload_file(
46
+ path_or_fileobj=io.BytesIO(text.encode("utf-8")),
47
  path_in_repo=path_in_repo,
48
  repo_id=CACHE_REPO,
49
+ repo_type="dataset",
50
+ commit_message=f"cache {path_in_repo}",
51
  )
 
52
 
53
+ put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
54
+ put(f"graph/{key}.html", graph_html)
55
+ put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
56
+ put(f"timeline/{key}.html", timeline_html)
57
+ put("latest.json", json.dumps(latest, separators=(",", ":")))
58
 
59
  if __name__ == "__main__":
60
+ main()