yinuozhang committed on
Commit
6a51705
·
verified ·
1 Parent(s): 660dc20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -29
app.py CHANGED
@@ -2,22 +2,25 @@
2
  import os, shutil, subprocess
3
  from huggingface_hub import scan_cache_dir, snapshot_download
4
 
5
- # 1) Put ALL caches in /data so they’re manageable & persistent
6
- os.makedirs("/data/.cache", exist_ok=True)
 
 
7
  os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
8
  os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
9
  os.environ.setdefault("HF_HUB_CACHE", "/data/.cache/huggingface/hub")
10
- os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/huggingface/transformers")
11
- os.environ.setdefault("DATASETS_CACHE", "/data/.cache/huggingface/datasets")
12
 
13
- # 2) Prune old HF cache revisions (keeps current blobs, deletes stale revs)
14
  try:
15
  cache = scan_cache_dir(os.environ["HF_HUB_CACHE"])
16
- cache.delete_revisions([rev for rev in cache.revisions])
 
17
  except Exception as e:
18
  print(f"[cache prune] skipped: {e}")
19
 
20
- # (Optional) light guard: trim pip wheel cache
21
  try:
22
  subprocess.run(["pip", "cache", "purge"], check=False)
23
  except Exception:
@@ -29,48 +32,54 @@ import sys
29
  import pandas as pd
30
  from transformers import AutoTokenizer, AutoModel, AutoConfig
31
 
32
- # If you want fully reproducible rebuilds, set these in Space → Settings → Variables
33
- # (or leave blank to use latest)
34
  MODEL_ID = "ChatterjeeLab/MetaLATTE"
35
  TOKENIZER_ID = "facebook/esm2_t33_650M_UR50D"
36
  MODEL_REV = os.getenv("MODEL_REV", "") # e.g. "a1b2c3d"
37
  TOKENIZER_REV = os.getenv("TOKENIZER_REV", "") # e.g. "9f8e7d6"
38
 
39
- # Prefer downloading *exactly* what you need to /data and load locally.
40
- # This avoids multiple revision copies over time.
41
- def maybe_snapshot(repo_id, revision, allow_patterns):
42
- kw = dict(repo_id=repo_id, local_dir=None, ignore_regex=None)
43
- if revision:
44
- kw["revision"] = revision
45
- # Download to HF cache in /data; return the resolved local dir
46
- return snapshot_download(allow_patterns=allow_patterns, **kw)
47
-
48
- # Download tokenizer files only (small)
49
- esm_local = maybe_snapshot(
50
- TOKENIZER_ID, TOKENIZER_REV,
 
 
 
 
 
 
51
  allow_patterns=[
52
  "tokenizer.json","tokenizer_config.json","vocab.*","merges.*",
53
  "special_tokens_map.json","*.model","tokenizer*.txt","spiece.*","*.tiktoken"
54
- ]
55
  )
56
 
57
- # Download MetaLATTE (weights + config only)
58
- metalatte_local = maybe_snapshot(
59
- MODEL_ID, MODEL_REV,
60
- allow_patterns=["*.json","*.safetensors","*.bin","*.model","*.txt"] # keep it tight
 
 
61
  )
62
 
63
- # Add the current directory to the system path for your custom code
64
  metalatte_path = '.'
65
  sys.path.insert(0, metalatte_path)
66
 
67
- # Import the custom configuration and model
68
  from configuration import MetaLATTEConfig
69
  from modeling_metalatte import MultitaskProteinModel
70
  AutoConfig.register("metalatte", MetaLATTEConfig)
71
  AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
72
 
73
- # Load from the local snapshot dirs (avoids re-downloading on rebuilds)
74
  tokenizer = AutoTokenizer.from_pretrained(esm_local, local_files_only=True)
75
  config = AutoConfig.from_pretrained(metalatte_local, local_files_only=True)
76
  model = AutoModel.from_pretrained(metalatte_local, config=config, local_files_only=True)
 
2
  import os, shutil, subprocess
3
  from huggingface_hub import scan_cache_dir, snapshot_download
4
 
5
+ # Put caches in /data and make sure dirs exist
6
+ os.makedirs("/data/.cache/huggingface/hub", exist_ok=True)
7
+ os.makedirs("/data/snapshots", exist_ok=True)
8
+
9
  os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
10
  os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
11
  os.environ.setdefault("HF_HUB_CACHE", "/data/.cache/huggingface/hub")
12
+ # Avoid TRANSFORMERS_CACHE deprecation; HF_HOME is enough.
13
+ # os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/huggingface/transformers")
14
 
15
+ # Try to delete every cached HF revision (not only stale ones); on any error the prune is skipped and the reason is printed
16
  try:
17
  cache = scan_cache_dir(os.environ["HF_HUB_CACHE"])
18
+ if cache.revisions:
19
+ cache.delete_revisions([rev for rev in cache.revisions])
20
  except Exception as e:
21
  print(f"[cache prune] skipped: {e}")
22
 
23
+ # Light pip cache cleanup
24
  try:
25
  subprocess.run(["pip", "cache", "purge"], check=False)
26
  except Exception:
 
32
  import pandas as pd
33
  from transformers import AutoTokenizer, AutoModel, AutoConfig
34
 
35
+ # Optional: pin exact commits by setting MODEL_REV / TOKENIZER_REV as Space Variables (leave unset to use the latest)
 
36
  MODEL_ID = "ChatterjeeLab/MetaLATTE"
37
  TOKENIZER_ID = "facebook/esm2_t33_650M_UR50D"
38
  MODEL_REV = os.getenv("MODEL_REV", "") # e.g. "a1b2c3d"
39
  TOKENIZER_REV = os.getenv("TOKENIZER_REV", "") # e.g. "9f8e7d6"
40
 
41
+ def snapshot_to(local_name, repo_id, revision, allow_patterns):
42
+ """Download only the files matching allow_patterns into /data/snapshots/<local_name> and return the path reported by snapshot_download."""
43
+ local_dir = f"/data/snapshots/{local_name}"
44
+ os.makedirs(local_dir, exist_ok=True)
45
+ # IMPORTANT: do not pass ignore_regex (the previous version did); use ignore_patterns if files must be excluded
46
+ return snapshot_download(
47
+ repo_id=repo_id,
48
+ revision=revision if revision else None,
49
+ allow_patterns=allow_patterns,
50
+ local_dir=local_dir,
51
+ local_dir_use_symlinks=False, # copy files into local_dir; easier to manage size
52
+ )
53
+
54
+ # Tokenizer (small set of files)
55
+ esm_local = snapshot_to(
56
+ "esm2_tokenizer",
57
+ TOKENIZER_ID,
58
+ TOKENIZER_REV,
59
  allow_patterns=[
60
  "tokenizer.json","tokenizer_config.json","vocab.*","merges.*",
61
  "special_tokens_map.json","*.model","tokenizer*.txt","spiece.*","*.tiktoken"
62
+ ],
63
  )
64
 
65
+ # MetaLATTE model (weights + config only)
66
+ metalatte_local = snapshot_to(
67
+ "metalatte_model",
68
+ MODEL_ID,
69
+ MODEL_REV,
70
+ allow_patterns=["*.json","*.safetensors","*.bin","*.model","*.txt"],
71
  )
72
 
73
+ # Your local package
74
  metalatte_path = '.'
75
  sys.path.insert(0, metalatte_path)
76
 
 
77
  from configuration import MetaLATTEConfig
78
  from modeling_metalatte import MultitaskProteinModel
79
  AutoConfig.register("metalatte", MetaLATTEConfig)
80
  AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
81
 
82
+ # Load from the downloaded dirs (no network, no extra cache growth)
83
  tokenizer = AutoTokenizer.from_pretrained(esm_local, local_files_only=True)
84
  config = AutoConfig.from_pretrained(metalatte_local, local_files_only=True)
85
  model = AutoModel.from_pretrained(metalatte_local, config=config, local_files_only=True)