Wauplin HF staff committed on
Commit
26a1157
1 Parent(s): 8be82f3

run garbage collector to delete empty models periodically

Browse files
Files changed (2) hide show
  1. app.py +20 -1
  2. clean_community_org.py +36 -0
app.py CHANGED
@@ -3,6 +3,8 @@ import pathlib
3
  import random
4
  import string
5
  import tempfile
 
 
6
  from typing import Iterable, List
7
 
8
  import gradio as gr
@@ -12,6 +14,8 @@ import yaml
12
  from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
13
  from mergekit.config import MergeConfiguration
14
 
 
 
15
  has_gpu = torch.cuda.is_available()
16
 
17
  # Running directly from Python doesn't work well with Gradio+run_process because of:
@@ -164,7 +168,7 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
164
  return
165
 
166
  # Set tmp HF_HOME to avoid filling up disk Space
167
- tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
168
  tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
169
  yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
170
 
@@ -215,4 +219,19 @@ with gr.Blocks() as demo:
215
 
216
  button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  demo.queue(default_concurrency_limit=1).launch()
 
3
  import random
4
  import string
5
  import tempfile
6
+ import time
7
+ from concurrent.futures import ThreadPoolExecutor
8
  from typing import Iterable, List
9
 
10
  import gradio as gr
 
14
  from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
15
  from mergekit.config import MergeConfiguration
16
 
17
+ from clean_community_org import garbage_collect_empty_models
18
+
19
  has_gpu = torch.cuda.is_available()
20
 
21
  # Running directly from Python doesn't work well with Gradio+run_process because of:
 
168
  return
169
 
170
  # Set tmp HF_HOME to avoid filling up disk Space
171
+ tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
172
  tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
173
  yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
174
 
 
219
 
220
  button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
221
 
222
+
223
+ # Run garbage collection every hour to keep the community org clean.
224
+ # Empty models might exist if the merge fails abruptly (e.g. if the user leaves the Space).
225
def _garbage_collect_every_hour():
    """Run the community-org garbage collection in an endless hourly loop.

    This function never returns, so it is meant to be handed to a background
    worker rather than called inline. A pass that raises is reported on
    stdout and does not prevent the next pass from running.
    """
    pause_seconds = 60 * 60  # one hour between two consecutive passes
    while True:
        try:
            garbage_collect_empty_models(token=COMMUNITY_HF_TOKEN)
        except Exception as error:
            # Best effort: report the failure and try again after the pause.
            print("Error running garbage collection", error)
        time.sleep(pause_seconds)
232
+
233
+
234
+ pool = ThreadPoolExecutor()
235
+ pool.submit(_garbage_collect_every_hour)
236
+
237
  demo.queue(default_concurrency_limit=1).launch()
clean_community_org.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Script to delete empty models from the community org.
2
+ # Can be run manually or scheduled to run periodically in the Space.
3
+ # Usage: python clean_community_org.py
4
+ #
5
+ # 1. List models from https://huggingface.co/mergekit-community
6
+ # 2. Skip models that contain files (only empty models are candidates).
7
+ # 3. Skip models that were modified less than 1 hour ago.
8
+ # 4. Delete the remaining models.
9
+ from datetime import datetime, timezone
10
+
11
+ from huggingface_hub import HfApi
12
+
13
+
14
+ def garbage_collect_empty_models(token: str | None = None):
15
+ api = HfApi(token=token)
16
+ now = datetime.now(timezone.utc)
17
+ print("Running garbage collection on mergekit-community.")
18
+ for model in api.list_models(author="mergekit-community", full=True):
19
+ if model.siblings and len(model.siblings) > 1:
20
+ # If model has files, then it's not empty
21
+ print("Skipping", model.modelId, "(not empty)")
22
+ continue
23
+ if (now - model.last_modified).total_seconds() < 3600:
24
+ # If model was updated in the last hour, then keep it
25
+ # to avoid deleting models that are being uploaded
26
+ print("Skipping", model.modelId, "(recently updated)")
27
+ continue
28
+ try:
29
+ print(f"Deleting {model.modelId}")
30
+ api.delete_repo(model.modelId, missing_ok=True)
31
+ except Exception as e:
32
+ print(f"Error deleting {model.modelId}: {e}")
33
+
34
+
35
+ if __name__ == "__main__":
36
+ garbage_collect_empty_models()