alozowski committed
Commit 87e47c2
1 Parent(s): 9b133aa

Updated app.py download_dataset function

Files changed (3)
  1. app.py +15 -11
  2. src/populate.py +0 -1
  3. src/tools/collections.py +1 -1
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import time
 import logging
 import gradio as gr
 import pandas as pd
@@ -56,13 +57,12 @@ enable_space_ci()
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
-
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
-    """Attempt to download dataset with retries."""
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
     attempt = 0
     while attempt < max_attempts:
         try:
-            print(f"Downloading {repo_id} to {local_dir}")
+            logging.info(f"Downloading {repo_id} to {local_dir}")
             snapshot_download(
                 repo_id=repo_id,
                 local_dir=local_dir,
@@ -71,21 +71,25 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
                 etag_timeout=30,
                 max_workers=8,
             )
+            logging.info("Download successful")
             return
         except Exception as e:
-            logging.error(f"Error downloading {repo_id}: {e}")
+            wait_time = backoff_factor ** attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
             attempt += 1
-            if attempt == max_attempts:
-                restart_space()
-
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
     if full_init:
         # These downloads only occur on full initialization
-        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
+        try:
+            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+            download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
+        except Exception:
+            restart_space()
 
     # Always retrieve the leaderboard DataFrame
     raw_data, original_df = get_leaderboard_df(
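For reference, below is a minimal, self-contained sketch of the retry pattern the updated download_dataset follows. It is not part of the commit: retry_with_backoff and the zero-argument callable fn are hypothetical stand-ins for the function body and the snapshot_download call. With the defaults (max_attempts=3, backoff_factor=1.5), failed attempts wait 1.0s, 1.5s, and 2.25s before the final exception propagates, which is what lets init_space catch it and call restart_space().

import logging
import time


def retry_with_backoff(fn, max_attempts=3, backoff_factor=1.5):
    """Retry fn() with exponential backoff; raise once all attempts fail."""
    attempt = 0
    while attempt < max_attempts:
        try:
            return fn()  # stands in for the snapshot_download(...) call above
        except Exception as e:
            wait_time = backoff_factor ** attempt  # 1.0s, 1.5s, 2.25s with the defaults
            logging.error(f"Attempt {attempt + 1} failed: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
            attempt += 1
    # Surface the failure so the caller (init_space in app.py) can restart the Space.
    raise Exception(f"Failed after {max_attempts} attempts")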
src/populate.py CHANGED
@@ -52,4 +52,3 @@ def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
-
src/tools/collections.py CHANGED
@@ -73,4 +73,4 @@ def update_collections(df: DataFrame):
         try:
             delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
         except HfHubHTTPError:
-            continue
+            continue