zhiminy committed
Commit e9034e9 · 1 Parent(s): adbc63a
Files changed (2)
  1. app.py +70 -10
  2. msr.py +64 -4
app.py CHANGED
@@ -8,6 +8,7 @@ import requests
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
 from dotenv import load_dotenv
 import pandas as pd
 import random
@@ -16,6 +17,7 @@ from plotly.subplots import make_subplots
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
 from google.cloud import bigquery
+import backoff
 
 # Load environment variables
 load_dotenv()
@@ -38,6 +40,62 @@ LEADERBOARD_COLUMNS = [
     ("Resolved Rate (%)", "number"),
 ]
 
+# =============================================================================
+# HUGGINGFACE API WRAPPERS WITH BACKOFF
+# =============================================================================
+
+def is_rate_limit_error(e):
+    """Check if the exception is a rate limit error (429)."""
+    return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def upload_large_folder_with_backoff(api, **kwargs):
+    """Upload large folder with exponential backoff on rate limit errors."""
+    return api.upload_large_folder(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def list_repo_files_with_backoff(api, **kwargs):
+    """List repo files with exponential backoff on rate limit errors."""
+    return api.list_repo_files(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def hf_hub_download_with_backoff(**kwargs):
+    """Download from HF Hub with exponential backoff on rate limit errors."""
+    return hf_hub_download(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def upload_file_with_backoff(api, **kwargs):
+    """Upload file with exponential backoff on rate limit errors."""
+    return api.upload_file(**kwargs)
+
 # =============================================================================
 # JSONL FILE OPERATIONS
 # =============================================================================
@@ -727,7 +785,8 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     # Upload entire folder using upload_large_folder (optimized for large files)
     # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
     print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    api.upload_large_folder(
+    upload_large_folder_with_backoff(
+        api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
         repo_type="dataset"
@@ -764,7 +823,7 @@ def load_issue_metadata():
     token = get_hf_token()
 
     # List all files in the repository
-    files = api.list_repo_files(repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
 
     # Filter for files within the time frame: [agent_identifier]/YYYY.MM.DD.jsonl
     # Parse date from filename and only include files within LEADERBOARD_TIME_FRAME_DAYS
@@ -803,7 +862,7 @@ def load_issue_metadata():
 
         agent_identifier = parts[0]
 
-        file_path = hf_hub_download(
+        file_path = hf_hub_download_with_backoff(
             repo_id=ISSUE_METADATA_REPO,
             filename=filename,
             repo_type="dataset",
@@ -856,7 +915,7 @@ def get_latest_issue_date_for_agent(agent_identifier):
     token = get_hf_token()
 
     # List all files in the repository
-    files = api.list_repo_files(repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
 
     # Filter for files in this agent's folder
     # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
@@ -870,7 +929,7 @@ def get_latest_issue_date_for_agent(agent_identifier):
     latest_date = None
     for filename in agent_files:
         try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=ISSUE_METADATA_REPO,
                 filename=filename,
                 repo_type="dataset",
@@ -915,7 +974,7 @@ def get_daily_files_last_time_frame(agent_identifier):
     cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
     # List all files in the repository
-    files = api.list_repo_files(repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
 
     # Filter for files in this agent's folder
     agent_pattern = f"{agent_identifier}/"
@@ -962,7 +1021,7 @@ def load_agents_from_hf():
     agents = []
 
     # List all files in the repository
-    files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
 
     # Filter for JSON files only
     json_files = [f for f in files if f.endswith('.json')]
@@ -970,7 +1029,7 @@ def load_agents_from_hf():
     # Download and parse each JSON file
     for json_file in json_files:
         try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=AGENTS_REPO,
                 filename=json_file,
                 repo_type="dataset"
@@ -1034,7 +1093,7 @@ def load_cached_leaderboard_and_metrics():
     print("📥 Loading cached leaderboard and metrics from HuggingFace...")
 
     # Download cached file
-    cached_path = hf_hub_download(
+    cached_path = hf_hub_download_with_backoff(
         repo_id=LEADERBOARD_REPO,
         filename="swe-issue.json",
         repo_type="dataset",
@@ -1194,7 +1253,8 @@ def save_leaderboard_and_metrics_to_hf():
 
     # Upload to HuggingFace (will overwrite if exists)
    print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
-    api.upload_file(
+    upload_file_with_backoff(
+        api,
         path_or_fileobj=file_like_object,
         path_in_repo="swe-issue.json",
         repo_id=LEADERBOARD_REPO,
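
The wrappers added above are thin pass-throughs: each delegates to the underlying huggingface_hub call and lets backoff retry it only when the Hub answers HTTP 429. A minimal usage sketch (not part of the commit; it assumes it runs inside app.py where the wrappers, ISSUE_METADATA_REPO, and a valid HF token are already in scope):

# Hedged sketch: call the new wrappers like the originals, passing the HfApi
# client as the first positional argument where the original was a method.
from huggingface_hub import HfApi

api = HfApi()  # assumes HF_TOKEN (or a cached login) is available

# Retries up to 8 times with exponential backoff + full jitter on HTTP 429;
# any other HfHubHTTPError is raised immediately via the giveup predicate.
files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")

for filename in [f for f in files if f.endswith(".jsonl")][:3]:
    path = hf_hub_download_with_backoff(
        repo_id=ISSUE_METADATA_REPO,
        filename=filename,
        repo_type="dataset",
    )
    print(path)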
msr.py CHANGED
@@ -9,8 +9,10 @@ import tempfile
 from datetime import datetime, timezone, timedelta
 from collections import defaultdict
 from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
 from dotenv import load_dotenv
 from google.cloud import bigquery
+import backoff
 
 # Load environment variables
 load_dotenv()
@@ -24,6 +26,62 @@ ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
 LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
 LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
 
+# =============================================================================
+# HUGGINGFACE API WRAPPERS WITH BACKOFF
+# =============================================================================
+
+def is_rate_limit_error(e):
+    """Check if the exception is a rate limit error (429)."""
+    return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def upload_large_folder_with_backoff(api, **kwargs):
+    """Upload large folder with exponential backoff on rate limit errors."""
+    return api.upload_large_folder(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def list_repo_files_with_backoff(api, **kwargs):
+    """List repo files with exponential backoff on rate limit errors."""
+    return api.list_repo_files(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def hf_hub_download_with_backoff(**kwargs):
+    """Download from HF Hub with exponential backoff on rate limit errors."""
+    return hf_hub_download(**kwargs)
+
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']:.1f}s (attempt {details['tries']}/{8})...")
+)
+def upload_file_with_backoff(api, **kwargs):
+    """Upload file with exponential backoff on rate limit errors."""
+    return api.upload_file(**kwargs)
+
 # =============================================================================
 # UTILITY FUNCTIONS
 # =============================================================================
@@ -466,7 +524,8 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     # Upload entire folder using upload_large_folder (optimized for large files)
     # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
     print(f" 🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    api.upload_large_folder(
+    upload_large_folder_with_backoff(
+        api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
         repo_type="dataset"
@@ -498,7 +557,7 @@ def load_agents_from_hf():
     agents = []
 
     # List all files in the repository
-    files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
+    files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
 
     # Filter for JSON files only
     json_files = [f for f in files if f.endswith('.json')]
@@ -508,7 +567,7 @@ def load_agents_from_hf():
     # Download and parse each JSON file
     for json_file in json_files:
        try:
-            file_path = hf_hub_download(
+            file_path = hf_hub_download_with_backoff(
                 repo_id=AGENTS_REPO,
                 filename=json_file,
                 repo_type="dataset"
@@ -736,7 +795,8 @@ def save_leaderboard_and_metrics_to_hf(all_metadata, agents):
 
     # Upload to HuggingFace (will overwrite if exists)
     print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
-    api.upload_file(
+    upload_file_with_backoff(
+        api,
         path_or_fileobj=file_like_object,
         path_in_repo="swe-issue.json",
         repo_id=LEADERBOARD_REPO,
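
For reference, the retry behavior the decorators provide can be exercised without touching the Hub. The sketch below is illustrative only and not part of the commit: FakeResponse stands in for the requests.Response the Hub would return, and the wait factor is shrunk so the demo finishes quickly, whereas the real wrappers use backoff.expo defaults with full jitter as shown in the diff.

# Illustrative only: show that the giveup predicate retries 429s and nothing else.
import backoff
from huggingface_hub.errors import HfHubHTTPError

class FakeResponse:
    """Stand-in for requests.Response; only status_code is inspected here."""
    def __init__(self, status_code):
        self.status_code = status_code

def is_rate_limit_error(e):
    return isinstance(e, HfHubHTTPError) and e.response.status_code == 429

calls = {"n": 0}

@backoff.on_exception(
    backoff.expo,
    HfHubHTTPError,
    giveup=lambda e: not is_rate_limit_error(e),
    max_tries=4,
    jitter=None,   # deterministic waits for the demo; the commit uses full_jitter
    factor=0.01,   # keep the demo fast; the commit uses the expo defaults
)
def flaky_list_files():
    calls["n"] += 1
    if calls["n"] < 3:
        err = HfHubHTTPError("429 Too Many Requests")  # response defaults to None
        err.response = FakeResponse(429)               # attach a fake 429 response
        raise err
    return ["agent/2025.01.01.jsonl"]

print(flaky_list_files(), "after", calls["n"], "tries")  # succeeds on the 3rd try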