zhimin-z committed on
Commit
9ae9289
·
1 Parent(s): ce72984
Files changed (2) hide show
  1. app.py +34 -97
  2. msr.py +6 -6
app.py CHANGED
@@ -24,12 +24,10 @@ load_dotenv()
24
  # CONFIGURATION
25
  # =============================================================================
26
 
27
- AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for assistant metadata
28
- AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_metadata") # Local git clone path
29
  LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
30
  LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
31
  LONGSTANDING_GAP_DAYS = 30 # Minimum days for an issue to be considered long-standing
32
- GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
33
  MAX_RETRIES = 5
34
 
35
  LEADERBOARD_COLUMNS = [
@@ -104,113 +102,52 @@ def validate_github_username(identifier):
104
  # HUGGINGFACE DATASET OPERATIONS
105
  # =============================================================================
106
 
107
def sync_agents_repo():
    """
    Bring the local bot_metadata clone up to date by running `git pull` in it.

    The clone at AGENTS_REPO_LOCAL_PATH must already exist and contain a
    `.git` entry; this function never clones.

    Returns:
        True when `git pull` exits with status 0.

    Raises:
        FileNotFoundError: AGENTS_REPO_LOCAL_PATH does not exist.
        ValueError: the path exists but has no `.git` entry.
        TimeoutError: `git pull` ran longer than GIT_SYNC_TIMEOUT seconds.
        RuntimeError: `git pull` exited non-zero, or an unexpected error
            occurred while running it.
    """
    repo_dir = AGENTS_REPO_LOCAL_PATH

    if not os.path.exists(repo_dir):
        problem = f"Local repository not found at {repo_dir}"
        print(f" Error {problem}")
        print(f" Please clone it first: git clone https://huggingface.co/datasets/{AGENTS_REPO}")
        raise FileNotFoundError(problem)

    git_dir = os.path.join(repo_dir, '.git')
    if not os.path.exists(git_dir):
        problem = f"{repo_dir} exists but is not a git repository"
        print(f" Error {problem}")
        raise ValueError(problem)

    try:
        # The timeout is generous because the repository is large.
        pull = subprocess.run(
            ['git', 'pull'],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=GIT_SYNC_TIMEOUT
        )

        if pull.returncode != 0:
            problem = f"Git pull failed: {pull.stderr.strip()}"
            print(f" Error {problem}")
            raise RuntimeError(problem)

        summary = pull.stdout.strip()
        # Both spellings are checked, as git's wording has varied.
        already_current = any(
            marker in summary
            for marker in ("Already up to date", "Already up-to-date")
        )
        if already_current:
            print(f" Success Repository is up to date")
        else:
            print(f" Success Repository synced successfully")
            if summary:
                # Echo only the first few lines of git's output.
                for text in summary.split('\n')[:5]:
                    print(f" {text}")
        return True

    except subprocess.TimeoutExpired:
        problem = f"Git pull timed out after {GIT_SYNC_TIMEOUT} seconds"
        print(f" Error {problem}")
        raise TimeoutError(problem)
    except (FileNotFoundError, ValueError, RuntimeError, TimeoutError):
        # Already the exception types callers expect; pass them through.
        raise
    except Exception as exc:
        problem = f"Error syncing repository: {str(exc)}"
        print(f" Error {problem}")
        raise RuntimeError(problem) from exc
161
-
162
-
163
  def load_agents_from_hf():
164
- """
165
- Load all assistant metadata JSON files from local git repository.
166
- ALWAYS syncs with remote first to ensure we have the latest bot data.
167
- """
168
- # MANDATORY: Sync with remote first to get latest bot data
169
- print(f" Syncing bot_metadata repository to get latest assistants...")
170
- sync_agents_repo() # Will raise exception if sync fails
171
-
172
- assistants = []
173
-
174
- # Scan local directory for JSON files
175
- if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
176
- raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
177
-
178
- # Walk through the directory to find all JSON files
179
- files_processed = 0
180
- print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")
181
-
182
- for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
183
- # Skip .git directory
184
- if '.git' in root:
185
- continue
186
 
187
- for filename in files:
188
- if not filename.endswith('.json'):
189
- continue
190
 
191
- files_processed += 1
192
- file_path = os.path.join(root, filename)
193
 
 
 
194
  try:
195
- with open(file_path, 'r', encoding='utf-8') as f:
 
 
 
 
 
 
196
  agent_data = json.load(f)
197
 
198
- # Only include active assistants
199
- if agent_data.get('status') != 'active':
200
- continue
 
 
 
201
 
202
- # Extract github_identifier from filename
203
- github_identifier = filename.replace('.json', '')
204
- agent_data['github_identifier'] = github_identifier
205
 
206
- assistants.append(agent_data)
207
 
208
  except Exception as e:
209
- print(f" Warning Error loading {filename}: {str(e)}")
210
  continue
211
 
212
- print(f" Success Loaded {len(assistants)} active assistants (from {files_processed} total files)")
213
- return assistants
 
 
 
 
214
 
215
 
216
  def get_hf_token():
 
24
  # CONFIGURATION
25
  # =============================================================================
26
 
27
+ AGENTS_REPO = "SWE-Arena/bot_data" # HuggingFace dataset for assistant metadata
 
28
  LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
29
  LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
30
  LONGSTANDING_GAP_DAYS = 30 # Minimum days for an issue to be considered long-standing
 
31
  MAX_RETRIES = 5
32
 
33
  LEADERBOARD_COLUMNS = [
 
102
  # HUGGINGFACE DATASET OPERATIONS
103
  # =============================================================================
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def load_agents_from_hf():
106
+ """Load all assistant metadata JSON files from HuggingFace dataset."""
107
+ try:
108
+ api = HfApi()
109
+ assistants = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ # List all files in the repository
112
+ files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
 
113
 
114
+ # Filter for JSON files only
115
+ json_files = [f for f in files if f.endswith('.json')]
116
 
117
+ # Download and parse each JSON file
118
+ for json_file in json_files:
119
  try:
120
+ file_path = hf_hub_download_with_backoff(
121
+ repo_id=AGENTS_REPO,
122
+ filename=json_file,
123
+ repo_type="dataset"
124
+ )
125
+
126
+ with open(file_path, 'r') as f:
127
  agent_data = json.load(f)
128
 
129
+ # Only process assistants with status == "active"
130
+ if agent_data.get('status') != 'active':
131
+ continue
132
+
133
+ # Extract github_identifier from filename (e.g., "assistant[bot].json" -> "assistant[bot]")
134
+ filename_identifier = json_file.replace('.json', '')
135
 
136
+ # Add or override github_identifier to match filename
137
+ agent_data['github_identifier'] = filename_identifier
 
138
 
139
+ assistants.append(agent_data)
140
 
141
  except Exception as e:
142
+ print(f"Warning: Could not load {json_file}: {str(e)}")
143
  continue
144
 
145
+ print(f"Loaded {len(assistants)} assistants from HuggingFace")
146
+ return assistants
147
+
148
+ except Exception as e:
149
+ print(f"Could not load assistants from HuggingFace: {str(e)}")
150
+ return None
151
 
152
 
153
  def get_hf_token():
msr.py CHANGED
@@ -29,8 +29,8 @@ load_dotenv()
29
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
30
  BASE_DIR = os.path.dirname(SCRIPT_DIR) # Parent directory
31
 
32
- AGENTS_REPO = "SWE-Arena/bot_metadata"
33
- AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_metadata") # Local git clone path
34
  DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
35
  GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
36
  LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
@@ -881,7 +881,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
881
 
882
  def sync_agents_repo():
883
  """
884
- Sync local bot_metadata repository with remote using git pull.
885
  This is MANDATORY to ensure we have the latest bot data.
886
  Raises exception if sync fails.
887
  """
@@ -941,7 +941,7 @@ def load_agents_from_hf():
941
  ALWAYS syncs with remote first to ensure we have the latest bot data.
942
  """
943
  # MANDATORY: Sync with remote first to get latest bot data
944
- print(f" Syncing bot_metadata repository to get latest assistants...")
945
  sync_agents_repo() # Will raise exception if sync fails
946
 
947
  assistants = []
@@ -1180,8 +1180,8 @@ def construct_leaderboard_from_metadata(all_metadata_dict, assistants, wanted_re
1180
  identifier = assistant.get('github_identifier')
1181
  agent_name = assistant.get('name', 'Unknown')
1182
 
1183
- bot_metadata = all_metadata_dict.get(identifier, [])
1184
- stats = calculate_issue_stats_from_metadata(bot_metadata)
1185
 
1186
  # Add wanted issues count
1187
  resolved_wanted = len(wanted_resolved_dict.get(identifier, []))
 
29
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
30
  BASE_DIR = os.path.dirname(SCRIPT_DIR) # Parent directory
31
 
32
+ AGENTS_REPO = "SWE-Arena/bot_data"
33
+ AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_data") # Local git clone path
34
  DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
35
  GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
36
  LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
 
881
 
882
  def sync_agents_repo():
883
  """
884
+ Sync local bot_data repository with remote using git pull.
885
  This is MANDATORY to ensure we have the latest bot data.
886
  Raises exception if sync fails.
887
  """
 
941
  ALWAYS syncs with remote first to ensure we have the latest bot data.
942
  """
943
  # MANDATORY: Sync with remote first to get latest bot data
944
+ print(f" Syncing bot_data repository to get latest assistants...")
945
  sync_agents_repo() # Will raise exception if sync fails
946
 
947
  assistants = []
 
1180
  identifier = assistant.get('github_identifier')
1181
  agent_name = assistant.get('name', 'Unknown')
1182
 
1183
+ bot_data = all_metadata_dict.get(identifier, [])
1184
+ stats = calculate_issue_stats_from_metadata(bot_data)
1185
 
1186
  # Add wanted issues count
1187
  resolved_wanted = len(wanted_resolved_dict.get(identifier, []))