zhimin-z
committed on
Commit
·
9ae9289
1
Parent(s):
ce72984
refine
Browse files
app.py
CHANGED
|
@@ -24,12 +24,10 @@ load_dotenv()
|
|
| 24 |
# CONFIGURATION
|
| 25 |
# =============================================================================
|
| 26 |
|
| 27 |
-
AGENTS_REPO = "SWE-Arena/bot_metadata"
|
| 28 |
-
AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_metadata") # Local git clone path
|
| 29 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
| 30 |
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
|
| 31 |
LONGSTANDING_GAP_DAYS = 30 # Minimum days for an issue to be considered long-standing
|
| 32 |
-
GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
|
| 33 |
MAX_RETRIES = 5
|
| 34 |
|
| 35 |
LEADERBOARD_COLUMNS = [
|
|
@@ -104,113 +102,52 @@ def validate_github_username(identifier):
|
|
| 104 |
# HUGGINGFACE DATASET OPERATIONS
|
| 105 |
# =============================================================================
|
| 106 |
|
| 107 |
-
def sync_agents_repo():
|
| 108 |
-
"""
|
| 109 |
-
Sync local bot_metadata repository with remote using git pull.
|
| 110 |
-
This is MANDATORY to ensure we have the latest bot data.
|
| 111 |
-
Raises exception if sync fails.
|
| 112 |
-
"""
|
| 113 |
-
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
|
| 114 |
-
error_msg = f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}"
|
| 115 |
-
print(f" Error {error_msg}")
|
| 116 |
-
print(f" Please clone it first: git clone https://huggingface.co/datasets/{AGENTS_REPO}")
|
| 117 |
-
raise FileNotFoundError(error_msg)
|
| 118 |
-
|
| 119 |
-
if not os.path.exists(os.path.join(AGENTS_REPO_LOCAL_PATH, '.git')):
|
| 120 |
-
error_msg = f"{AGENTS_REPO_LOCAL_PATH} exists but is not a git repository"
|
| 121 |
-
print(f" Error {error_msg}")
|
| 122 |
-
raise ValueError(error_msg)
|
| 123 |
-
|
| 124 |
-
try:
|
| 125 |
-
# Run git pull with extended timeout due to large repository
|
| 126 |
-
result = subprocess.run(
|
| 127 |
-
['git', 'pull'],
|
| 128 |
-
cwd=AGENTS_REPO_LOCAL_PATH,
|
| 129 |
-
capture_output=True,
|
| 130 |
-
text=True,
|
| 131 |
-
timeout=GIT_SYNC_TIMEOUT
|
| 132 |
-
)
|
| 133 |
-
|
| 134 |
-
if result.returncode == 0:
|
| 135 |
-
output = result.stdout.strip()
|
| 136 |
-
if "Already up to date" in output or "Already up-to-date" in output:
|
| 137 |
-
print(f" Success Repository is up to date")
|
| 138 |
-
else:
|
| 139 |
-
print(f" Success Repository synced successfully")
|
| 140 |
-
if output:
|
| 141 |
-
# Print first few lines of output
|
| 142 |
-
lines = output.split('\n')[:5]
|
| 143 |
-
for line in lines:
|
| 144 |
-
print(f" {line}")
|
| 145 |
-
return True
|
| 146 |
-
else:
|
| 147 |
-
error_msg = f"Git pull failed: {result.stderr.strip()}"
|
| 148 |
-
print(f" Error {error_msg}")
|
| 149 |
-
raise RuntimeError(error_msg)
|
| 150 |
-
|
| 151 |
-
except subprocess.TimeoutExpired:
|
| 152 |
-
error_msg = f"Git pull timed out after {GIT_SYNC_TIMEOUT} seconds"
|
| 153 |
-
print(f" Error {error_msg}")
|
| 154 |
-
raise TimeoutError(error_msg)
|
| 155 |
-
except (FileNotFoundError, ValueError, RuntimeError, TimeoutError):
|
| 156 |
-
raise # Re-raise expected exceptions
|
| 157 |
-
except Exception as e:
|
| 158 |
-
error_msg = f"Error syncing repository: {str(e)}"
|
| 159 |
-
print(f" Error {error_msg}")
|
| 160 |
-
raise RuntimeError(error_msg) from e
|
| 161 |
-
|
| 162 |
-
|
| 163 |
def load_agents_from_hf():
|
| 164 |
-
"""
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
# MANDATORY: Sync with remote first to get latest bot data
|
| 169 |
-
print(f" Syncing bot_metadata repository to get latest assistants...")
|
| 170 |
-
sync_agents_repo() # Will raise exception if sync fails
|
| 171 |
-
|
| 172 |
-
assistants = []
|
| 173 |
-
|
| 174 |
-
# Scan local directory for JSON files
|
| 175 |
-
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
|
| 176 |
-
raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
|
| 177 |
-
|
| 178 |
-
# Walk through the directory to find all JSON files
|
| 179 |
-
files_processed = 0
|
| 180 |
-
print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")
|
| 181 |
-
|
| 182 |
-
for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
|
| 183 |
-
# Skip .git directory
|
| 184 |
-
if '.git' in root:
|
| 185 |
-
continue
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
continue
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
|
|
|
|
|
|
| 194 |
try:
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
agent_data = json.load(f)
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
agent_data['github_identifier'] = github_identifier
|
| 205 |
|
| 206 |
-
|
| 207 |
|
| 208 |
except Exception as e:
|
| 209 |
-
print(f"
|
| 210 |
continue
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
|
| 216 |
def get_hf_token():
|
|
|
|
| 24 |
# CONFIGURATION
|
| 25 |
# =============================================================================
|
| 26 |
|
| 27 |
+
AGENTS_REPO = "SWE-Arena/bot_data" # HuggingFace dataset for assistant metadata
|
|
|
|
| 28 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
| 29 |
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data" # HuggingFace dataset for leaderboard data
|
| 30 |
LONGSTANDING_GAP_DAYS = 30 # Minimum days for an issue to be considered long-standing
|
|
|
|
| 31 |
MAX_RETRIES = 5
|
| 32 |
|
| 33 |
LEADERBOARD_COLUMNS = [
|
|
|
|
| 102 |
# HUGGINGFACE DATASET OPERATIONS
|
| 103 |
# =============================================================================
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
def load_agents_from_hf():
|
| 106 |
+
"""Load all assistant metadata JSON files from HuggingFace dataset."""
|
| 107 |
+
try:
|
| 108 |
+
api = HfApi()
|
| 109 |
+
assistants = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
+
# List all files in the repository
|
| 112 |
+
files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
|
|
|
|
| 113 |
|
| 114 |
+
# Filter for JSON files only
|
| 115 |
+
json_files = [f for f in files if f.endswith('.json')]
|
| 116 |
|
| 117 |
+
# Download and parse each JSON file
|
| 118 |
+
for json_file in json_files:
|
| 119 |
try:
|
| 120 |
+
file_path = hf_hub_download_with_backoff(
|
| 121 |
+
repo_id=AGENTS_REPO,
|
| 122 |
+
filename=json_file,
|
| 123 |
+
repo_type="dataset"
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
with open(file_path, 'r') as f:
|
| 127 |
agent_data = json.load(f)
|
| 128 |
|
| 129 |
+
# Only process assistants with status == "active"
|
| 130 |
+
if agent_data.get('status') != 'active':
|
| 131 |
+
continue
|
| 132 |
+
|
| 133 |
+
# Extract github_identifier from filename (e.g., "assistant[bot].json" -> "assistant[bot]")
|
| 134 |
+
filename_identifier = json_file.replace('.json', '')
|
| 135 |
|
| 136 |
+
# Add or override github_identifier to match filename
|
| 137 |
+
agent_data['github_identifier'] = filename_identifier
|
|
|
|
| 138 |
|
| 139 |
+
assistants.append(agent_data)
|
| 140 |
|
| 141 |
except Exception as e:
|
| 142 |
+
print(f"Warning: Could not load {json_file}: {str(e)}")
|
| 143 |
continue
|
| 144 |
|
| 145 |
+
print(f"Loaded {len(assistants)} assistants from HuggingFace")
|
| 146 |
+
return assistants
|
| 147 |
+
|
| 148 |
+
except Exception as e:
|
| 149 |
+
print(f"Could not load assistants from HuggingFace: {str(e)}")
|
| 150 |
+
return None
|
| 151 |
|
| 152 |
|
| 153 |
def get_hf_token():
|
msr.py
CHANGED
|
@@ -29,8 +29,8 @@ load_dotenv()
|
|
| 29 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 30 |
BASE_DIR = os.path.dirname(SCRIPT_DIR) # Parent directory
|
| 31 |
|
| 32 |
-
AGENTS_REPO = "SWE-Arena/bot_metadata"
|
| 33 |
-
AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_metadata") # Local git clone path
|
| 34 |
DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
|
| 35 |
GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
|
| 36 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
|
@@ -881,7 +881,7 @@ def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
|
|
| 881 |
|
| 882 |
def sync_agents_repo():
|
| 883 |
"""
|
| 884 |
-
Sync local bot_metadata repository with remote using git pull.
|
| 885 |
This is MANDATORY to ensure we have the latest bot data.
|
| 886 |
Raises exception if sync fails.
|
| 887 |
"""
|
|
@@ -941,7 +941,7 @@ def load_agents_from_hf():
|
|
| 941 |
ALWAYS syncs with remote first to ensure we have the latest bot data.
|
| 942 |
"""
|
| 943 |
# MANDATORY: Sync with remote first to get latest bot data
|
| 944 |
-
print(f" Syncing bot_metadata repository to get latest assistants...")
|
| 945 |
sync_agents_repo() # Will raise exception if sync fails
|
| 946 |
|
| 947 |
assistants = []
|
|
@@ -1180,8 +1180,8 @@ def construct_leaderboard_from_metadata(all_metadata_dict, assistants, wanted_re
|
|
| 1180 |
identifier = assistant.get('github_identifier')
|
| 1181 |
agent_name = assistant.get('name', 'Unknown')
|
| 1182 |
|
| 1183 |
-
|
| 1184 |
-
stats = calculate_issue_stats_from_metadata(
|
| 1185 |
|
| 1186 |
# Add wanted issues count
|
| 1187 |
resolved_wanted = len(wanted_resolved_dict.get(identifier, []))
|
|
|
|
| 29 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 30 |
BASE_DIR = os.path.dirname(SCRIPT_DIR) # Parent directory
|
| 31 |
|
| 32 |
+
AGENTS_REPO = "SWE-Arena/bot_data"
|
| 33 |
+
AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_data") # Local git clone path
|
| 34 |
DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
|
| 35 |
GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
|
| 36 |
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
|
|
|
|
| 881 |
|
| 882 |
def sync_agents_repo():
|
| 883 |
"""
|
| 884 |
+
Sync local bot_data repository with remote using git pull.
|
| 885 |
This is MANDATORY to ensure we have the latest bot data.
|
| 886 |
Raises exception if sync fails.
|
| 887 |
"""
|
|
|
|
| 941 |
ALWAYS syncs with remote first to ensure we have the latest bot data.
|
| 942 |
"""
|
| 943 |
# MANDATORY: Sync with remote first to get latest bot data
|
| 944 |
+
print(f" Syncing bot_data repository to get latest assistants...")
|
| 945 |
sync_agents_repo() # Will raise exception if sync fails
|
| 946 |
|
| 947 |
assistants = []
|
|
|
|
| 1180 |
identifier = assistant.get('github_identifier')
|
| 1181 |
agent_name = assistant.get('name', 'Unknown')
|
| 1182 |
|
| 1183 |
+
bot_data = all_metadata_dict.get(identifier, [])
|
| 1184 |
+
stats = calculate_issue_stats_from_metadata(bot_data)
|
| 1185 |
|
| 1186 |
# Add wanted issues count
|
| 1187 |
resolved_wanted = len(wanted_resolved_dict.get(identifier, []))
|