SAGE OSS Evaluator commited on
Commit
7844386
·
1 Parent(s): 6de8820
initial_sage_results.json DELETED
@@ -1,446 +0,0 @@
1
- [
2
- {
3
- "model_name": "SDLab",
4
- "organization": "SDLab",
5
- "tokens": "User Submission",
6
- "accuracy": 100.0,
7
- "mg_pass_2": 100.0,
8
- "mg_pass_4": 100.0,
9
- "submitted_time": "2025-09-08",
10
- "results": {
11
- "sage_mathematics": 100.0,
12
- "sage_physics": 100.0,
13
- "sage_chemistry": 100.0,
14
- "sage_biology": 100.0,
15
- "sage_earth_science": 0.0,
16
- "sage_astronomy": 0.0,
17
- "sage_overall": 100.0
18
- },
19
- "contact_email": "nlp2ct.shudong@gmail.com",
20
- "evaluation_timestamp": "2025-09-08T15:36:55.044438",
21
- "result_file": "results/result_SDLab_20250908_153727.json"
22
- },
23
- {
24
- "model_name": "SH AI Lab",
25
- "organization": "SH AI Lab",
26
- "tokens": "User Submission",
27
- "accuracy": 100.0,
28
- "mg_pass_2": 100.0,
29
- "mg_pass_4": 100.0,
30
- "submitted_time": "2025-09-08",
31
- "results": {
32
- "sage_mathematics": 100.0,
33
- "sage_physics": 100.0,
34
- "sage_chemistry": 100.0,
35
- "sage_biology": 100.0,
36
- "sage_earth_science": 0.0,
37
- "sage_astronomy": 0.0,
38
- "sage_overall": 100.0
39
- },
40
- "contact_email": "nlp2ct.shudong@gmail.com",
41
- "evaluation_timestamp": "2025-09-08T14:57:05.665745",
42
- "result_file": "results/result_SH_AI_Lab_20250908_145735.json"
43
- },
44
- {
45
- "model_name": "Shanghai AI Lab",
46
- "organization": "Shanghai AI Lab",
47
- "tokens": "User Submission",
48
- "accuracy": 100.0,
49
- "mg_pass_2": 100.0,
50
- "mg_pass_4": 100.0,
51
- "submitted_time": "2025-09-07",
52
- "results": {
53
- "sage_mathematics": 100.0,
54
- "sage_physics": 100.0,
55
- "sage_chemistry": 100.0,
56
- "sage_biology": 100.0,
57
- "sage_earth_science": 0.0,
58
- "sage_astronomy": 0.0,
59
- "sage_overall": 100.0
60
- },
61
- "contact_email": "nlp2ct.shudong@gmail.com",
62
- "evaluation_timestamp": "2025-09-07T01:00:50.229124",
63
- "result_file": "results/result_Shanghai_AI_Lab_20250907_010121.json"
64
- },
65
- {
66
- "model_name": "E2E_Test_Org",
67
- "organization": "E2E_Test_Org",
68
- "tokens": "User Submission",
69
- "accuracy": 100.0,
70
- "mg_pass_2": 100.0,
71
- "mg_pass_4": 100.0,
72
- "submitted_time": "2025-09-06",
73
- "results": {
74
- "sage_mathematics": 100.0,
75
- "sage_physics": 0.0,
76
- "sage_chemistry": 100.0,
77
- "sage_biology": 0.0,
78
- "sage_earth_science": 0.0,
79
- "sage_astronomy": 0.0,
80
- "sage_overall": 100.0
81
- },
82
- "contact_email": "e2e@test.com",
83
- "evaluation_timestamp": "2025-09-06T23:34:38.252926",
84
- "result_file": "results/result_E2E_Test_Org_20250906_233650.json"
85
- },
86
- {
87
- "model_name": "Test_Organization",
88
- "organization": "Test_Organization",
89
- "tokens": "User Submission",
90
- "accuracy": 100.0,
91
- "mg_pass_2": 100.0,
92
- "mg_pass_4": 100.0,
93
- "submitted_time": "2025-09-06",
94
- "results": {
95
- "sage_mathematics": 100.0,
96
- "sage_physics": 0.0,
97
- "sage_chemistry": 100.0,
98
- "sage_biology": 0.0,
99
- "sage_earth_science": 0.0,
100
- "sage_astronomy": 0.0,
101
- "sage_overall": 100.0
102
- },
103
- "contact_email": "test@example.com",
104
- "evaluation_timestamp": "2025-09-06T23:32:03.654608",
105
- "result_file": "results/result_Test_Organization_20250906_233407.json"
106
- },
107
- {
108
- "model_name": "Test_Organization",
109
- "organization": "Test_Organization",
110
- "tokens": "User Submission",
111
- "accuracy": 100.0,
112
- "mg_pass_2": 100.0,
113
- "mg_pass_4": 100.0,
114
- "submitted_time": "2025-09-06",
115
- "results": {
116
- "sage_mathematics": 100.0,
117
- "sage_physics": 0.0,
118
- "sage_chemistry": 100.0,
119
- "sage_biology": 0.0,
120
- "sage_earth_science": 0.0,
121
- "sage_astronomy": 0.0,
122
- "sage_overall": 100.0
123
- },
124
- "contact_email": "test@example.com",
125
- "evaluation_timestamp": "2025-09-06T23:29:34.267864",
126
- "result_file": "results/result_Test_Organization_20250906_233132.json"
127
- },
128
- {
129
- "model_name": "Unknown",
130
- "organization": "Unknown",
131
- "tokens": "User Submission",
132
- "accuracy": 100.0,
133
- "mg_pass_2": 100.0,
134
- "mg_pass_4": 100.0,
135
- "submitted_time": "2025-09-05",
136
- "results": {
137
- "sage_mathematics": 100.0,
138
- "sage_physics": 100.0,
139
- "sage_chemistry": 100.0,
140
- "sage_biology": 100.0,
141
- "sage_earth_science": 0.0,
142
- "sage_astronomy": 0.0,
143
- "sage_overall": 100.0
144
- },
145
- "contact_email": "",
146
- "evaluation_timestamp": "2025-09-05T16:14:32.476871",
147
- "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
148
- },
149
- {
150
- "model_name": "OpenAI GPT-5-High",
151
- "organization": "OpenAI",
152
- "tokens": "64k",
153
- "accuracy": 45.2,
154
- "mg_pass_2": 36.6,
155
- "mg_pass_4": 35.1,
156
- "submitted_time": "2024-01-15",
157
- "results": {
158
- "sage_overall": 45.2,
159
- "sage_math": 48.5,
160
- "sage_physics": 44.1,
161
- "sage_chemistry": 42.8,
162
- "sage_biology": 46.3,
163
- "sage_earth_science": 43.7,
164
- "sage_astronomy": 45.8
165
- }
166
- },
167
- {
168
- "model_name": "Gemini-2.5-Pro",
169
- "organization": "Google",
170
- "tokens": "64k",
171
- "accuracy": 40.5,
172
- "mg_pass_2": 31.2,
173
- "mg_pass_4": 29.7,
174
- "submitted_time": "2024-01-14",
175
- "results": {
176
- "sage_overall": 40.5,
177
- "sage_math": 43.2,
178
- "sage_physics": 39.8,
179
- "sage_chemistry": 38.1,
180
- "sage_biology": 41.7,
181
- "sage_earth_science": 39.4,
182
- "sage_astronomy": 40.8
183
- }
184
- },
185
- {
186
- "model_name": "OpenAI o3-High",
187
- "organization": "OpenAI",
188
- "tokens": "64k",
189
- "accuracy": 39.6,
190
- "mg_pass_2": 26.0,
191
- "mg_pass_4": 27.3,
192
- "submitted_time": "2024-01-13",
193
- "results": {
194
- "sage_overall": 39.6,
195
- "sage_math": 42.1,
196
- "sage_physics": 38.5,
197
- "sage_chemistry": 37.2,
198
- "sage_biology": 40.8,
199
- "sage_earth_science": 38.1,
200
- "sage_astronomy": 40.9
201
- }
202
- },
203
- {
204
- "model_name": "Gemini-2.5-Pro",
205
- "organization": "Google",
206
- "tokens": "32k",
207
- "accuracy": 39.1,
208
- "mg_pass_2": 29.4,
209
- "mg_pass_4": 27.5,
210
- "submitted_time": "2024-01-12",
211
- "results": {
212
- "sage_overall": 39.1,
213
- "sage_math": 41.8,
214
- "sage_physics": 38.2,
215
- "sage_chemistry": 36.9,
216
- "sage_biology": 40.3,
217
- "sage_earth_science": 37.7,
218
- "sage_astronomy": 39.7
219
- }
220
- },
221
- {
222
- "model_name": "OpenAI o3-High",
223
- "organization": "OpenAI",
224
- "tokens": "32k",
225
- "accuracy": 38.5,
226
- "mg_pass_2": 26.4,
227
- "mg_pass_4": 24.2,
228
- "submitted_time": "2024-01-11",
229
- "results": {
230
- "sage_overall": 38.5,
231
- "sage_math": 41.2,
232
- "sage_physics": 37.8,
233
- "sage_chemistry": 36.1,
234
- "sage_biology": 39.9,
235
- "sage_earth_science": 37.3,
236
- "sage_astronomy": 38.7
237
- }
238
- },
239
- {
240
- "model_name": "Grok-4",
241
- "organization": "xAI",
242
- "tokens": "32k",
243
- "accuracy": 35.0,
244
- "mg_pass_2": 26.0,
245
- "mg_pass_4": 24.1,
246
- "submitted_time": "2024-01-10",
247
- "results": {
248
- "sage_overall": 35.0,
249
- "sage_math": 37.5,
250
- "sage_physics": 34.2,
251
- "sage_chemistry": 33.1,
252
- "sage_biology": 36.1,
253
- "sage_earth_science": 34.8,
254
- "sage_astronomy": 34.3
255
- }
256
- },
257
- {
258
- "model_name": "Qwen3-235B-A22B-2507",
259
- "organization": "Alibaba",
260
- "tokens": "32k",
261
- "accuracy": 27.8,
262
- "mg_pass_2": 19.8,
263
- "mg_pass_4": 18.1,
264
- "submitted_time": "2024-01-09",
265
- "results": {
266
- "sage_overall": 27.8,
267
- "sage_math": 29.8,
268
- "sage_physics": 27.1,
269
- "sage_chemistry": 26.5,
270
- "sage_biology": 28.4,
271
- "sage_earth_science": 27.9,
272
- "sage_astronomy": 27.1
273
- }
274
- },
275
- {
276
- "model_name": "Doubao-Seed-1.6-thinking",
277
- "organization": "ByteDance",
278
- "tokens": "32k",
279
- "accuracy": 27.7,
280
- "mg_pass_2": 18.4,
281
- "mg_pass_4": 16.8,
282
- "submitted_time": "2024-01-08",
283
- "results": {
284
- "sage_overall": 27.7,
285
- "sage_math": 29.6,
286
- "sage_physics": 27.0,
287
- "sage_chemistry": 26.3,
288
- "sage_biology": 28.2,
289
- "sage_earth_science": 27.7,
290
- "sage_astronomy": 27.4
291
- }
292
- },
293
- {
294
- "model_name": "DeepSeek-V3.1",
295
- "organization": "DeepSeek",
296
- "tokens": "64k",
297
- "accuracy": 27.7,
298
- "mg_pass_2": 18.3,
299
- "mg_pass_4": 16.5,
300
- "submitted_time": "2024-01-07",
301
- "results": {
302
- "sage_overall": 27.7,
303
- "sage_math": 29.5,
304
- "sage_physics": 26.9,
305
- "sage_chemistry": 26.2,
306
- "sage_biology": 28.1,
307
- "sage_earth_science": 27.6,
308
- "sage_astronomy": 27.9
309
- }
310
- },
311
- {
312
- "model_name": "DeepSeek-R1-0528",
313
- "organization": "DeepSeek",
314
- "tokens": "32k",
315
- "accuracy": 26.1,
316
- "mg_pass_2": 16.0,
317
- "mg_pass_4": 14.1,
318
- "submitted_time": "2024-01-06",
319
- "results": {
320
- "sage_overall": 26.1,
321
- "sage_math": 28.0,
322
- "sage_physics": 25.4,
323
- "sage_chemistry": 24.8,
324
- "sage_biology": 26.7,
325
- "sage_earth_science": 26.2,
326
- "sage_astronomy": 25.5
327
- }
328
- },
329
- {
330
- "model_name": "OpenAI o4-mini",
331
- "organization": "OpenAI",
332
- "tokens": "32k",
333
- "accuracy": 23.5,
334
- "mg_pass_2": 13.7,
335
- "mg_pass_4": 11.9,
336
- "submitted_time": "2024-01-05",
337
- "results": {
338
- "sage_overall": 23.5,
339
- "sage_math": 25.2,
340
- "sage_physics": 22.8,
341
- "sage_chemistry": 22.1,
342
- "sage_biology": 24.1,
343
- "sage_earth_science": 23.6,
344
- "sage_astronomy": 23.2
345
- }
346
- },
347
- {
348
- "model_name": "Qwen3-235B-A22B",
349
- "organization": "Alibaba",
350
- "tokens": "32k",
351
- "accuracy": 20.1,
352
- "mg_pass_2": 11.2,
353
- "mg_pass_4": 9.6,
354
- "submitted_time": "2024-01-04",
355
- "results": {
356
- "sage_overall": 20.1,
357
- "sage_math": 21.5,
358
- "sage_physics": 19.5,
359
- "sage_chemistry": 19.2,
360
- "sage_biology": 20.7,
361
- "sage_earth_science": 20.3,
362
- "sage_astronomy": 19.4
363
- }
364
- },
365
- {
366
- "model_name": "GLM-4.5-Thinking",
367
- "organization": "Zhipu AI",
368
- "tokens": "64k",
369
- "accuracy": 9.3,
370
- "mg_pass_2": 4.7,
371
- "mg_pass_4": 4.0,
372
- "submitted_time": "2024-01-03",
373
- "results": {
374
- "sage_overall": 9.3,
375
- "sage_math": 10.1,
376
- "sage_physics": 9.0,
377
- "sage_chemistry": 8.7,
378
- "sage_biology": 9.6,
379
- "sage_earth_science": 9.2,
380
- "sage_astronomy": 9.2
381
- }
382
- },
383
- {
384
- "model_name": "QuickDemo_TestOrg",
385
- "organization": "QuickDemo_TestOrg",
386
- "tokens": "User Submission (Simulated)",
387
- "accuracy": 0.619,
388
- "mg_pass_2": 0.619,
389
- "mg_pass_4": 0.619,
390
- "submitted_time": "2025-09-05",
391
- "results": {
392
- "sage_mathematics": 0.877,
393
- "sage_physics": 0.895,
394
- "sage_chemistry": 0.756,
395
- "sage_biology": 0.316,
396
- "sage_earth_science": 0.312,
397
- "sage_astronomy": 0.56,
398
- "sage_overall": 0.619
399
- },
400
- "contact_email": "test@demo.com",
401
- "evaluation_timestamp": "2025-09-05T16:19:39.864071",
402
- "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
403
- },
404
- {
405
- "model_name": "QuickDemo_HighAccuracy",
406
- "organization": "QuickDemo_HighAccuracy",
407
- "tokens": "User Submission (Simulated)",
408
- "accuracy": 0.598,
409
- "mg_pass_2": 0.598,
410
- "mg_pass_4": 0.598,
411
- "submitted_time": "2025-09-05",
412
- "results": {
413
- "sage_mathematics": 0.88,
414
- "sage_physics": 0.331,
415
- "sage_chemistry": 0.646,
416
- "sage_biology": 0.501,
417
- "sage_earth_science": 0.818,
418
- "sage_astronomy": 0.415,
419
- "sage_overall": 0.598
420
- },
421
- "contact_email": "high@demo.com",
422
- "evaluation_timestamp": "2025-09-05T16:19:43.874748",
423
- "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
424
- },
425
- {
426
- "model_name": "QuickDemo_MediumAccuracy",
427
- "organization": "QuickDemo_MediumAccuracy",
428
- "tokens": "User Submission (Simulated)",
429
- "accuracy": 0.516,
430
- "mg_pass_2": 0.516,
431
- "mg_pass_4": 0.516,
432
- "submitted_time": "2025-09-05",
433
- "results": {
434
- "sage_mathematics": 0.474,
435
- "sage_physics": 0.518,
436
- "sage_chemistry": 0.674,
437
- "sage_biology": 0.638,
438
- "sage_earth_science": 0.318,
439
- "sage_astronomy": 0.473,
440
- "sage_overall": 0.516
441
- },
442
- "contact_email": "medium@demo.com",
443
- "evaluation_timestamp": "2025-09-05T16:19:41.868409",
444
- "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
445
- }
446
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
initial_sage_results.json.backup_20250906_233132 DELETED
@@ -1,320 +0,0 @@
1
- [
2
- {
3
- "model_name": "Unknown",
4
- "organization": "Unknown",
5
- "tokens": "User Submission",
6
- "accuracy": 100.0,
7
- "mg_pass_2": 100.0,
8
- "mg_pass_4": 100.0,
9
- "submitted_time": "2025-09-05",
10
- "results": {
11
- "sage_mathematics": 100.0,
12
- "sage_physics": 100.0,
13
- "sage_chemistry": 100.0,
14
- "sage_biology": 100.0,
15
- "sage_earth_science": 0.0,
16
- "sage_astronomy": 0.0,
17
- "sage_overall": 100.0
18
- },
19
- "contact_email": "",
20
- "evaluation_timestamp": "2025-09-05T16:14:32.476871",
21
- "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
22
- },
23
- {
24
- "model_name": "OpenAI GPT-5-High",
25
- "organization": "OpenAI",
26
- "tokens": "64k",
27
- "accuracy": 45.2,
28
- "mg_pass_2": 36.6,
29
- "mg_pass_4": 35.1,
30
- "submitted_time": "2024-01-15",
31
- "results": {
32
- "sage_overall": 45.2,
33
- "sage_math": 48.5,
34
- "sage_physics": 44.1,
35
- "sage_chemistry": 42.8,
36
- "sage_biology": 46.3,
37
- "sage_earth_science": 43.7,
38
- "sage_astronomy": 45.8
39
- }
40
- },
41
- {
42
- "model_name": "Gemini-2.5-Pro",
43
- "organization": "Google",
44
- "tokens": "64k",
45
- "accuracy": 40.5,
46
- "mg_pass_2": 31.2,
47
- "mg_pass_4": 29.7,
48
- "submitted_time": "2024-01-14",
49
- "results": {
50
- "sage_overall": 40.5,
51
- "sage_math": 43.2,
52
- "sage_physics": 39.8,
53
- "sage_chemistry": 38.1,
54
- "sage_biology": 41.7,
55
- "sage_earth_science": 39.4,
56
- "sage_astronomy": 40.8
57
- }
58
- },
59
- {
60
- "model_name": "OpenAI o3-High",
61
- "organization": "OpenAI",
62
- "tokens": "64k",
63
- "accuracy": 39.6,
64
- "mg_pass_2": 26.0,
65
- "mg_pass_4": 27.3,
66
- "submitted_time": "2024-01-13",
67
- "results": {
68
- "sage_overall": 39.6,
69
- "sage_math": 42.1,
70
- "sage_physics": 38.5,
71
- "sage_chemistry": 37.2,
72
- "sage_biology": 40.8,
73
- "sage_earth_science": 38.1,
74
- "sage_astronomy": 40.9
75
- }
76
- },
77
- {
78
- "model_name": "Gemini-2.5-Pro",
79
- "organization": "Google",
80
- "tokens": "32k",
81
- "accuracy": 39.1,
82
- "mg_pass_2": 29.4,
83
- "mg_pass_4": 27.5,
84
- "submitted_time": "2024-01-12",
85
- "results": {
86
- "sage_overall": 39.1,
87
- "sage_math": 41.8,
88
- "sage_physics": 38.2,
89
- "sage_chemistry": 36.9,
90
- "sage_biology": 40.3,
91
- "sage_earth_science": 37.7,
92
- "sage_astronomy": 39.7
93
- }
94
- },
95
- {
96
- "model_name": "OpenAI o3-High",
97
- "organization": "OpenAI",
98
- "tokens": "32k",
99
- "accuracy": 38.5,
100
- "mg_pass_2": 26.4,
101
- "mg_pass_4": 24.2,
102
- "submitted_time": "2024-01-11",
103
- "results": {
104
- "sage_overall": 38.5,
105
- "sage_math": 41.2,
106
- "sage_physics": 37.8,
107
- "sage_chemistry": 36.1,
108
- "sage_biology": 39.9,
109
- "sage_earth_science": 37.3,
110
- "sage_astronomy": 38.7
111
- }
112
- },
113
- {
114
- "model_name": "Grok-4",
115
- "organization": "xAI",
116
- "tokens": "32k",
117
- "accuracy": 35.0,
118
- "mg_pass_2": 26.0,
119
- "mg_pass_4": 24.1,
120
- "submitted_time": "2024-01-10",
121
- "results": {
122
- "sage_overall": 35.0,
123
- "sage_math": 37.5,
124
- "sage_physics": 34.2,
125
- "sage_chemistry": 33.1,
126
- "sage_biology": 36.1,
127
- "sage_earth_science": 34.8,
128
- "sage_astronomy": 34.3
129
- }
130
- },
131
- {
132
- "model_name": "Qwen3-235B-A22B-2507",
133
- "organization": "Alibaba",
134
- "tokens": "32k",
135
- "accuracy": 27.8,
136
- "mg_pass_2": 19.8,
137
- "mg_pass_4": 18.1,
138
- "submitted_time": "2024-01-09",
139
- "results": {
140
- "sage_overall": 27.8,
141
- "sage_math": 29.8,
142
- "sage_physics": 27.1,
143
- "sage_chemistry": 26.5,
144
- "sage_biology": 28.4,
145
- "sage_earth_science": 27.9,
146
- "sage_astronomy": 27.1
147
- }
148
- },
149
- {
150
- "model_name": "Doubao-Seed-1.6-thinking",
151
- "organization": "ByteDance",
152
- "tokens": "32k",
153
- "accuracy": 27.7,
154
- "mg_pass_2": 18.4,
155
- "mg_pass_4": 16.8,
156
- "submitted_time": "2024-01-08",
157
- "results": {
158
- "sage_overall": 27.7,
159
- "sage_math": 29.6,
160
- "sage_physics": 27.0,
161
- "sage_chemistry": 26.3,
162
- "sage_biology": 28.2,
163
- "sage_earth_science": 27.7,
164
- "sage_astronomy": 27.4
165
- }
166
- },
167
- {
168
- "model_name": "DeepSeek-V3.1",
169
- "organization": "DeepSeek",
170
- "tokens": "64k",
171
- "accuracy": 27.7,
172
- "mg_pass_2": 18.3,
173
- "mg_pass_4": 16.5,
174
- "submitted_time": "2024-01-07",
175
- "results": {
176
- "sage_overall": 27.7,
177
- "sage_math": 29.5,
178
- "sage_physics": 26.9,
179
- "sage_chemistry": 26.2,
180
- "sage_biology": 28.1,
181
- "sage_earth_science": 27.6,
182
- "sage_astronomy": 27.9
183
- }
184
- },
185
- {
186
- "model_name": "DeepSeek-R1-0528",
187
- "organization": "DeepSeek",
188
- "tokens": "32k",
189
- "accuracy": 26.1,
190
- "mg_pass_2": 16.0,
191
- "mg_pass_4": 14.1,
192
- "submitted_time": "2024-01-06",
193
- "results": {
194
- "sage_overall": 26.1,
195
- "sage_math": 28.0,
196
- "sage_physics": 25.4,
197
- "sage_chemistry": 24.8,
198
- "sage_biology": 26.7,
199
- "sage_earth_science": 26.2,
200
- "sage_astronomy": 25.5
201
- }
202
- },
203
- {
204
- "model_name": "OpenAI o4-mini",
205
- "organization": "OpenAI",
206
- "tokens": "32k",
207
- "accuracy": 23.5,
208
- "mg_pass_2": 13.7,
209
- "mg_pass_4": 11.9,
210
- "submitted_time": "2024-01-05",
211
- "results": {
212
- "sage_overall": 23.5,
213
- "sage_math": 25.2,
214
- "sage_physics": 22.8,
215
- "sage_chemistry": 22.1,
216
- "sage_biology": 24.1,
217
- "sage_earth_science": 23.6,
218
- "sage_astronomy": 23.2
219
- }
220
- },
221
- {
222
- "model_name": "Qwen3-235B-A22B",
223
- "organization": "Alibaba",
224
- "tokens": "32k",
225
- "accuracy": 20.1,
226
- "mg_pass_2": 11.2,
227
- "mg_pass_4": 9.6,
228
- "submitted_time": "2024-01-04",
229
- "results": {
230
- "sage_overall": 20.1,
231
- "sage_math": 21.5,
232
- "sage_physics": 19.5,
233
- "sage_chemistry": 19.2,
234
- "sage_biology": 20.7,
235
- "sage_earth_science": 20.3,
236
- "sage_astronomy": 19.4
237
- }
238
- },
239
- {
240
- "model_name": "GLM-4.5-Thinking",
241
- "organization": "Zhipu AI",
242
- "tokens": "64k",
243
- "accuracy": 9.3,
244
- "mg_pass_2": 4.7,
245
- "mg_pass_4": 4.0,
246
- "submitted_time": "2024-01-03",
247
- "results": {
248
- "sage_overall": 9.3,
249
- "sage_math": 10.1,
250
- "sage_physics": 9.0,
251
- "sage_chemistry": 8.7,
252
- "sage_biology": 9.6,
253
- "sage_earth_science": 9.2,
254
- "sage_astronomy": 9.2
255
- }
256
- },
257
- {
258
- "model_name": "QuickDemo_TestOrg",
259
- "organization": "QuickDemo_TestOrg",
260
- "tokens": "User Submission (Simulated)",
261
- "accuracy": 0.619,
262
- "mg_pass_2": 0.619,
263
- "mg_pass_4": 0.619,
264
- "submitted_time": "2025-09-05",
265
- "results": {
266
- "sage_mathematics": 0.877,
267
- "sage_physics": 0.895,
268
- "sage_chemistry": 0.756,
269
- "sage_biology": 0.316,
270
- "sage_earth_science": 0.312,
271
- "sage_astronomy": 0.56,
272
- "sage_overall": 0.619
273
- },
274
- "contact_email": "test@demo.com",
275
- "evaluation_timestamp": "2025-09-05T16:19:39.864071",
276
- "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
277
- },
278
- {
279
- "model_name": "QuickDemo_HighAccuracy",
280
- "organization": "QuickDemo_HighAccuracy",
281
- "tokens": "User Submission (Simulated)",
282
- "accuracy": 0.598,
283
- "mg_pass_2": 0.598,
284
- "mg_pass_4": 0.598,
285
- "submitted_time": "2025-09-05",
286
- "results": {
287
- "sage_mathematics": 0.88,
288
- "sage_physics": 0.331,
289
- "sage_chemistry": 0.646,
290
- "sage_biology": 0.501,
291
- "sage_earth_science": 0.818,
292
- "sage_astronomy": 0.415,
293
- "sage_overall": 0.598
294
- },
295
- "contact_email": "high@demo.com",
296
- "evaluation_timestamp": "2025-09-05T16:19:43.874748",
297
- "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
298
- },
299
- {
300
- "model_name": "QuickDemo_MediumAccuracy",
301
- "organization": "QuickDemo_MediumAccuracy",
302
- "tokens": "User Submission (Simulated)",
303
- "accuracy": 0.516,
304
- "mg_pass_2": 0.516,
305
- "mg_pass_4": 0.516,
306
- "submitted_time": "2025-09-05",
307
- "results": {
308
- "sage_mathematics": 0.474,
309
- "sage_physics": 0.518,
310
- "sage_chemistry": 0.674,
311
- "sage_biology": 0.638,
312
- "sage_earth_science": 0.318,
313
- "sage_astronomy": 0.473,
314
- "sage_overall": 0.516
315
- },
316
- "contact_email": "medium@demo.com",
317
- "evaluation_timestamp": "2025-09-05T16:19:41.868409",
318
- "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
319
- }
320
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
initial_sage_results.json.backup_20250906_233407 DELETED
@@ -1,341 +0,0 @@
1
- [
2
- {
3
- "model_name": "Test_Organization",
4
- "organization": "Test_Organization",
5
- "tokens": "User Submission",
6
- "accuracy": 100.0,
7
- "mg_pass_2": 100.0,
8
- "mg_pass_4": 100.0,
9
- "submitted_time": "2025-09-06",
10
- "results": {
11
- "sage_mathematics": 100.0,
12
- "sage_physics": 0.0,
13
- "sage_chemistry": 100.0,
14
- "sage_biology": 0.0,
15
- "sage_earth_science": 0.0,
16
- "sage_astronomy": 0.0,
17
- "sage_overall": 100.0
18
- },
19
- "contact_email": "test@example.com",
20
- "evaluation_timestamp": "2025-09-06T23:29:34.267864",
21
- "result_file": "results/result_Test_Organization_20250906_233132.json"
22
- },
23
- {
24
- "model_name": "Unknown",
25
- "organization": "Unknown",
26
- "tokens": "User Submission",
27
- "accuracy": 100.0,
28
- "mg_pass_2": 100.0,
29
- "mg_pass_4": 100.0,
30
- "submitted_time": "2025-09-05",
31
- "results": {
32
- "sage_mathematics": 100.0,
33
- "sage_physics": 100.0,
34
- "sage_chemistry": 100.0,
35
- "sage_biology": 100.0,
36
- "sage_earth_science": 0.0,
37
- "sage_astronomy": 0.0,
38
- "sage_overall": 100.0
39
- },
40
- "contact_email": "",
41
- "evaluation_timestamp": "2025-09-05T16:14:32.476871",
42
- "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
43
- },
44
- {
45
- "model_name": "OpenAI GPT-5-High",
46
- "organization": "OpenAI",
47
- "tokens": "64k",
48
- "accuracy": 45.2,
49
- "mg_pass_2": 36.6,
50
- "mg_pass_4": 35.1,
51
- "submitted_time": "2024-01-15",
52
- "results": {
53
- "sage_overall": 45.2,
54
- "sage_math": 48.5,
55
- "sage_physics": 44.1,
56
- "sage_chemistry": 42.8,
57
- "sage_biology": 46.3,
58
- "sage_earth_science": 43.7,
59
- "sage_astronomy": 45.8
60
- }
61
- },
62
- {
63
- "model_name": "Gemini-2.5-Pro",
64
- "organization": "Google",
65
- "tokens": "64k",
66
- "accuracy": 40.5,
67
- "mg_pass_2": 31.2,
68
- "mg_pass_4": 29.7,
69
- "submitted_time": "2024-01-14",
70
- "results": {
71
- "sage_overall": 40.5,
72
- "sage_math": 43.2,
73
- "sage_physics": 39.8,
74
- "sage_chemistry": 38.1,
75
- "sage_biology": 41.7,
76
- "sage_earth_science": 39.4,
77
- "sage_astronomy": 40.8
78
- }
79
- },
80
- {
81
- "model_name": "OpenAI o3-High",
82
- "organization": "OpenAI",
83
- "tokens": "64k",
84
- "accuracy": 39.6,
85
- "mg_pass_2": 26.0,
86
- "mg_pass_4": 27.3,
87
- "submitted_time": "2024-01-13",
88
- "results": {
89
- "sage_overall": 39.6,
90
- "sage_math": 42.1,
91
- "sage_physics": 38.5,
92
- "sage_chemistry": 37.2,
93
- "sage_biology": 40.8,
94
- "sage_earth_science": 38.1,
95
- "sage_astronomy": 40.9
96
- }
97
- },
98
- {
99
- "model_name": "Gemini-2.5-Pro",
100
- "organization": "Google",
101
- "tokens": "32k",
102
- "accuracy": 39.1,
103
- "mg_pass_2": 29.4,
104
- "mg_pass_4": 27.5,
105
- "submitted_time": "2024-01-12",
106
- "results": {
107
- "sage_overall": 39.1,
108
- "sage_math": 41.8,
109
- "sage_physics": 38.2,
110
- "sage_chemistry": 36.9,
111
- "sage_biology": 40.3,
112
- "sage_earth_science": 37.7,
113
- "sage_astronomy": 39.7
114
- }
115
- },
116
- {
117
- "model_name": "OpenAI o3-High",
118
- "organization": "OpenAI",
119
- "tokens": "32k",
120
- "accuracy": 38.5,
121
- "mg_pass_2": 26.4,
122
- "mg_pass_4": 24.2,
123
- "submitted_time": "2024-01-11",
124
- "results": {
125
- "sage_overall": 38.5,
126
- "sage_math": 41.2,
127
- "sage_physics": 37.8,
128
- "sage_chemistry": 36.1,
129
- "sage_biology": 39.9,
130
- "sage_earth_science": 37.3,
131
- "sage_astronomy": 38.7
132
- }
133
- },
134
- {
135
- "model_name": "Grok-4",
136
- "organization": "xAI",
137
- "tokens": "32k",
138
- "accuracy": 35.0,
139
- "mg_pass_2": 26.0,
140
- "mg_pass_4": 24.1,
141
- "submitted_time": "2024-01-10",
142
- "results": {
143
- "sage_overall": 35.0,
144
- "sage_math": 37.5,
145
- "sage_physics": 34.2,
146
- "sage_chemistry": 33.1,
147
- "sage_biology": 36.1,
148
- "sage_earth_science": 34.8,
149
- "sage_astronomy": 34.3
150
- }
151
- },
152
- {
153
- "model_name": "Qwen3-235B-A22B-2507",
154
- "organization": "Alibaba",
155
- "tokens": "32k",
156
- "accuracy": 27.8,
157
- "mg_pass_2": 19.8,
158
- "mg_pass_4": 18.1,
159
- "submitted_time": "2024-01-09",
160
- "results": {
161
- "sage_overall": 27.8,
162
- "sage_math": 29.8,
163
- "sage_physics": 27.1,
164
- "sage_chemistry": 26.5,
165
- "sage_biology": 28.4,
166
- "sage_earth_science": 27.9,
167
- "sage_astronomy": 27.1
168
- }
169
- },
170
- {
171
- "model_name": "Doubao-Seed-1.6-thinking",
172
- "organization": "ByteDance",
173
- "tokens": "32k",
174
- "accuracy": 27.7,
175
- "mg_pass_2": 18.4,
176
- "mg_pass_4": 16.8,
177
- "submitted_time": "2024-01-08",
178
- "results": {
179
- "sage_overall": 27.7,
180
- "sage_math": 29.6,
181
- "sage_physics": 27.0,
182
- "sage_chemistry": 26.3,
183
- "sage_biology": 28.2,
184
- "sage_earth_science": 27.7,
185
- "sage_astronomy": 27.4
186
- }
187
- },
188
- {
189
- "model_name": "DeepSeek-V3.1",
190
- "organization": "DeepSeek",
191
- "tokens": "64k",
192
- "accuracy": 27.7,
193
- "mg_pass_2": 18.3,
194
- "mg_pass_4": 16.5,
195
- "submitted_time": "2024-01-07",
196
- "results": {
197
- "sage_overall": 27.7,
198
- "sage_math": 29.5,
199
- "sage_physics": 26.9,
200
- "sage_chemistry": 26.2,
201
- "sage_biology": 28.1,
202
- "sage_earth_science": 27.6,
203
- "sage_astronomy": 27.9
204
- }
205
- },
206
- {
207
- "model_name": "DeepSeek-R1-0528",
208
- "organization": "DeepSeek",
209
- "tokens": "32k",
210
- "accuracy": 26.1,
211
- "mg_pass_2": 16.0,
212
- "mg_pass_4": 14.1,
213
- "submitted_time": "2024-01-06",
214
- "results": {
215
- "sage_overall": 26.1,
216
- "sage_math": 28.0,
217
- "sage_physics": 25.4,
218
- "sage_chemistry": 24.8,
219
- "sage_biology": 26.7,
220
- "sage_earth_science": 26.2,
221
- "sage_astronomy": 25.5
222
- }
223
- },
224
- {
225
- "model_name": "OpenAI o4-mini",
226
- "organization": "OpenAI",
227
- "tokens": "32k",
228
- "accuracy": 23.5,
229
- "mg_pass_2": 13.7,
230
- "mg_pass_4": 11.9,
231
- "submitted_time": "2024-01-05",
232
- "results": {
233
- "sage_overall": 23.5,
234
- "sage_math": 25.2,
235
- "sage_physics": 22.8,
236
- "sage_chemistry": 22.1,
237
- "sage_biology": 24.1,
238
- "sage_earth_science": 23.6,
239
- "sage_astronomy": 23.2
240
- }
241
- },
242
- {
243
- "model_name": "Qwen3-235B-A22B",
244
- "organization": "Alibaba",
245
- "tokens": "32k",
246
- "accuracy": 20.1,
247
- "mg_pass_2": 11.2,
248
- "mg_pass_4": 9.6,
249
- "submitted_time": "2024-01-04",
250
- "results": {
251
- "sage_overall": 20.1,
252
- "sage_math": 21.5,
253
- "sage_physics": 19.5,
254
- "sage_chemistry": 19.2,
255
- "sage_biology": 20.7,
256
- "sage_earth_science": 20.3,
257
- "sage_astronomy": 19.4
258
- }
259
- },
260
- {
261
- "model_name": "GLM-4.5-Thinking",
262
- "organization": "Zhipu AI",
263
- "tokens": "64k",
264
- "accuracy": 9.3,
265
- "mg_pass_2": 4.7,
266
- "mg_pass_4": 4.0,
267
- "submitted_time": "2024-01-03",
268
- "results": {
269
- "sage_overall": 9.3,
270
- "sage_math": 10.1,
271
- "sage_physics": 9.0,
272
- "sage_chemistry": 8.7,
273
- "sage_biology": 9.6,
274
- "sage_earth_science": 9.2,
275
- "sage_astronomy": 9.2
276
- }
277
- },
278
- {
279
- "model_name": "QuickDemo_TestOrg",
280
- "organization": "QuickDemo_TestOrg",
281
- "tokens": "User Submission (Simulated)",
282
- "accuracy": 0.619,
283
- "mg_pass_2": 0.619,
284
- "mg_pass_4": 0.619,
285
- "submitted_time": "2025-09-05",
286
- "results": {
287
- "sage_mathematics": 0.877,
288
- "sage_physics": 0.895,
289
- "sage_chemistry": 0.756,
290
- "sage_biology": 0.316,
291
- "sage_earth_science": 0.312,
292
- "sage_astronomy": 0.56,
293
- "sage_overall": 0.619
294
- },
295
- "contact_email": "test@demo.com",
296
- "evaluation_timestamp": "2025-09-05T16:19:39.864071",
297
- "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
298
- },
299
- {
300
- "model_name": "QuickDemo_HighAccuracy",
301
- "organization": "QuickDemo_HighAccuracy",
302
- "tokens": "User Submission (Simulated)",
303
- "accuracy": 0.598,
304
- "mg_pass_2": 0.598,
305
- "mg_pass_4": 0.598,
306
- "submitted_time": "2025-09-05",
307
- "results": {
308
- "sage_mathematics": 0.88,
309
- "sage_physics": 0.331,
310
- "sage_chemistry": 0.646,
311
- "sage_biology": 0.501,
312
- "sage_earth_science": 0.818,
313
- "sage_astronomy": 0.415,
314
- "sage_overall": 0.598
315
- },
316
- "contact_email": "high@demo.com",
317
- "evaluation_timestamp": "2025-09-05T16:19:43.874748",
318
- "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
319
- },
320
- {
321
- "model_name": "QuickDemo_MediumAccuracy",
322
- "organization": "QuickDemo_MediumAccuracy",
323
- "tokens": "User Submission (Simulated)",
324
- "accuracy": 0.516,
325
- "mg_pass_2": 0.516,
326
- "mg_pass_4": 0.516,
327
- "submitted_time": "2025-09-05",
328
- "results": {
329
- "sage_mathematics": 0.474,
330
- "sage_physics": 0.518,
331
- "sage_chemistry": 0.674,
332
- "sage_biology": 0.638,
333
- "sage_earth_science": 0.318,
334
- "sage_astronomy": 0.473,
335
- "sage_overall": 0.516
336
- },
337
- "contact_email": "medium@demo.com",
338
- "evaluation_timestamp": "2025-09-05T16:19:41.868409",
339
- "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
340
- }
341
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
initial_sage_results.json.backup_20250906_233650 DELETED
@@ -1,362 +0,0 @@
1
- [
2
- {
3
- "model_name": "Test_Organization",
4
- "organization": "Test_Organization",
5
- "tokens": "User Submission",
6
- "accuracy": 100.0,
7
- "mg_pass_2": 100.0,
8
- "mg_pass_4": 100.0,
9
- "submitted_time": "2025-09-06",
10
- "results": {
11
- "sage_mathematics": 100.0,
12
- "sage_physics": 0.0,
13
- "sage_chemistry": 100.0,
14
- "sage_biology": 0.0,
15
- "sage_earth_science": 0.0,
16
- "sage_astronomy": 0.0,
17
- "sage_overall": 100.0
18
- },
19
- "contact_email": "test@example.com",
20
- "evaluation_timestamp": "2025-09-06T23:32:03.654608",
21
- "result_file": "results/result_Test_Organization_20250906_233407.json"
22
- },
23
- {
24
- "model_name": "Test_Organization",
25
- "organization": "Test_Organization",
26
- "tokens": "User Submission",
27
- "accuracy": 100.0,
28
- "mg_pass_2": 100.0,
29
- "mg_pass_4": 100.0,
30
- "submitted_time": "2025-09-06",
31
- "results": {
32
- "sage_mathematics": 100.0,
33
- "sage_physics": 0.0,
34
- "sage_chemistry": 100.0,
35
- "sage_biology": 0.0,
36
- "sage_earth_science": 0.0,
37
- "sage_astronomy": 0.0,
38
- "sage_overall": 100.0
39
- },
40
- "contact_email": "test@example.com",
41
- "evaluation_timestamp": "2025-09-06T23:29:34.267864",
42
- "result_file": "results/result_Test_Organization_20250906_233132.json"
43
- },
44
- {
45
- "model_name": "Unknown",
46
- "organization": "Unknown",
47
- "tokens": "User Submission",
48
- "accuracy": 100.0,
49
- "mg_pass_2": 100.0,
50
- "mg_pass_4": 100.0,
51
- "submitted_time": "2025-09-05",
52
- "results": {
53
- "sage_mathematics": 100.0,
54
- "sage_physics": 100.0,
55
- "sage_chemistry": 100.0,
56
- "sage_biology": 100.0,
57
- "sage_earth_science": 0.0,
58
- "sage_astronomy": 0.0,
59
- "sage_overall": 100.0
60
- },
61
- "contact_email": "",
62
- "evaluation_timestamp": "2025-09-05T16:14:32.476871",
63
- "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
64
- },
65
- {
66
- "model_name": "OpenAI GPT-5-High",
67
- "organization": "OpenAI",
68
- "tokens": "64k",
69
- "accuracy": 45.2,
70
- "mg_pass_2": 36.6,
71
- "mg_pass_4": 35.1,
72
- "submitted_time": "2024-01-15",
73
- "results": {
74
- "sage_overall": 45.2,
75
- "sage_math": 48.5,
76
- "sage_physics": 44.1,
77
- "sage_chemistry": 42.8,
78
- "sage_biology": 46.3,
79
- "sage_earth_science": 43.7,
80
- "sage_astronomy": 45.8
81
- }
82
- },
83
- {
84
- "model_name": "Gemini-2.5-Pro",
85
- "organization": "Google",
86
- "tokens": "64k",
87
- "accuracy": 40.5,
88
- "mg_pass_2": 31.2,
89
- "mg_pass_4": 29.7,
90
- "submitted_time": "2024-01-14",
91
- "results": {
92
- "sage_overall": 40.5,
93
- "sage_math": 43.2,
94
- "sage_physics": 39.8,
95
- "sage_chemistry": 38.1,
96
- "sage_biology": 41.7,
97
- "sage_earth_science": 39.4,
98
- "sage_astronomy": 40.8
99
- }
100
- },
101
- {
102
- "model_name": "OpenAI o3-High",
103
- "organization": "OpenAI",
104
- "tokens": "64k",
105
- "accuracy": 39.6,
106
- "mg_pass_2": 26.0,
107
- "mg_pass_4": 27.3,
108
- "submitted_time": "2024-01-13",
109
- "results": {
110
- "sage_overall": 39.6,
111
- "sage_math": 42.1,
112
- "sage_physics": 38.5,
113
- "sage_chemistry": 37.2,
114
- "sage_biology": 40.8,
115
- "sage_earth_science": 38.1,
116
- "sage_astronomy": 40.9
117
- }
118
- },
119
- {
120
- "model_name": "Gemini-2.5-Pro",
121
- "organization": "Google",
122
- "tokens": "32k",
123
- "accuracy": 39.1,
124
- "mg_pass_2": 29.4,
125
- "mg_pass_4": 27.5,
126
- "submitted_time": "2024-01-12",
127
- "results": {
128
- "sage_overall": 39.1,
129
- "sage_math": 41.8,
130
- "sage_physics": 38.2,
131
- "sage_chemistry": 36.9,
132
- "sage_biology": 40.3,
133
- "sage_earth_science": 37.7,
134
- "sage_astronomy": 39.7
135
- }
136
- },
137
- {
138
- "model_name": "OpenAI o3-High",
139
- "organization": "OpenAI",
140
- "tokens": "32k",
141
- "accuracy": 38.5,
142
- "mg_pass_2": 26.4,
143
- "mg_pass_4": 24.2,
144
- "submitted_time": "2024-01-11",
145
- "results": {
146
- "sage_overall": 38.5,
147
- "sage_math": 41.2,
148
- "sage_physics": 37.8,
149
- "sage_chemistry": 36.1,
150
- "sage_biology": 39.9,
151
- "sage_earth_science": 37.3,
152
- "sage_astronomy": 38.7
153
- }
154
- },
155
- {
156
- "model_name": "Grok-4",
157
- "organization": "xAI",
158
- "tokens": "32k",
159
- "accuracy": 35.0,
160
- "mg_pass_2": 26.0,
161
- "mg_pass_4": 24.1,
162
- "submitted_time": "2024-01-10",
163
- "results": {
164
- "sage_overall": 35.0,
165
- "sage_math": 37.5,
166
- "sage_physics": 34.2,
167
- "sage_chemistry": 33.1,
168
- "sage_biology": 36.1,
169
- "sage_earth_science": 34.8,
170
- "sage_astronomy": 34.3
171
- }
172
- },
173
- {
174
- "model_name": "Qwen3-235B-A22B-2507",
175
- "organization": "Alibaba",
176
- "tokens": "32k",
177
- "accuracy": 27.8,
178
- "mg_pass_2": 19.8,
179
- "mg_pass_4": 18.1,
180
- "submitted_time": "2024-01-09",
181
- "results": {
182
- "sage_overall": 27.8,
183
- "sage_math": 29.8,
184
- "sage_physics": 27.1,
185
- "sage_chemistry": 26.5,
186
- "sage_biology": 28.4,
187
- "sage_earth_science": 27.9,
188
- "sage_astronomy": 27.1
189
- }
190
- },
191
- {
192
- "model_name": "Doubao-Seed-1.6-thinking",
193
- "organization": "ByteDance",
194
- "tokens": "32k",
195
- "accuracy": 27.7,
196
- "mg_pass_2": 18.4,
197
- "mg_pass_4": 16.8,
198
- "submitted_time": "2024-01-08",
199
- "results": {
200
- "sage_overall": 27.7,
201
- "sage_math": 29.6,
202
- "sage_physics": 27.0,
203
- "sage_chemistry": 26.3,
204
- "sage_biology": 28.2,
205
- "sage_earth_science": 27.7,
206
- "sage_astronomy": 27.4
207
- }
208
- },
209
- {
210
- "model_name": "DeepSeek-V3.1",
211
- "organization": "DeepSeek",
212
- "tokens": "64k",
213
- "accuracy": 27.7,
214
- "mg_pass_2": 18.3,
215
- "mg_pass_4": 16.5,
216
- "submitted_time": "2024-01-07",
217
- "results": {
218
- "sage_overall": 27.7,
219
- "sage_math": 29.5,
220
- "sage_physics": 26.9,
221
- "sage_chemistry": 26.2,
222
- "sage_biology": 28.1,
223
- "sage_earth_science": 27.6,
224
- "sage_astronomy": 27.9
225
- }
226
- },
227
- {
228
- "model_name": "DeepSeek-R1-0528",
229
- "organization": "DeepSeek",
230
- "tokens": "32k",
231
- "accuracy": 26.1,
232
- "mg_pass_2": 16.0,
233
- "mg_pass_4": 14.1,
234
- "submitted_time": "2024-01-06",
235
- "results": {
236
- "sage_overall": 26.1,
237
- "sage_math": 28.0,
238
- "sage_physics": 25.4,
239
- "sage_chemistry": 24.8,
240
- "sage_biology": 26.7,
241
- "sage_earth_science": 26.2,
242
- "sage_astronomy": 25.5
243
- }
244
- },
245
- {
246
- "model_name": "OpenAI o4-mini",
247
- "organization": "OpenAI",
248
- "tokens": "32k",
249
- "accuracy": 23.5,
250
- "mg_pass_2": 13.7,
251
- "mg_pass_4": 11.9,
252
- "submitted_time": "2024-01-05",
253
- "results": {
254
- "sage_overall": 23.5,
255
- "sage_math": 25.2,
256
- "sage_physics": 22.8,
257
- "sage_chemistry": 22.1,
258
- "sage_biology": 24.1,
259
- "sage_earth_science": 23.6,
260
- "sage_astronomy": 23.2
261
- }
262
- },
263
- {
264
- "model_name": "Qwen3-235B-A22B",
265
- "organization": "Alibaba",
266
- "tokens": "32k",
267
- "accuracy": 20.1,
268
- "mg_pass_2": 11.2,
269
- "mg_pass_4": 9.6,
270
- "submitted_time": "2024-01-04",
271
- "results": {
272
- "sage_overall": 20.1,
273
- "sage_math": 21.5,
274
- "sage_physics": 19.5,
275
- "sage_chemistry": 19.2,
276
- "sage_biology": 20.7,
277
- "sage_earth_science": 20.3,
278
- "sage_astronomy": 19.4
279
- }
280
- },
281
- {
282
- "model_name": "GLM-4.5-Thinking",
283
- "organization": "Zhipu AI",
284
- "tokens": "64k",
285
- "accuracy": 9.3,
286
- "mg_pass_2": 4.7,
287
- "mg_pass_4": 4.0,
288
- "submitted_time": "2024-01-03",
289
- "results": {
290
- "sage_overall": 9.3,
291
- "sage_math": 10.1,
292
- "sage_physics": 9.0,
293
- "sage_chemistry": 8.7,
294
- "sage_biology": 9.6,
295
- "sage_earth_science": 9.2,
296
- "sage_astronomy": 9.2
297
- }
298
- },
299
- {
300
- "model_name": "QuickDemo_TestOrg",
301
- "organization": "QuickDemo_TestOrg",
302
- "tokens": "User Submission (Simulated)",
303
- "accuracy": 0.619,
304
- "mg_pass_2": 0.619,
305
- "mg_pass_4": 0.619,
306
- "submitted_time": "2025-09-05",
307
- "results": {
308
- "sage_mathematics": 0.877,
309
- "sage_physics": 0.895,
310
- "sage_chemistry": 0.756,
311
- "sage_biology": 0.316,
312
- "sage_earth_science": 0.312,
313
- "sage_astronomy": 0.56,
314
- "sage_overall": 0.619
315
- },
316
- "contact_email": "test@demo.com",
317
- "evaluation_timestamp": "2025-09-05T16:19:39.864071",
318
- "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
319
- },
320
- {
321
- "model_name": "QuickDemo_HighAccuracy",
322
- "organization": "QuickDemo_HighAccuracy",
323
- "tokens": "User Submission (Simulated)",
324
- "accuracy": 0.598,
325
- "mg_pass_2": 0.598,
326
- "mg_pass_4": 0.598,
327
- "submitted_time": "2025-09-05",
328
- "results": {
329
- "sage_mathematics": 0.88,
330
- "sage_physics": 0.331,
331
- "sage_chemistry": 0.646,
332
- "sage_biology": 0.501,
333
- "sage_earth_science": 0.818,
334
- "sage_astronomy": 0.415,
335
- "sage_overall": 0.598
336
- },
337
- "contact_email": "high@demo.com",
338
- "evaluation_timestamp": "2025-09-05T16:19:43.874748",
339
- "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
340
- },
341
- {
342
- "model_name": "QuickDemo_MediumAccuracy",
343
- "organization": "QuickDemo_MediumAccuracy",
344
- "tokens": "User Submission (Simulated)",
345
- "accuracy": 0.516,
346
- "mg_pass_2": 0.516,
347
- "mg_pass_4": 0.516,
348
- "submitted_time": "2025-09-05",
349
- "results": {
350
- "sage_mathematics": 0.474,
351
- "sage_physics": 0.518,
352
- "sage_chemistry": 0.674,
353
- "sage_biology": 0.638,
354
- "sage_earth_science": 0.318,
355
- "sage_astronomy": 0.473,
356
- "sage_overall": 0.516
357
- },
358
- "contact_email": "medium@demo.com",
359
- "evaluation_timestamp": "2025-09-05T16:19:41.868409",
360
- "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
361
- }
362
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
initial_sage_results.json.backup_20250907_010121 DELETED
@@ -1,383 +0,0 @@
1
- [
2
- {
3
- "model_name": "E2E_Test_Org",
4
- "organization": "E2E_Test_Org",
5
- "tokens": "User Submission",
6
- "accuracy": 100.0,
7
- "mg_pass_2": 100.0,
8
- "mg_pass_4": 100.0,
9
- "submitted_time": "2025-09-06",
10
- "results": {
11
- "sage_mathematics": 100.0,
12
- "sage_physics": 0.0,
13
- "sage_chemistry": 100.0,
14
- "sage_biology": 0.0,
15
- "sage_earth_science": 0.0,
16
- "sage_astronomy": 0.0,
17
- "sage_overall": 100.0
18
- },
19
- "contact_email": "e2e@test.com",
20
- "evaluation_timestamp": "2025-09-06T23:34:38.252926",
21
- "result_file": "results/result_E2E_Test_Org_20250906_233650.json"
22
- },
23
- {
24
- "model_name": "Test_Organization",
25
- "organization": "Test_Organization",
26
- "tokens": "User Submission",
27
- "accuracy": 100.0,
28
- "mg_pass_2": 100.0,
29
- "mg_pass_4": 100.0,
30
- "submitted_time": "2025-09-06",
31
- "results": {
32
- "sage_mathematics": 100.0,
33
- "sage_physics": 0.0,
34
- "sage_chemistry": 100.0,
35
- "sage_biology": 0.0,
36
- "sage_earth_science": 0.0,
37
- "sage_astronomy": 0.0,
38
- "sage_overall": 100.0
39
- },
40
- "contact_email": "test@example.com",
41
- "evaluation_timestamp": "2025-09-06T23:32:03.654608",
42
- "result_file": "results/result_Test_Organization_20250906_233407.json"
43
- },
44
- {
45
- "model_name": "Test_Organization",
46
- "organization": "Test_Organization",
47
- "tokens": "User Submission",
48
- "accuracy": 100.0,
49
- "mg_pass_2": 100.0,
50
- "mg_pass_4": 100.0,
51
- "submitted_time": "2025-09-06",
52
- "results": {
53
- "sage_mathematics": 100.0,
54
- "sage_physics": 0.0,
55
- "sage_chemistry": 100.0,
56
- "sage_biology": 0.0,
57
- "sage_earth_science": 0.0,
58
- "sage_astronomy": 0.0,
59
- "sage_overall": 100.0
60
- },
61
- "contact_email": "test@example.com",
62
- "evaluation_timestamp": "2025-09-06T23:29:34.267864",
63
- "result_file": "results/result_Test_Organization_20250906_233132.json"
64
- },
65
- {
66
- "model_name": "Unknown",
67
- "organization": "Unknown",
68
- "tokens": "User Submission",
69
- "accuracy": 100.0,
70
- "mg_pass_2": 100.0,
71
- "mg_pass_4": 100.0,
72
- "submitted_time": "2025-09-05",
73
- "results": {
74
- "sage_mathematics": 100.0,
75
- "sage_physics": 100.0,
76
- "sage_chemistry": 100.0,
77
- "sage_biology": 100.0,
78
- "sage_earth_science": 0.0,
79
- "sage_astronomy": 0.0,
80
- "sage_overall": 100.0
81
- },
82
- "contact_email": "",
83
- "evaluation_timestamp": "2025-09-05T16:14:32.476871",
84
- "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
85
- },
86
- {
87
- "model_name": "OpenAI GPT-5-High",
88
- "organization": "OpenAI",
89
- "tokens": "64k",
90
- "accuracy": 45.2,
91
- "mg_pass_2": 36.6,
92
- "mg_pass_4": 35.1,
93
- "submitted_time": "2024-01-15",
94
- "results": {
95
- "sage_overall": 45.2,
96
- "sage_math": 48.5,
97
- "sage_physics": 44.1,
98
- "sage_chemistry": 42.8,
99
- "sage_biology": 46.3,
100
- "sage_earth_science": 43.7,
101
- "sage_astronomy": 45.8
102
- }
103
- },
104
- {
105
- "model_name": "Gemini-2.5-Pro",
106
- "organization": "Google",
107
- "tokens": "64k",
108
- "accuracy": 40.5,
109
- "mg_pass_2": 31.2,
110
- "mg_pass_4": 29.7,
111
- "submitted_time": "2024-01-14",
112
- "results": {
113
- "sage_overall": 40.5,
114
- "sage_math": 43.2,
115
- "sage_physics": 39.8,
116
- "sage_chemistry": 38.1,
117
- "sage_biology": 41.7,
118
- "sage_earth_science": 39.4,
119
- "sage_astronomy": 40.8
120
- }
121
- },
122
- {
123
- "model_name": "OpenAI o3-High",
124
- "organization": "OpenAI",
125
- "tokens": "64k",
126
- "accuracy": 39.6,
127
- "mg_pass_2": 26.0,
128
- "mg_pass_4": 27.3,
129
- "submitted_time": "2024-01-13",
130
- "results": {
131
- "sage_overall": 39.6,
132
- "sage_math": 42.1,
133
- "sage_physics": 38.5,
134
- "sage_chemistry": 37.2,
135
- "sage_biology": 40.8,
136
- "sage_earth_science": 38.1,
137
- "sage_astronomy": 40.9
138
- }
139
- },
140
- {
141
- "model_name": "Gemini-2.5-Pro",
142
- "organization": "Google",
143
- "tokens": "32k",
144
- "accuracy": 39.1,
145
- "mg_pass_2": 29.4,
146
- "mg_pass_4": 27.5,
147
- "submitted_time": "2024-01-12",
148
- "results": {
149
- "sage_overall": 39.1,
150
- "sage_math": 41.8,
151
- "sage_physics": 38.2,
152
- "sage_chemistry": 36.9,
153
- "sage_biology": 40.3,
154
- "sage_earth_science": 37.7,
155
- "sage_astronomy": 39.7
156
- }
157
- },
158
- {
159
- "model_name": "OpenAI o3-High",
160
- "organization": "OpenAI",
161
- "tokens": "32k",
162
- "accuracy": 38.5,
163
- "mg_pass_2": 26.4,
164
- "mg_pass_4": 24.2,
165
- "submitted_time": "2024-01-11",
166
- "results": {
167
- "sage_overall": 38.5,
168
- "sage_math": 41.2,
169
- "sage_physics": 37.8,
170
- "sage_chemistry": 36.1,
171
- "sage_biology": 39.9,
172
- "sage_earth_science": 37.3,
173
- "sage_astronomy": 38.7
174
- }
175
- },
176
- {
177
- "model_name": "Grok-4",
178
- "organization": "xAI",
179
- "tokens": "32k",
180
- "accuracy": 35.0,
181
- "mg_pass_2": 26.0,
182
- "mg_pass_4": 24.1,
183
- "submitted_time": "2024-01-10",
184
- "results": {
185
- "sage_overall": 35.0,
186
- "sage_math": 37.5,
187
- "sage_physics": 34.2,
188
- "sage_chemistry": 33.1,
189
- "sage_biology": 36.1,
190
- "sage_earth_science": 34.8,
191
- "sage_astronomy": 34.3
192
- }
193
- },
194
- {
195
- "model_name": "Qwen3-235B-A22B-2507",
196
- "organization": "Alibaba",
197
- "tokens": "32k",
198
- "accuracy": 27.8,
199
- "mg_pass_2": 19.8,
200
- "mg_pass_4": 18.1,
201
- "submitted_time": "2024-01-09",
202
- "results": {
203
- "sage_overall": 27.8,
204
- "sage_math": 29.8,
205
- "sage_physics": 27.1,
206
- "sage_chemistry": 26.5,
207
- "sage_biology": 28.4,
208
- "sage_earth_science": 27.9,
209
- "sage_astronomy": 27.1
210
- }
211
- },
212
- {
213
- "model_name": "Doubao-Seed-1.6-thinking",
214
- "organization": "ByteDance",
215
- "tokens": "32k",
216
- "accuracy": 27.7,
217
- "mg_pass_2": 18.4,
218
- "mg_pass_4": 16.8,
219
- "submitted_time": "2024-01-08",
220
- "results": {
221
- "sage_overall": 27.7,
222
- "sage_math": 29.6,
223
- "sage_physics": 27.0,
224
- "sage_chemistry": 26.3,
225
- "sage_biology": 28.2,
226
- "sage_earth_science": 27.7,
227
- "sage_astronomy": 27.4
228
- }
229
- },
230
- {
231
- "model_name": "DeepSeek-V3.1",
232
- "organization": "DeepSeek",
233
- "tokens": "64k",
234
- "accuracy": 27.7,
235
- "mg_pass_2": 18.3,
236
- "mg_pass_4": 16.5,
237
- "submitted_time": "2024-01-07",
238
- "results": {
239
- "sage_overall": 27.7,
240
- "sage_math": 29.5,
241
- "sage_physics": 26.9,
242
- "sage_chemistry": 26.2,
243
- "sage_biology": 28.1,
244
- "sage_earth_science": 27.6,
245
- "sage_astronomy": 27.9
246
- }
247
- },
248
- {
249
- "model_name": "DeepSeek-R1-0528",
250
- "organization": "DeepSeek",
251
- "tokens": "32k",
252
- "accuracy": 26.1,
253
- "mg_pass_2": 16.0,
254
- "mg_pass_4": 14.1,
255
- "submitted_time": "2024-01-06",
256
- "results": {
257
- "sage_overall": 26.1,
258
- "sage_math": 28.0,
259
- "sage_physics": 25.4,
260
- "sage_chemistry": 24.8,
261
- "sage_biology": 26.7,
262
- "sage_earth_science": 26.2,
263
- "sage_astronomy": 25.5
264
- }
265
- },
266
- {
267
- "model_name": "OpenAI o4-mini",
268
- "organization": "OpenAI",
269
- "tokens": "32k",
270
- "accuracy": 23.5,
271
- "mg_pass_2": 13.7,
272
- "mg_pass_4": 11.9,
273
- "submitted_time": "2024-01-05",
274
- "results": {
275
- "sage_overall": 23.5,
276
- "sage_math": 25.2,
277
- "sage_physics": 22.8,
278
- "sage_chemistry": 22.1,
279
- "sage_biology": 24.1,
280
- "sage_earth_science": 23.6,
281
- "sage_astronomy": 23.2
282
- }
283
- },
284
- {
285
- "model_name": "Qwen3-235B-A22B",
286
- "organization": "Alibaba",
287
- "tokens": "32k",
288
- "accuracy": 20.1,
289
- "mg_pass_2": 11.2,
290
- "mg_pass_4": 9.6,
291
- "submitted_time": "2024-01-04",
292
- "results": {
293
- "sage_overall": 20.1,
294
- "sage_math": 21.5,
295
- "sage_physics": 19.5,
296
- "sage_chemistry": 19.2,
297
- "sage_biology": 20.7,
298
- "sage_earth_science": 20.3,
299
- "sage_astronomy": 19.4
300
- }
301
- },
302
- {
303
- "model_name": "GLM-4.5-Thinking",
304
- "organization": "Zhipu AI",
305
- "tokens": "64k",
306
- "accuracy": 9.3,
307
- "mg_pass_2": 4.7,
308
- "mg_pass_4": 4.0,
309
- "submitted_time": "2024-01-03",
310
- "results": {
311
- "sage_overall": 9.3,
312
- "sage_math": 10.1,
313
- "sage_physics": 9.0,
314
- "sage_chemistry": 8.7,
315
- "sage_biology": 9.6,
316
- "sage_earth_science": 9.2,
317
- "sage_astronomy": 9.2
318
- }
319
- },
320
- {
321
- "model_name": "QuickDemo_TestOrg",
322
- "organization": "QuickDemo_TestOrg",
323
- "tokens": "User Submission (Simulated)",
324
- "accuracy": 0.619,
325
- "mg_pass_2": 0.619,
326
- "mg_pass_4": 0.619,
327
- "submitted_time": "2025-09-05",
328
- "results": {
329
- "sage_mathematics": 0.877,
330
- "sage_physics": 0.895,
331
- "sage_chemistry": 0.756,
332
- "sage_biology": 0.316,
333
- "sage_earth_science": 0.312,
334
- "sage_astronomy": 0.56,
335
- "sage_overall": 0.619
336
- },
337
- "contact_email": "test@demo.com",
338
- "evaluation_timestamp": "2025-09-05T16:19:39.864071",
339
- "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
340
- },
341
- {
342
- "model_name": "QuickDemo_HighAccuracy",
343
- "organization": "QuickDemo_HighAccuracy",
344
- "tokens": "User Submission (Simulated)",
345
- "accuracy": 0.598,
346
- "mg_pass_2": 0.598,
347
- "mg_pass_4": 0.598,
348
- "submitted_time": "2025-09-05",
349
- "results": {
350
- "sage_mathematics": 0.88,
351
- "sage_physics": 0.331,
352
- "sage_chemistry": 0.646,
353
- "sage_biology": 0.501,
354
- "sage_earth_science": 0.818,
355
- "sage_astronomy": 0.415,
356
- "sage_overall": 0.598
357
- },
358
- "contact_email": "high@demo.com",
359
- "evaluation_timestamp": "2025-09-05T16:19:43.874748",
360
- "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
361
- },
362
- {
363
- "model_name": "QuickDemo_MediumAccuracy",
364
- "organization": "QuickDemo_MediumAccuracy",
365
- "tokens": "User Submission (Simulated)",
366
- "accuracy": 0.516,
367
- "mg_pass_2": 0.516,
368
- "mg_pass_4": 0.516,
369
- "submitted_time": "2025-09-05",
370
- "results": {
371
- "sage_mathematics": 0.474,
372
- "sage_physics": 0.518,
373
- "sage_chemistry": 0.674,
374
- "sage_biology": 0.638,
375
- "sage_earth_science": 0.318,
376
- "sage_astronomy": 0.473,
377
- "sage_overall": 0.516
378
- },
379
- "contact_email": "medium@demo.com",
380
- "evaluation_timestamp": "2025-09-05T16:19:41.868409",
381
- "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
382
- }
383
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/oss/__init__.py CHANGED
@@ -1 +1,9 @@
1
- # OSS module for SAGE-Bench
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OSS模块 - 处理阿里云OSS相关功能
3
+ """
4
+
5
+ from .oss_file_manager import OSSFileManager
6
+ from .oss_leaderboard_manager import OSSLeaderboardManager
7
+ from .oss_submission_handler import OSSSubmissionHandler
8
+
9
+ __all__ = ["OSSFileManager", "OSSLeaderboardManager", "OSSSubmissionHandler"]
src/oss/oss_file_manager.py CHANGED
@@ -182,6 +182,62 @@ class OSSFileManager:
182
  logger.error(f"检查文件存在性失败: {oss_file_path}, 错误: {e}")
183
  return False
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def list_latest_files_by_date(
186
  self,
187
  object_dir: str = "",
 
182
  logger.error(f"检查文件存在性失败: {oss_file_path}, 错误: {e}")
183
  return False
184
 
185
+ def download_file_content(self, oss_file_path: str) -> Optional[bytes]:
186
+ """
187
+ 下载OSS文件内容到内存
188
+
189
+ Args:
190
+ oss_file_path: OSS文件路径
191
+
192
+ Returns:
193
+ 文件内容(字节)或None
194
+ """
195
+ try:
196
+ result = self.bucket.get_object(oss_file_path)
197
+ content = result.read()
198
+ logger.info(f"下载文件内容成功: {oss_file_path} ({len(content)} bytes)")
199
+ return content
200
+ except Exception as e:
201
+ logger.error(f"下载文件内容失败: {oss_file_path}, 错误: {e}")
202
+ return None
203
+
204
+ def upload_file(self, local_file_path: str, oss_file_path: str) -> bool:
205
+ """
206
+ 上传本地文件到OSS(别名方法)
207
+
208
+ Args:
209
+ local_file_path: 本地文件路径
210
+ oss_file_path: OSS文件路径
211
+
212
+ Returns:
213
+ 上传是否成功
214
+ """
215
+ return self.upload_file_to_object(local_file_path, oss_file_path, replace=True)
216
+
217
+ def copy_file(self, source_path: str, target_path: str) -> bool:
218
+ """
219
+ 在OSS内部复制文件
220
+
221
+ Args:
222
+ source_path: 源文件路径
223
+ target_path: 目标文件路径
224
+
225
+ Returns:
226
+ 复制是否成功
227
+ """
228
+ try:
229
+ # 使用copy_object进行OSS内部复制
230
+ self.bucket.copy_object(
231
+ self.bucket_name, # 源bucket
232
+ source_path, # 源文件路径
233
+ target_path # 目标文件路径
234
+ )
235
+ logger.info(f"文件复制成功: {source_path} -> {target_path}")
236
+ return True
237
+ except Exception as e:
238
+ logger.error(f"文件复制失败: {source_path} -> {target_path}, 错误: {e}")
239
+ return False
240
+
241
  def list_latest_files_by_date(
242
  self,
243
  object_dir: str = "",
src/oss/oss_file_manager_old.py DELETED
@@ -1,517 +0,0 @@
1
- # %%
2
- import datetime as dt
3
- import os
4
- import re
5
- from logging import Logger
6
- from multiprocessing import Pool
7
- from multiprocessing.pool import ThreadPool
8
- from pathlib import Path
9
- from typing import List, Union
10
-
11
- import oss2
12
- from loguru import logger
13
- from oss2.credentials import EnvironmentVariableCredentialsProvider
14
- from tqdm import tqdm
15
-
16
- from compassflow.constants import DATADIR
17
- from compassflow.oss.oss import OssBucket
18
- from compassflow.utils import starstarmap
19
-
20
-
21
class OSSFileManager:
    """Thin helper around an ``oss2`` bucket for listing, downloading and uploading objects."""

    def __init__(
        self,
        oss_access_key_id: str = None,
        oss_access_key_secret: str = None,
        region: str = "http://oss-cn-shanghai.aliyuncs.com",
        bucket_name: str = "opencompass",
        oss_block_name: str = None,
        logger: Logger = logger,
    ) -> None:
        """Bind the manager to one OSS bucket.

        Args:
            oss_access_key_id (str, optional): Access key id. When given together
                with the secret, both are exported into ``os.environ``.
            oss_access_key_secret (str, optional): Access key secret.
            region (str, optional): Endpoint URL passed to ``oss2.Bucket``.
            bucket_name (str, optional): Bucket name passed to ``oss2.Bucket``.
            oss_block_name (str, optional): When not None, the bucket is taken from
                ``OssBucket.load(oss_block_name)`` and every other connection
                argument is ignored.
            logger (Logger, optional): Logger used by all methods.

        Raises:
            ValueError: If no block name is given and either credential
                environment variable is unset.
        """
        self.logger = logger

        if oss_block_name is not None:
            self._bucket = OssBucket.load(oss_block_name)._get_bucket()
            return

        # An Alibaba Cloud root-account AccessKey can call every API, which is
        # high risk; prefer a RAM account created in the RAM console.
        if oss_access_key_id is not None and oss_access_key_secret is not None:
            os.environ["OSS_ACCESS_KEY_ID"] = oss_access_key_id
            os.environ["OSS_ACCESS_KEY_SECRET"] = oss_access_key_secret

        if (
            os.getenv("OSS_ACCESS_KEY_ID") is None
            or os.getenv("OSS_ACCESS_KEY_SECRET") is None
        ):
            raise ValueError("Access Key ID and Access Key Secret cannot be empty.")

        # Credentials are read from the environment variables set/checked above.
        auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
        self._bucket = oss2.Bucket(
            auth=auth,
            endpoint=region,
            bucket_name=bucket_name,
        )

    @staticmethod
    def _oss_key(path) -> str:
        """Render an object key with forward slashes regardless of the host OS."""
        return path.as_posix() if isinstance(path, Path) else str(path)

    @staticmethod
    def _to_midnight(value):
        """Convert a str/date/datetime bound to a datetime at 00:00:00; None passes through."""
        if value is None:
            return None
        if isinstance(value, str):
            value = dt.datetime.strptime(value.replace("-", ""), "%Y%m%d")
        # datetime is a date subclass, so datetimes are truncated to midnight too.
        if isinstance(value, dt.date):
            value = dt.datetime(value.year, value.month, value.day, 0, 0, 0)
        return value

    @staticmethod
    def _validate_mapping(mapping, arg_name: str) -> None:
        """Ensure ``mapping`` is a list of 2-tuples; raise TypeError/ValueError otherwise."""
        if not isinstance(mapping, list):
            raise TypeError(f"{arg_name} must be a list of 2 value tuples.")
        for item in mapping:
            if not isinstance(item, tuple):
                raise TypeError(
                    f"Each item in the {arg_name} list must be a 2 value tuple."
                )
            if len(item) != 2:
                raise ValueError(
                    f"Each tuple in the {arg_name} list must be length 2."
                )

    @staticmethod
    def _build_transfer_jobs(mapping, local_base_dir, oss_base_dir, replace):
        """Turn (local, oss) pairs into keyword-argument dicts, prepending the base dirs."""
        jobs = []
        for local_file_path, oss_file_path in mapping:
            if local_base_dir is not None:
                local_file_path = local_base_dir / local_file_path
            if oss_base_dir is not None:
                oss_file_path = oss_base_dir / oss_file_path
            jobs.append(
                dict(
                    oss_file_path=oss_file_path,
                    local_file_path=local_file_path,
                    replace=replace,
                    print_logs=True,
                )
            )
        return jobs

    @staticmethod
    def _run_jobs(fn, jobs, num_threads: int) -> None:
        """Call ``fn(**job)`` for every job, sequentially or on a thread pool."""
        if num_threads == 1:
            for job in jobs:
                fn(**job)
            return

        with ThreadPool(num_threads) as pool:
            # list() drains the iterator so tqdm can show progress.
            list(
                tqdm(
                    starstarmap(pool=pool, fn=fn, kwargs_iter=jobs),
                    total=len(jobs),
                )
            )

    def list_latest_files_by_date(
        self,
        object_dir: str = "",
        delimiter: str = "/",
        start_date: Union[str, dt.date, dt.datetime] = None,
        end_date: Union[str, dt.date, dt.datetime] = None,
        max_num_files: int = 10,
        date_pattern: str = r"([0-9]{4}-[0-9]{2}-[0-9]{2})",
        file_date_format: str = "%Y-%m-%d",
        suffix: str = "",
    ) -> List[Union[str, Path]]:
        """List the most recent objects under a prefix, judged by a date in the name.

        Args:
            object_dir (str, optional): Prefix to list. Defaults to ''.
            delimiter (str, optional): Delimiter passed to ``oss2.ObjectIterator``.
            start_date: Inclusive lower bound; ``YYYY-MM-DD``/``YYYYMMDD`` string,
                date or datetime (truncated to midnight). None disables it.
            end_date: Inclusive upper bound, same accepted forms as ``start_date``.
            max_num_files (int, optional): Keep only this many entries from the
                end of the sorted list; a falsy value keeps all.
            date_pattern (str, optional): Regex whose group 1 is the date text.
            file_date_format (str, optional): ``strptime`` format for group 1.
            suffix (str, optional): Only names ending with this are considered.

        Returns:
            List[str]: Object keys (prefix + name, forward slashes), sorted
            ascending as strings and truncated to the last ``max_num_files``.
        """
        start_date = self._to_midnight(start_date)
        end_date = self._to_midnight(end_date)

        object_iter = oss2.ObjectIterator(
            bucket=self._bucket, prefix=object_dir, delimiter=delimiter
        )

        root_dir = Path(object_dir)
        filenames = []
        for obj in object_iter:
            key = obj.key
            # Strip only the leading prefix; str.replace would also remove
            # later occurrences of the same text inside the name.
            filename = key[len(object_dir):] if key.startswith(object_dir) else key

            if not filename.endswith(suffix):
                continue

            date_search = re.search(date_pattern, filename)
            if not date_search:
                self.logger.warning(
                    f"date pattern doesn't match, skipping file {filename}"
                )
                continue
            file_date = dt.datetime.strptime(date_search.group(1), file_date_format)

            if start_date is not None and start_date > file_date:
                continue
            if end_date is not None and end_date < file_date:
                continue

            # as_posix keeps the key slash-separated even on Windows hosts.
            filenames.append((root_dir / filename).as_posix())

        filenames = sorted(filenames)

        max_num_files = max_num_files or len(filenames)
        filenames = filenames[-max_num_files:]

        self.logger.info(f"{filenames=}")

        return filenames

    def download_object_to_file(
        self,
        oss_file_path: str | Path,
        local_file_path: str | Path,
        replace: bool = False,
        make_dir: bool = False,
        print_logs: bool = True,
    ):
        """Download a single OSS object to a local file.

        Args:
            oss_file_path (str | Path): Key of the object to download.
            local_file_path (str | Path): Destination path on disk.
            replace (bool, optional): When False an existing local file is left
                untouched and nothing is downloaded. Defaults to False.
            make_dir (bool, optional): Create the destination's parent
                directories first. Defaults to False.
            print_logs (bool, optional): Log the skip/replace decision.
        """
        if isinstance(local_file_path, str):
            local_file_path = Path(local_file_path)

        already_present = local_file_path.exists()
        if already_present and not replace:
            if print_logs:
                self.logger.info(f"{local_file_path} already exists, skipping file...")
            return

        if already_present and print_logs:
            self.logger.info(f"{local_file_path} already exists, replacing file...")

        if make_dir:
            os.makedirs(local_file_path.parent, exist_ok=True)

        self._bucket.get_object_to_file(
            key=self._oss_key(oss_file_path),
            filename=local_file_path,
        )

    def download_objects_to_files(
        self,
        file_download_mapping: list[tuple[str | Path, str | Path]],
        oss_base_dir: str = None,
        local_base_dir: str = None,
        replace: bool = True,
        num_threads: int = 1,
        **kwargs,
    ) -> None:
        """Download several objects from OSS to local storage.

        Args:
            file_download_mapping: List of 2-tuples. Each tuple is unpacked as
                ``(local_file_path, oss_file_path)``.
                NOTE(review): earlier documentation described the pair as
                OSS path -> local path; the order used by the code is kept
                unchanged here, confirm against callers.
            oss_base_dir (str, optional): Prefix prepended to every OSS path.
            local_base_dir (str, optional): Directory prepended to every local path.
            replace (bool, optional): Overwrite existing local files. Defaults to True.
            num_threads (int, optional): 1 runs sequentially, anything else
                uses a thread pool of that size.
            **kwargs: Accepted for backward compatibility and ignored.

        Raises:
            TypeError: If the mapping is not a list or an item is not a tuple.
            ValueError: If a tuple does not have exactly two elements.
        """
        if isinstance(oss_base_dir, str):
            oss_base_dir = Path(oss_base_dir)
        if isinstance(local_base_dir, str):
            local_base_dir = Path(local_base_dir)

        self._validate_mapping(file_download_mapping, "file_download_mapping")

        jobs = self._build_transfer_jobs(
            file_download_mapping, local_base_dir, oss_base_dir, replace
        )
        self._run_jobs(self.download_object_to_file, jobs, num_threads)

    def download_latest_objects_to_dir(
        self,
        oss_object_dir: str,
        local_dir: str,
        start_date: Union[str, dt.date, dt.datetime] = None,
        end_date: Union[str, dt.date, dt.datetime] = None,
        date_pattern: str = r"([0-9]{4}-[0-9]{2}-[0-9]{2})",
        file_date_format: str = "%Y-%m-%d",
        max_num_files: int = 5,
        replace: bool = True,
        make_dir: bool = True,
        delimiter: str = "/",
        num_threads: int = 1,
        suffix: str = "",
        **kwargs,
    ) -> None:
        """Download the latest objects under an OSS prefix into a local directory.

        The objects are selected with ``list_latest_files_by_date`` and each one is
        written to ``local_dir`` under its base name.

        Args:
            oss_object_dir (str): Prefix to list.
            local_dir (str): Destination directory.
            start_date, end_date, date_pattern, file_date_format, max_num_files,
            delimiter, suffix: Forwarded to ``list_latest_files_by_date``.
            replace (bool, optional): Overwrite existing local files. Defaults to True.
            make_dir (bool, optional): Create missing directories. Defaults to True.
            num_threads (int, optional): 1 runs sequentially, anything else
                uses a thread pool of that size.
            **kwargs: Forwarded verbatim to ``list_latest_files_by_date``.
        """
        if isinstance(local_dir, str):
            local_dir = Path(local_dir)

        oss_file_list = self.list_latest_files_by_date(
            object_dir=oss_object_dir,
            delimiter=delimiter,
            start_date=start_date,
            end_date=end_date,
            date_pattern=date_pattern,
            file_date_format=file_date_format,
            max_num_files=max_num_files,
            suffix=suffix,
            **kwargs,
        )

        jobs = [
            dict(
                oss_file_path=str(oss_file_path),
                local_file_path=local_dir / Path(oss_file_path).name,
                replace=replace,
                make_dir=make_dir,
                print_logs=True,
            )
            for oss_file_path in oss_file_list
        ]
        self._run_jobs(self.download_object_to_file, jobs, num_threads)

    def upload_file_to_object(
        self,
        local_file_path: str,
        oss_file_path: str | Path,
        replace: bool = False,
        print_logs: bool = True,
    ):
        """Upload a single local file to OSS.

        Args:
            local_file_path (str): File to upload.
            oss_file_path (str | Path): Destination key.
            replace (bool, optional): When False an existing object is left
                untouched and nothing is uploaded. Defaults to False.
            print_logs (bool, optional): Log the skip/replace decision.
        """
        if isinstance(local_file_path, Path):
            local_file_path = str(local_file_path)

        oss_key = self._oss_key(oss_file_path)

        if self._bucket.object_exists(key=oss_key):
            if not replace:
                if print_logs:
                    self.logger.info(f"{oss_key} already exists, skipping file...")
                return
            if print_logs:
                self.logger.info(f"{oss_key} already exists, replacing file...")

        self._bucket.put_object_from_file(
            key=oss_key,
            filename=local_file_path,
        )

    def upload_files_to_objects(
        self,
        file_upload_mapping: list[tuple[str | Path, str | Path]],
        local_base_dir: str = None,
        oss_base_dir: str = None,
        replace: bool = True,
        num_threads: int = 1,
        **kwargs,
    ) -> None:
        """Upload several local files to OSS.

        Args:
            file_upload_mapping: List of ``(local_file_path, oss_file_path)`` tuples.
            local_base_dir (str, optional): Directory prepended to every local path.
            oss_base_dir (str, optional): Prefix prepended to every OSS path.
            replace (bool, optional): Overwrite existing objects. Defaults to True.
            num_threads (int, optional): 1 runs sequentially, anything else
                uses a thread pool of that size.
            **kwargs: Accepted for backward compatibility and ignored.

        Raises:
            TypeError: If the mapping is not a list or an item is not a tuple.
            ValueError: If a tuple does not have exactly two elements.
        """
        if isinstance(oss_base_dir, str):
            oss_base_dir = Path(oss_base_dir)
        if isinstance(local_base_dir, str):
            local_base_dir = Path(local_base_dir)

        self._validate_mapping(file_upload_mapping, "file_upload_mapping")

        jobs = self._build_transfer_jobs(
            file_upload_mapping, local_base_dir, oss_base_dir, replace
        )
        self._run_jobs(self.upload_file_to_object, jobs, num_threads)

    @property
    def bucket(self) -> oss2.Bucket:
        """The underlying ``oss2.Bucket`` this manager operates on."""
        return self._bucket
507
-
508
-
509
- # %%
510
if __name__ == "__main__":
    # %% Initialize (credentials come from the environment)
    _manager = OSSFileManager(logger=logger)

    # %% List the five latest dated objects under the conversations prefix
    _manager.list_latest_files_by_date(
        object_dir="compass-arena/dev/data/conversations/",
        max_num_files=5,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/oss/oss_file_manager_simple.py DELETED
@@ -1,254 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- 简化的OSS文件管理器 - 专为SAGE-Bench HuggingFace Space设计
4
- 移除了对 compassflow 的依赖,只保留必需的OSS功能
5
- """
6
-
7
- import os
8
- import oss2
9
- import json
10
- from datetime import datetime
11
- from pathlib import Path
12
- from typing import List, Dict, Optional
13
- from loguru import logger
14
-
15
-
16
class OSSFileManager:
    """Minimal OSS file manager: list, download, upload, stat and delete objects."""

    def __init__(
        self,
        oss_access_key_id: str = None,
        oss_access_key_secret: str = None,
        oss_region: str = None,
        oss_bucket_name: str = None
    ):
        """
        Connect to an OSS bucket.

        Every argument falls back to an environment variable when falsy.

        Args:
            oss_access_key_id: Access key id (env ``OSS_ACCESS_KEY_ID``).
            oss_access_key_secret: Access key secret (env ``OSS_ACCESS_KEY_SECRET``).
            oss_region: Endpoint URL (env ``OSS_REGION``, default Shanghai endpoint).
            oss_bucket_name: Bucket name (env ``OSS_BUCKET_NAME``, default ``opencompass``).

        Raises:
            ValueError: If the key id or secret is missing after the fallback.
        """
        self.access_key_id = oss_access_key_id or os.getenv('OSS_ACCESS_KEY_ID')
        self.access_key_secret = oss_access_key_secret or os.getenv('OSS_ACCESS_KEY_SECRET')
        self.region = oss_region or os.getenv('OSS_REGION', 'http://oss-cn-shanghai.aliyuncs.com')
        self.bucket_name = oss_bucket_name or os.getenv('OSS_BUCKET_NAME', 'opencompass')

        if not self.access_key_id or not self.access_key_secret:
            raise ValueError("OSS访问密钥未设置。请设置 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET 环境变量。")

        auth = oss2.Auth(self.access_key_id, self.access_key_secret)
        self.bucket = oss2.Bucket(auth, self.region, self.bucket_name)

        logger.info(f"OSS初始化成功: {self.bucket_name} @ {self.region}")

    @staticmethod
    def _is_older_than(last_modified, after_date: datetime) -> bool:
        """
        Tell whether ``last_modified`` lies strictly before ``after_date``.

        oss2 listings report ``last_modified`` as integer Unix seconds, which
        cannot be ordered against a ``datetime`` directly, so the cutoff is
        converted with ``datetime.timestamp()`` (a naive cutoff is therefore
        interpreted as local time). A ``datetime`` value is compared as-is.
        """
        if isinstance(last_modified, datetime):
            return last_modified < after_date
        return last_modified < after_date.timestamp()

    def list_files(
        self,
        oss_dir: str = "",
        after_date: datetime = None,
        file_extension: str = None
    ) -> List[Dict]:
        """
        List the objects under an OSS directory.

        Args:
            oss_dir: Directory prefix; a trailing ``/`` is appended when missing.
            after_date: When given, objects modified before it are dropped.
            file_extension: When given, only keys ending with it are kept (e.g. ".json").

        Returns:
            One dict per object with ``key``, ``name``, ``size``,
            ``last_modified`` and ``etag``.

        Raises:
            Exception: Any listing error is logged and re-raised.
        """
        try:
            if oss_dir and not oss_dir.endswith('/'):
                oss_dir += '/'

            files = []
            for obj in oss2.ObjectIterator(self.bucket, prefix=oss_dir):
                key = obj.key

                # Keys ending in "/" are directory placeholders, not files.
                if key.endswith('/'):
                    continue
                if file_extension and not key.endswith(file_extension):
                    continue
                if after_date and self._is_older_than(obj.last_modified, after_date):
                    continue

                files.append({
                    'key': key,
                    'name': os.path.basename(key),
                    'size': obj.size,
                    'last_modified': obj.last_modified,
                    'etag': obj.etag
                })

            logger.info(f"找到 {len(files)} 个文件在 {oss_dir}")
            return files

        except Exception as e:
            logger.error(f"列出文件失败: {e}")
            raise

    def download_file(self, oss_file_path: str, local_file_path: str) -> bool:
        """
        Download an OSS object to a local file.

        Args:
            oss_file_path: Key of the object.
            local_file_path: Destination path; missing parent directories are created.

        Returns:
            True on success, False when anything raised (the error is logged).
        """
        try:
            local_dir = os.path.dirname(local_file_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)

            self.bucket.get_object_to_file(oss_file_path, local_file_path)

            logger.info(f"下载成功: {oss_file_path} -> {local_file_path}")
            return True

        except Exception as e:
            logger.error(f"下载文件失败: {oss_file_path} -> {local_file_path}, 错误: {e}")
            return False

    def upload_file_to_object(
        self,
        local_file_path: str,
        oss_file_path: str,
        replace: bool = False
    ) -> bool:
        """
        Upload a local file to OSS.

        Args:
            local_file_path: File to upload.
            oss_file_path: Destination key.
            replace: Overwrite an existing object when True.

        Returns:
            True on success; False when the local file is missing, the object
            already exists and ``replace`` is False, or anything raised.
        """
        try:
            if not os.path.exists(local_file_path):
                logger.error(f"本地文件不存在: {local_file_path}")
                return False

            if not replace and self.bucket.object_exists(oss_file_path):
                logger.warning(f"OSS文件已存在: {oss_file_path}")
                return False

            self.bucket.put_object_from_file(oss_file_path, local_file_path)

            logger.info(f"上传成功: {local_file_path} -> {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"上传文件失败: {local_file_path} -> {oss_file_path}, 错误: {e}")
            return False

    def file_exists(self, oss_file_path: str) -> bool:
        """
        Check whether an object exists.

        Args:
            oss_file_path: Key of the object.

        Returns:
            The bucket's answer, or False when the check itself raised.
        """
        try:
            return self.bucket.object_exists(oss_file_path)
        except Exception as e:
            logger.error(f"检查文件存在性失败: {oss_file_path}, 错误: {e}")
            return False

    def get_file_info(self, oss_file_path: str) -> Optional[Dict]:
        """
        Fetch metadata for an object.

        Args:
            oss_file_path: Key of the object.

        Returns:
            Dict with ``key``, ``name``, ``size``, ``last_modified``, ``etag``
            and ``content_type`` (None when the metadata result does not
            carry one), or None when the object is missing or the call failed.
        """
        try:
            meta = self.bucket.get_object_meta(oss_file_path)

            return {
                'key': oss_file_path,
                'name': os.path.basename(oss_file_path),
                'size': meta.content_length,
                'last_modified': meta.last_modified,
                'etag': meta.etag,
                # The lightweight meta result may not expose a content type;
                # a plain attribute access would turn every call into None.
                'content_type': getattr(meta, 'content_type', None)
            }

        except oss2.exceptions.NoSuchKey:
            logger.warning(f"文件不存在: {oss_file_path}")
            return None
        except Exception as e:
            logger.error(f"获取文件信息失败: {oss_file_path}, 错误: {e}")
            return None

    def delete_file(self, oss_file_path: str) -> bool:
        """
        Delete an object.

        Args:
            oss_file_path: Key of the object.

        Returns:
            True when the delete call returned, False when it raised.
        """
        try:
            self.bucket.delete_object(oss_file_path)
            logger.info(f"删除成功: {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"删除文件失败: {oss_file_path}, 错误: {e}")
            return False
232
-
233
-
234
- # 兼容性别名 - 保持与原始代码的兼容性
235
class SimpleOSSManager(OSSFileManager):
    """Backward-compatible alias: identical to ``OSSFileManager``, kept for older callers."""
238
-
239
-
240
if __name__ == "__main__":
    # Manual smoke test: connect, list submission files, show the first three.
    try:
        _manager = OSSFileManager()
        print("✅ OSS文件管理器初始化成功")

        _found = _manager.list_files("atlas_eval/submissions/", file_extension=".json")
        print(f"📁 找到 {len(_found)} 个提交文件")

        _preview = _found[:3]
        for _info in _preview:
            print(f" - {_info['name']} ({_info['size']} bytes)")

    except Exception as exc:
        print(f"❌ 测试失败: {exc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/oss/oss_leaderboard_manager.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OSS排行榜管理器 - 从OSS读取和更新排行榜数据
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import tempfile
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Optional
12
+ from .oss_file_manager import OSSFileManager
13
+
14
+
15
class OSSLeaderboardManager:
    """Read and update the leaderboard JSON file stored in OSS."""

    def __init__(self):
        """Create the underlying OSS file manager and fix the leaderboard/backup keys."""
        self.oss_manager = OSSFileManager()

        # OSS layout
        self.leaderboard_path = "atlas_eval/leaderboard/"
        self.backup_path = "atlas_eval/leaderboard/backup/"
        self.leaderboard_file = "initial_sage_results.json"

        # Full key of the live leaderboard file
        self.oss_leaderboard_file = f"{self.leaderboard_path}{self.leaderboard_file}"

        # Report the bucket actually in use instead of a hard-coded name.
        bucket_name = getattr(self.oss_manager, "bucket_name", "opencompass")
        print(f"📊 OSS排行榜路径: oss://{bucket_name}/{self.oss_leaderboard_file}")
        print(f"📦 OSS备份路径: oss://{bucket_name}/{self.backup_path}")

    @staticmethod
    def _overall_score(entry: Dict[str, Any]) -> float:
        """Return an entry's ``results.sage_overall``, treating missing/None as 0."""
        return (entry.get("results") or {}).get("sage_overall") or 0

    def _read_leaderboard(self) -> Optional[List[Dict[str, Any]]]:
        """
        Read the leaderboard, distinguishing "absent" from "unreadable".

        Returns:
            The parsed list; ``[]`` when the file is absent or empty; ``None``
            when the file exists but could not be downloaded or parsed, so that
            writers can refuse to overwrite it.
        """
        try:
            print(f"📥 从OSS加载排行榜数据: {self.oss_leaderboard_file}")

            content = self.oss_manager.download_file_content(self.oss_leaderboard_file)

            if content:
                leaderboard_data = json.loads(content.decode('utf-8'))
                if not isinstance(leaderboard_data, list):
                    raise ValueError(
                        f"expected a JSON array, got {type(leaderboard_data).__name__}"
                    )
                print(f"✅ 成功加载 {len(leaderboard_data)} 条排行榜记录")
                return leaderboard_data

            # download_file_content returns None for any failure; only treat it
            # as "no leaderboard yet" when the object really is absent.
            if content is None and self.oss_manager.file_exists(self.oss_leaderboard_file):
                print("❌ 排行榜文件存在但下载失败")
                return None

            print("⚠️ OSS中未找到排行榜文件,返回空列表")
            return []

        except Exception as e:
            print(f"❌ 从OSS加载排行榜失败: {e}")
            return None

    def load_leaderboard_from_oss(self) -> List[Dict[str, Any]]:
        """
        Load the leaderboard from OSS.

        Returns:
            The list of leaderboard entries; an empty list when the file is
            missing or could not be read or parsed.
        """
        leaderboard_data = self._read_leaderboard()
        return leaderboard_data if leaderboard_data is not None else []

    def save_leaderboard_to_oss(self, leaderboard_data: List[Dict[str, Any]],
                                create_backup: bool = True) -> bool:
        """
        Write the leaderboard to OSS.

        Args:
            leaderboard_data: Entries to store.
            create_backup: Copy the current file to the backup prefix first.

        Returns:
            True when the upload succeeded, False otherwise.
        """
        temp_file_path = None
        try:
            print(f"📤 保存排行榜数据到OSS: {self.oss_leaderboard_file}")

            # Best effort: a failed backup is reported by _create_backup itself
            # and does not block the save.
            if create_backup:
                self._create_backup()

            # Explicit UTF-8: ensure_ascii=False emits non-ASCII text, and the
            # reader decodes the file as UTF-8.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False,
                                             encoding='utf-8') as temp_file:
                # Record the path first so cleanup still runs if dump() raises.
                temp_file_path = temp_file.name
                json.dump(leaderboard_data, temp_file, indent=2, ensure_ascii=False)

            success = self.oss_manager.upload_file(
                local_file_path=temp_file_path,
                oss_file_path=self.oss_leaderboard_file
            )

            if success:
                print(f"✅ 成功保存 {len(leaderboard_data)} 条排行榜记录到OSS")
                return True

            print("❌ 上传排行榜文件到OSS失败")
            return False

        except Exception as e:
            print(f"❌ 保存排行榜到OSS失败: {e}")
            return False

        finally:
            if temp_file_path is not None:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    # Leftover temp file is harmless; nothing more to do.
                    pass

    def _create_backup(self) -> bool:
        """
        Copy the current leaderboard file to a timestamped key under the backup prefix.

        Returns:
            True when the copy succeeded or there was nothing to back up,
            False otherwise.
        """
        try:
            if not self.oss_manager.file_exists(self.oss_leaderboard_file):
                print("📋 原排行榜文件不存在,跳过备份")
                return True

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f"{self.leaderboard_file}.backup_{timestamp}"
            backup_path = f"{self.backup_path}{backup_filename}"

            success = self.oss_manager.copy_file(
                source_path=self.oss_leaderboard_file,
                target_path=backup_path
            )

            if success:
                print(f"📦 创建备份成功: {backup_path}")
                return True

            print(f"❌ 创建备份失败: {backup_path}")
            return False

        except Exception as e:
            print(f"❌ 创建备份时出错: {e}")
            return False

    def add_evaluation_result(self, result_data: Dict[str, Any]) -> bool:
        """
        Insert or replace one evaluation result and save the leaderboard.

        An entry with the same ``organization`` and ``submitted_time`` is
        replaced; otherwise the result is appended. Entries are then sorted by
        ``results.sage_overall`` in descending order.

        Args:
            result_data: The evaluation result entry.

        Returns:
            True when the updated leaderboard was saved; False when the
            existing leaderboard could not be read (nothing is written in that
            case) or saving failed.
        """
        try:
            leaderboard_data = self._read_leaderboard()
            if leaderboard_data is None:
                # Writing now would replace every existing entry with just this one.
                print("❌ 无法读取现有排行榜,已取消更新以避免覆盖数据")
                return False

            existing_entry = next(
                (
                    i for i, entry in enumerate(leaderboard_data)
                    if entry.get("organization") == result_data.get("organization")
                    and entry.get("submitted_time") == result_data.get("submitted_time")
                ),
                None,
            )

            if existing_entry is not None:
                print(f"🔄 更新现有排行榜条目: {result_data.get('organization')}")
                leaderboard_data[existing_entry] = result_data
            else:
                print(f"➕ 添加新的排行榜条目: {result_data.get('organization')}")
                leaderboard_data.append(result_data)

            leaderboard_data.sort(key=self._overall_score, reverse=True)

            return self.save_leaderboard_to_oss(leaderboard_data)

        except Exception as e:
            print(f"❌ 添加评测结果失败: {e}")
            return False

    def get_leaderboard_summary(self) -> Dict[str, Any]:
        """
        Summarise the leaderboard.

        Returns:
            Dict with ``total_entries``, ``last_updated`` (largest
            ``evaluation_timestamp`` value), ``top_scores`` (``results`` of the
            entry with the highest ``sage_overall``) and ``oss_path``; or
            ``{"error": ...}`` when building the summary raised.
        """
        try:
            leaderboard_data = self.load_leaderboard_from_oss()

            if not leaderboard_data:
                return {
                    "total_entries": 0,
                    "last_updated": None,
                    "top_scores": {},
                    "oss_path": self.oss_leaderboard_file
                }

            latest_time = None
            for entry in leaderboard_data:
                eval_time = entry.get("evaluation_timestamp")
                if eval_time and (latest_time is None or eval_time > latest_time):
                    latest_time = eval_time

            # Do not rely on the stored order: migrated files may be unsorted.
            top_entry = max(leaderboard_data, key=self._overall_score)

            return {
                "total_entries": len(leaderboard_data),
                "last_updated": latest_time,
                "top_scores": top_entry.get("results", {}),
                "oss_path": self.oss_leaderboard_file
            }

        except Exception as e:
            print(f"❌ 获取排行榜摘要失败: {e}")
            return {"error": str(e)}

    def migrate_local_to_oss(self, local_file_path: str) -> bool:
        """
        Upload a local leaderboard JSON file to OSS without creating a backup.

        Args:
            local_file_path: Path of the local JSON file.

        Returns:
            True when the data was saved to OSS, False otherwise.
        """
        try:
            if not os.path.exists(local_file_path):
                print(f"❌ 本地文件不存在: {local_file_path}")
                return False

            with open(local_file_path, 'r', encoding='utf-8') as f:
                leaderboard_data = json.load(f)

            print(f"📤 迁移 {len(leaderboard_data)} 条记录到OSS")

            return self.save_leaderboard_to_oss(leaderboard_data, create_backup=False)

        except Exception as e:
            print(f"❌ 迁移文件到OSS失败: {e}")
            return False
251
+
252
+
253
if __name__ == "__main__":
    # Manual smoke test: print the summary, then the number of entries.
    _lb_manager = OSSLeaderboardManager()

    print(f"📊 排行榜摘要: {_lb_manager.get_leaderboard_summary()}")

    _entries = _lb_manager.load_leaderboard_from_oss()
    print(f"📋 排行榜条目数: {len(_entries)}")
src/populate.py CHANGED
@@ -74,7 +74,42 @@ try:
74
  return data_dict
75
 
76
  def load_initial_sage_results_local() -> List[SAGEResult]:
77
- """Load initial SAGE results without external dependencies"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  possible_paths = [
79
  "./initial_sage_results.json",
80
  "initial_sage_results.json",
@@ -87,18 +122,18 @@ try:
87
  initial_results_path = path
88
  break
89
 
90
- sage_results = []
91
-
92
  if initial_results_path:
93
  try:
94
  with open(initial_results_path, 'r') as f:
95
  initial_data = json.load(f)
96
 
 
 
97
  for i, entry in enumerate(initial_data):
98
  sage_result = SAGEResult(
99
- submission_id=f"initial_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
100
- organization=f"{entry['organization']} ({entry['tokens']})",
101
- email=f"contact@{entry['organization'].lower().replace(' ', '')}.com",
102
  results=entry["results"],
103
  num_predictions=1000,
104
  submitted_time=entry["submitted_time"],
@@ -107,9 +142,9 @@ try:
107
  sage_results.append(sage_result)
108
 
109
  except Exception as e:
110
- print(f"Error loading initial SAGE results from {initial_results_path}: {e}")
111
  else:
112
- print(f"Initial SAGE results file not found. Tried paths: {possible_paths}")
113
 
114
  return sage_results
115
 
 
74
  return data_dict
75
 
76
  def load_initial_sage_results_local() -> List[SAGEResult]:
77
+ """Load initial SAGE results from OSS or local files"""
78
+ sage_results = []
79
+
80
+ # 尝试从OSS加载
81
+ try:
82
+ # 导入OSS排行榜管理器(现在在本地oss目录中)
83
+ from src.oss.oss_leaderboard_manager import OSSLeaderboardManager
84
+
85
+ # 从OSS加载排行榜数据
86
+ leaderboard_manager = OSSLeaderboardManager()
87
+ initial_data = leaderboard_manager.load_leaderboard_from_oss()
88
+
89
+ if initial_data:
90
+ print(f"✅ 从OSS加载了 {len(initial_data)} 条排行榜记录")
91
+
92
+ for i, entry in enumerate(initial_data):
93
+ sage_result = SAGEResult(
94
+ submission_id=f"oss_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
95
+ organization=f"{entry['organization']} ({entry.get('tokens', 'N/A')})",
96
+ email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
97
+ results=entry["results"],
98
+ num_predictions=1000,
99
+ submitted_time=entry["submitted_time"],
100
+ status="EVALUATED"
101
+ )
102
+ sage_results.append(sage_result)
103
+
104
+ return sage_results
105
+ else:
106
+ print("⚠️ OSS中未找到排行榜数据,尝试本地文件")
107
+
108
+ except Exception as e:
109
+ print(f"⚠️ 从OSS加载排行榜失败: {e}")
110
+ print("🔄 回退到本地文件模式")
111
+
112
+ # 回退到本地文件模式
113
  possible_paths = [
114
  "./initial_sage_results.json",
115
  "initial_sage_results.json",
 
122
  initial_results_path = path
123
  break
124
 
 
 
125
  if initial_results_path:
126
  try:
127
  with open(initial_results_path, 'r') as f:
128
  initial_data = json.load(f)
129
 
130
+ print(f"✅ 从本地文件加载了 {len(initial_data)} 条排行榜记录: {initial_results_path}")
131
+
132
  for i, entry in enumerate(initial_data):
133
  sage_result = SAGEResult(
134
+ submission_id=f"local_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
135
+ organization=f"{entry['organization']} ({entry.get('tokens', 'N/A')})",
136
+ email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
137
  results=entry["results"],
138
  num_predictions=1000,
139
  submitted_time=entry["submitted_time"],
 
142
  sage_results.append(sage_result)
143
 
144
  except Exception as e:
145
+ print(f" 从本地文件加载排行榜失败 {initial_results_path}: {e}")
146
  else:
147
+ print(f" 未找到排行榜文件。尝试过的路径: {possible_paths}")
148
 
149
  return sage_results
150
 
submissions/submission_aaa_20250905_094557.json DELETED
@@ -1,81 +0,0 @@
1
- {
2
- "submission_org": "aaa",
3
- "submission_email": "nlp2ct.shudong@gmail.com",
4
- "predictions": [
5
- {
6
- "original_question_id": 0,
7
- "content": [
8
- "42",
9
- "The answer is 42",
10
- "Forty-two",
11
- "6 × 7"
12
- ],
13
- "reasoning_content": [
14
- "This is a mathematical calculation based on multiplication.",
15
- "The problem asks for the product of 6 and 7, which equals 42.",
16
- "Using basic arithmetic: 6 × 7 = 42",
17
- "The ultimate answer to life, the universe, and everything is 42."
18
- ]
19
- },
20
- {
21
- "original_question_id": 1,
22
- "content": [
23
- "H2O",
24
- "Water",
25
- "Dihydrogen monoxide",
26
- "Two hydrogen atoms and one oxygen atom"
27
- ],
28
- "reasoning_content": [
29
- "Water is composed of two hydrogen atoms and one oxygen atom.",
30
- "The chemical formula for water is H2O.",
31
- "This is the most common compound on Earth's surface.",
32
- "Water exists in three states: solid (ice), liquid (water), and gas (steam)."
33
- ]
34
- },
35
- {
36
- "original_question_id": 2,
37
- "content": [
38
- "DNA",
39
- "Deoxyribonucleic acid",
40
- "Genetic material",
41
- "Double helix structure"
42
- ],
43
- "reasoning_content": [
44
- "DNA carries genetic information in living organisms.",
45
- "It has a double helix structure discovered by Watson and Crick.",
46
- "DNA is composed of four nucleotides: A, T, G, C.",
47
- "It's located primarily in the cell nucleus."
48
- ]
49
- },
50
- {
51
- "original_question_id": 3,
52
- "content": [
53
- "9.8 m/s²",
54
- "9.81 m/s²",
55
- "About 10 m/s²",
56
- "Standard gravity"
57
- ],
58
- "reasoning_content": [
59
- "Earth's gravitational acceleration is approximately 9.8 m/s².",
60
- "This value varies slightly depending on location and altitude.",
61
- "It's the acceleration experienced by objects in free fall.",
62
- "This constant is fundamental in physics calculations."
63
- ]
64
- },
65
- {
66
- "original_question_id": 4,
67
- "content": [
68
- "Photosynthesis",
69
- "6CO2 + 6H2O + light → C6H12O6 + 6O2",
70
- "Converting light to chemical energy",
71
- "Chlorophyll process"
72
- ],
73
- "reasoning_content": [
74
- "Photosynthesis is how plants convert sunlight into chemical energy.",
75
- "The process uses carbon dioxide, water, and sunlight to produce glucose.",
76
- "Chlorophyll in plant leaves captures light energy.",
77
- "Oxygen is released as a byproduct of this process."
78
- ]
79
- }
80
- ]
81
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
submissions/submission_aaa_20250905_094707.json DELETED
@@ -1,81 +0,0 @@
1
- {
2
- "submission_org": "aaa",
3
- "submission_email": "nlp2ct.shudong@gmail.com",
4
- "predictions": [
5
- {
6
- "original_question_id": 0,
7
- "content": [
8
- "42",
9
- "The answer is 42",
10
- "Forty-two",
11
- "6 × 7"
12
- ],
13
- "reasoning_content": [
14
- "This is a mathematical calculation based on multiplication.",
15
- "The problem asks for the product of 6 and 7, which equals 42.",
16
- "Using basic arithmetic: 6 × 7 = 42",
17
- "The ultimate answer to life, the universe, and everything is 42."
18
- ]
19
- },
20
- {
21
- "original_question_id": 1,
22
- "content": [
23
- "H2O",
24
- "Water",
25
- "Dihydrogen monoxide",
26
- "Two hydrogen atoms and one oxygen atom"
27
- ],
28
- "reasoning_content": [
29
- "Water is composed of two hydrogen atoms and one oxygen atom.",
30
- "The chemical formula for water is H2O.",
31
- "This is the most common compound on Earth's surface.",
32
- "Water exists in three states: solid (ice), liquid (water), and gas (steam)."
33
- ]
34
- },
35
- {
36
- "original_question_id": 2,
37
- "content": [
38
- "DNA",
39
- "Deoxyribonucleic acid",
40
- "Genetic material",
41
- "Double helix structure"
42
- ],
43
- "reasoning_content": [
44
- "DNA carries genetic information in living organisms.",
45
- "It has a double helix structure discovered by Watson and Crick.",
46
- "DNA is composed of four nucleotides: A, T, G, C.",
47
- "It's located primarily in the cell nucleus."
48
- ]
49
- },
50
- {
51
- "original_question_id": 3,
52
- "content": [
53
- "9.8 m/s²",
54
- "9.81 m/s²",
55
- "About 10 m/s²",
56
- "Standard gravity"
57
- ],
58
- "reasoning_content": [
59
- "Earth's gravitational acceleration is approximately 9.8 m/s².",
60
- "This value varies slightly depending on location and altitude.",
61
- "It's the acceleration experienced by objects in free fall.",
62
- "This constant is fundamental in physics calculations."
63
- ]
64
- },
65
- {
66
- "original_question_id": 4,
67
- "content": [
68
- "Photosynthesis",
69
- "6CO2 + 6H2O + light → C6H12O6 + 6O2",
70
- "Converting light to chemical energy",
71
- "Chlorophyll process"
72
- ],
73
- "reasoning_content": [
74
- "Photosynthesis is how plants convert sunlight into chemical energy.",
75
- "The process uses carbon dioxide, water, and sunlight to produce glucose.",
76
- "Chlorophyll in plant leaves captures light energy.",
77
- "Oxygen is released as a byproduct of this process."
78
- ]
79
- }
80
- ]
81
- }