Spaces:
Running
Running
import copy as cp | |
import json | |
from collections import defaultdict | |
from urllib.request import urlopen | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
from decimal import Decimal, ROUND_HALF_UP | |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS | |
def listinstr(lst, s):
    """Return True if any element of *lst* occurs as a substring of *s*."""
    assert isinstance(lst, list)
    return any(item in s for item in lst)
def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    """Parse and return the JSON leaderboard data stored at *file_name*.

    Fix: the original called ``open(...).read()`` without closing the
    handle, leaving the close to the garbage collector.  A ``with``
    block closes it deterministically; JSON files are read as UTF-8.
    """
    with open(file_name, "r", encoding="utf-8") as fh:
        return json.load(fh)
def format_timestamp(timestamp):
    """Reformat an ISO-like stamp 'YYYY-MM-DDTHH:MM:SS...' as
    'YYYY-MM-DD HH:MM:SS' by slicing fixed character positions."""
    date_part = timestamp[:10]
    hours = timestamp[11:13]
    minutes = timestamp[14:16]
    seconds = timestamp[17:19]
    return f"{date_part} {hours}:{minutes}:{seconds}"
def nth_large(val, vals):
    """1-based rank of *val* in *vals*: one plus the count of entries
    strictly greater than it (ties share the same rank)."""
    rank = 1
    for candidate in vals:
        if candidate > val:
            rank += 1
    return rank
def BUILD_L1_DF(results, fields):
    """Build the checkbox/column configuration for the L1 (overview) table.

    Parameters
    ----------
    results : dict
        Entry name -> result record; a record may hold a per-dataset
        sub-dict for each name in *fields*.
    fields : list[str]
        Candidate dataset names.

    Returns
    -------
    dict
        Keys 'essential', 'required', 'all' (column-name lists) and
        'type_map' (column name -> gradio column type, defaulting to
        'number').
    """
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
    # Probe one record to learn which requested datasets actually exist.
    # Fix: `next(..., {})` keeps an empty results dict from raising
    # StopIteration (the original crashed on empty input).
    sample_data = next(iter(results.values()), {})
    available_fields = [field for field in fields if field in sample_data]
    # Column names must match exactly those produced by generate_table.
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]
    # Group each dataset's Score/Cost columns together; sorted() is stable,
    # so Score keeps preceding Cost within a dataset.
    combined_columns_sorted = sorted(score_columns + cost_columns, key=lambda x: x.split('-')[0])
    check_box['required'] = ['Avg Score'] + combined_columns_sorted
    check_box['all'] = ['Avg Score'] + combined_columns_sorted
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Avg Score'] = 'number'
    type_map['gsm8k-Score'] = 'number'
    type_map['AQuA-Score'] = 'number'
    type_map['gsm8k-Cost($)'] = 'number'
    type_map['AQuA-Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return check_box
def BUILD_L2_DF(results, fields):
    """Build the detailed per-dataset leaderboard DataFrame and its
    checkbox/column configuration.

    Parameters
    ----------
    results : dict
        algo_name -> {model_name -> {'META': {...}, dataset: {...}}}.
    fields : list[str]
        Dataset names to include; datasets missing from a model record
        are skipped.

    Returns
    -------
    (pandas.DataFrame, dict)
        One row per (algorithm, model, dataset) triple, ranked per
        dataset by descending 'Score', plus the checkbox config.
    """
    res = defaultdict(list)
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            meta = model_data['META']
            # One row per evaluated dataset of this model.
            for dataset in fields:
                if dataset not in model_data:
                    continue
                for k, v in meta.items():
                    res[k].append(v)
                res['Dataset'].append(dataset)
                for field, value in model_data[dataset].items():
                    res[field].append(value)
    df = pd.DataFrame(res)
    # Rank rows within each dataset by descending score.
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Put the key columns first.  Fix: keep only columns actually present
    # so sparse records cannot raise a KeyError on reindexing (this
    # mirrors the filtering already done in generate_table).
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    leading = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[leading + remaining_columns]
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box
def generate_table(results, fields):
    """Build the L1 (overview) DataFrame: one row per entry with
    per-dataset '-Score' / '-Cost($)' columns, an 'Avg Score', and a
    dense 'Rank'.

    Parameters
    ----------
    results : dict
        Entry name -> {'META': {...}, dataset: {'Score': .., 'Cost($)': ..}}.
    fields : list[str]
        Dataset names to expose as column pairs.
    """
    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])
        scores, costs = [], []
        # Column name format must match BUILD_L1_DF exactly.
        for d in fields:
            if d in item:
                score = item[d].get("Score")
                cost = item[d].get("Cost($)")
                res[f"{d}-Score"].append(score)
                res[f"{d}-Cost($)"].append(cost)
                if score is not None:
                    scores.append(score)
                if cost is not None:
                    costs.append(cost)
            else:
                res[f"{d}-Score"].append(None)
                res[f"{d}-Cost($)"].append(None)
        # Average with Decimal for exact half-up rounding to 2 places.
        # Fix: the original quantized unconditionally and crashed with an
        # AttributeError ('NoneType' has no 'quantize') for an entry with
        # no scores; it also computed an np.mean that was immediately
        # overwritten (dead code, removed).
        if scores:
            decimal_numbers = [Decimal(str(num)) for num in scores]
            avg_score = sum(decimal_numbers) / len(decimal_numbers)
            formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
        else:
            formatted_average = None
        res['Avg Score'].append(formatted_average)
    df = pd.DataFrame(res)
    # Rows with a score are ranked 1..n by descending average; rows
    # without one come afterwards, all sharing rank n+1.
    valid = df[~pd.isna(df['Avg Score'])].copy()
    missing = df[pd.isna(df['Avg Score'])].copy()
    valid = valid.sort_values('Avg Score', ascending=False)
    valid['Rank'] = range(1, len(valid) + 1)
    if not missing.empty:
        missing['Rank'] = len(valid) + 1
        df = pd.concat([valid, missing])
    else:
        df = valid
    df = df.sort_values('Rank')
    # Rearrange column order, keeping only columns that actually exist.
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    existing_columns = [col for col in columns if col in df.columns]
    df = df[existing_columns]
    return df
def generate_table_detail(results, fields):
    """Build the detailed per-dataset DataFrame: one row per
    (algorithm, model, dataset) triple, ranked within each dataset by
    descending 'Score'.  Same row layout as BUILD_L2_DF but with
    'Dataset' placed before 'Algorithm' and no checkbox configuration.

    Parameters
    ----------
    results : dict
        algo_name -> {model_name -> {'META': {...}, dataset: {...}}}.
    fields : list[str]
        Dataset names to include; missing datasets are skipped.
    """
    res = defaultdict(list)
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            meta = model_data['META']
            # One row per evaluated dataset of this model.
            for dataset in fields:
                if dataset not in model_data:
                    continue
                for k, v in meta.items():
                    res[k].append(v)
                res['Dataset'].append(dataset)
                for field, value in model_data[dataset].items():
                    res[field].append(value)
    df = pd.DataFrame(res)
    # Rank rows within each dataset by descending score.
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Put the key columns first.  Fix: keep only columns actually present
    # so sparse records cannot raise a KeyError on reindexing (this
    # mirrors the filtering already done in generate_table).
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    leading = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[leading + remaining_columns]
    return df