lixuejing committed · Commit d48da61 · Parent(s): 7148ead
update

Browse files:
- src/about.py +5 -5
- src/leaderboard/read_evals.py +0 -4
- src/tools/plots.py +0 -7
src/about.py CHANGED
@@ -34,13 +34,13 @@ TITLE = """<h1 align="center" id="space-title">FlagEval-VLM Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 
 INTRODUCTION_TEXT = """
-FlagEval-VLM Leaderboard
-
+欢迎使用FlagEval-VLM Leaderboard
+FlagEval-VLM Leaderboard 旨在跟踪、排名和评估开放式视觉大语言模型(VLM)。本排行榜由FlagEval平台提供相应算力和运行环境。VLM构建了一种基于数据集的能力体系,依据所接入的开源数据集,我们总结出了数学,视觉、图表、通用、文字以及中文等六个能力维度,由此组成一个评测集合。
 如需对模型进行更全面的评测,可以登录 [FlagEval](https://flageval.baai.ac.cn/api/users/providers/hf)平台,体验更加完善的模型评测功能。
 
-
-The
-
+Welcome to the FlagEval-VLM Leaderboard!
+The FlagEval-VLM Leaderboard is designed to track, rank and evaluate open Visual Large Language Models (VLMs). This leaderboard is powered by the FlagEval platform, which provides the appropriate arithmetic and runtime environment.
+VLM builds a dataset-based competency system. Based on the accessed open source datasets, we summarize six competency dimensions, including Mathematical, Visual, Graphical, Generic, Textual, and Chinese, to form a collection of assessments.
 
 """
 # Which evaluations are you running? how can people reproduce what you have?
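For orientation only: these constants are plain strings that the Space's UI renders. A minimal, hypothetical sketch of how such constants are typically wired into a Gradio app (the Blocks layout below is an assumption for illustration, not this repository's actual app.py):

```python
import gradio as gr

# Stand-ins mirroring the constants in src/about.py (intro shortened for the sketch).
TITLE = """<h1 align="center" id="space-title">FlagEval-VLM Leaderboard</h1>"""
INTRODUCTION_TEXT = "欢迎使用FlagEval-VLM Leaderboard / Welcome to the FlagEval-VLM Leaderboard!"

with gr.Blocks() as demo:
    gr.HTML(TITLE)                   # the title is raw HTML
    gr.Markdown(INTRODUCTION_TEXT)   # the introduction is Markdown

if __name__ == "__main__":
    demo.launch()
```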
src/leaderboard/read_evals.py CHANGED
@@ -161,7 +161,6 @@ class EvalResult:
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    print("get_request_file_for_model", requests_path, model_name)
     request_files = os.path.join(
         requests_path,
         f"{model_name}_eval_request_*.json",
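The hunk ends mid-expression. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of how a selector like this typically finishes: glob the pattern and keep the newest request whose recorded status is FINISHED. The JSON field names (`status`, `precision`) and the early return are assumptions for illustration, not necessarily this file's exact logic:

```python
import glob
import json
import os


def pick_request_file(requests_path: str, model_name: str, precision: str) -> str:
    """Sketch: return the newest FINISHED request file for model_name at the given precision."""
    pattern = os.path.join(requests_path, f"{model_name}_eval_request_*.json")
    for candidate in sorted(glob.glob(pattern), reverse=True):  # newest-looking names first
        with open(candidate, "r") as f:
            request = json.load(f)
        if request.get("status") == "FINISHED" and request.get("precision") == precision:
            return candidate
    return ""  # no finished run found
```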
@@ -207,13 +206,10 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        print("model_result_filepath",model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
-        print("eval_result", eval_result)
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
-        print("eval_result update dynamic", eval_result)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
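The loop above deduplicates by eval_name ("Store results of same eval together"); the merge step itself falls outside the hunk. A self-contained toy illustration of the usual pattern, with all names invented and the merge rule assumed (update an existing entry with any non-None scores, otherwise insert):

```python
from dataclasses import dataclass, field


@dataclass
class ToyResult:
    """Stand-in for EvalResult: an eval name plus per-benchmark scores (hypothetical)."""
    eval_name: str
    results: dict = field(default_factory=dict)


def store(eval_results: dict, incoming: ToyResult) -> None:
    """Keep results of the same eval together, ignoring missing (None) scores."""
    if incoming.eval_name in eval_results:
        eval_results[incoming.eval_name].results.update(
            {k: v for k, v in incoming.results.items() if v is not None}
        )
    else:
        eval_results[incoming.eval_name] = incoming


eval_results: dict = {}
store(eval_results, ToyResult("model-a_bfloat16", {"Math": 41.0, "Chart": None}))
store(eval_results, ToyResult("model-a_bfloat16", {"Chart": 63.5}))
print(eval_results["model-a_bfloat16"].results)  # {'Math': 41.0, 'Chart': 63.5}
```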
src/tools/plots.py CHANGED
@@ -21,7 +21,6 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     results_df = pd.DataFrame(raw_data)
     results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
-    print("results_df", results_df)
 
     # Step 2: Initialize the scores dictionary
     scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
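As context for the surviving lines: `format="mixed"` asks pandas (2.0 or newer) to infer the format of each date string independently before the frame is sorted chronologically. A tiny, self-contained illustration with made-up dates:

```python
import pandas as pd

# Toy frame with heterogeneous timestamp strings, as raw result files may contain.
df = pd.DataFrame({
    "model": ["m1", "m2"],
    "date": ["2024-05-01T10:00:00Z", "2024-04-30 08:00:00+02:00"],
})
df["date"] = pd.to_datetime(df["date"], format="mixed", utc=True)  # per-element parsing; pandas >= 2.0
df = df.sort_values(by="date")
print(df)  # m2 (April 30, converted to UTC) now comes first
```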
@@ -39,14 +38,12 @@
             current_date = row["date"]
             if task.benchmark == "Average":
                 current_score = np.mean(list(row["results"].values()))
-                print("average", current_score)
             else:
                 if row["results"]:
                     current_score = row["results"][task.benchmark]
                 else:
                     current_score = 0
 
-            print(current_score,current_max, task.benchmark)
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0:
                     scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
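The branch above keeps, for each benchmark column, only the points where a new best score appears, and collapses two improvements on the same date into a single entry. A self-contained toy rerun of that bookkeeping (all model names invented):

```python
rows = [
    {"model": "m1", "date": "2024-04-01", "score": 55.0},
    {"model": "m2", "date": "2024-04-01", "score": 61.0},  # better, same date: replaces m1
    {"model": "m3", "date": "2024-04-09", "score": 58.0},  # below the running max: dropped
    {"model": "m4", "date": "2024-04-20", "score": 70.0},  # new best on a later date: appended
]

best: list[dict] = []
current_max, last_date = 0.0, None
for row in rows:
    if row["score"] > current_max:
        if row["date"] == last_date and best:
            best[-1] = row        # same day: keep only the best entry for that date
        else:
            best.append(row)      # new day: add a new point to the curve
        current_max, last_date = row["score"], row["date"]

print([(r["model"], r["score"]) for r in best])  # [('m2', 61.0), ('m4', 70.0)]
```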
@@ -68,14 +65,11 @@ def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     # Initialize the list to store DataFrames
     dfs = []
-    print("score_df", scores_df)
     # Iterate over the cols and create a new DataFrame for each column
     for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
-        print("col",col)
         d = scores_df[col].reset_index(drop=True)
         d["task"] = col
         dfs.append(d)
-    print("dfs", dfs)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
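The loop tags each per-benchmark frame with a "task" column and concatenates them into one long frame. A self-contained sketch of that reshaping with hypothetical toy data (the column list stands in for BENCHMARK_COLS + [AutoEvalColumn.average.name]):

```python
import pandas as pd

# Hypothetical per-benchmark frames, shaped like the ones create_scores_df builds (toy data).
scores_df = {
    "Math": pd.DataFrame({"model": ["m2", "m4"], "score": [61.0, 70.0]}),
    "Average": pd.DataFrame({"model": ["m2"], "score": [64.2]}),
}

dfs = []
for col in ["Math", "Average"]:
    d = scores_df[col].reset_index(drop=True)
    d["task"] = col                  # tag every row with the benchmark it belongs to
    dfs.append(d)

concat_df = pd.concat(dfs, ignore_index=True)  # one long frame, ready to facet by "task"
print(concat_df)
```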
@@ -103,7 +97,6 @@ def create_metric_plot_obj(
 
     # Filter the DataFrame based on the specified metrics
    df = df[df["task"].isin(metrics)]
-    print("df", df)
     # Filter the human baselines based on the specified metrics
     filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 