lixuejing committed · Commit d48da61 · 1 Parent(s): 7148ead
src/about.py CHANGED
@@ -34,13 +34,13 @@ TITLE = """<h1 align="center" id="space-title">FlagEval-VLM Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 
 INTRODUCTION_TEXT = """
-FlagEval-VLM Leaderboard旨在跟踪、排名和评估VLM。本排行榜由FlagEval平台提供相应算力和运行环境。
-评估数据集是全部都是中文数据集以评估中文能力如需查看详情信息,请查阅‘关于’页面。
+欢迎使用FlagEval-VLM Leaderboard
+FlagEval-VLM Leaderboard 旨在跟踪、排名和评估开放式视觉大语言模型(VLM)。本排行榜由FlagEval平台提供相应算力和运行环境。VLM构建了一种基于数据集的能力体系,依据所接入的开源数据集,我们总结出了数学,视觉、图表、通用、文字以及中文等六个能力维度,由此组成一个评测集合。
 如需对模型进行更全面的评测,可以登录 [FlagEval](https://flageval.baai.ac.cn/api/users/providers/hf)平台,体验更加完善的模型评测功能。
 
-The FlagEval-VLM Leaderboard aims to track, rank, and evaluate VLMs. This leaderboard is powered by the FlagEval platform, providing corresponding computational resources and runtime environment.
-The evaluation dataset consists entirely of Chinese data to assess Chinese language proficiency. For more detailed information, please refer to the 'About' page.
-For a more comprehensive evaluation of the model, you can log in to the [FlagEval](https://flageval.baai.ac.cn/) to experience more refined model evaluation functionalities
+Welcome to the FlagEval-VLM Leaderboard!
+The FlagEval-VLM Leaderboard is designed to track, rank and evaluate open Visual Large Language Models (VLMs). This leaderboard is powered by the FlagEval platform, which provides the appropriate arithmetic and runtime environment.
+VLM builds a dataset-based competency system. Based on the accessed open source datasets, we summarize six competency dimensions, including Mathematical, Visual, Graphical, Generic, Textual, and Chinese, to form a collection of assessments.
 
 """
 # Which evaluations are you running? how can people reproduce what you have?
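The new INTRODUCTION_TEXT names six competency dimensions (Mathematical, Visual, Graphical, Generic, Textual, and Chinese). In leaderboards built on this template, such dimensions are usually declared as a task enum elsewhere in src/about.py; the sketch below only illustrates that shape, with hypothetical benchmark keys and metric names that are not taken from this repository:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key used in the per-model result JSON files
    metric: str     # metric reported for that benchmark
    col_name: str   # column label shown on the leaderboard


# Hypothetical mapping of the six competency dimensions to benchmark keys;
# the real definitions in src/about.py may differ.
class Tasks(Enum):
    math = Task("math_vlm", "acc", "Mathematical")
    visual = Task("visual_vlm", "acc", "Visual")
    graphical = Task("chart_vlm", "acc", "Graphical")
    generic = Task("general_vlm", "acc", "Generic")
    textual = Task("ocr_vlm", "acc", "Textual")
    chinese = Task("chinese_vlm", "acc", "Chinese")


if __name__ == "__main__":
    # The displayed benchmark columns would be derived from col_name.
    print([task.value.col_name for task in Tasks])
```

Collections such as BENCHMARK_COLS, which the plotting code in src/tools/plots.py iterates over, are typically built from these col_name values.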
src/leaderboard/read_evals.py CHANGED
@@ -161,7 +161,6 @@ class EvalResult:
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    print("get_request_file_for_model", requests_path, model_name)
     request_files = os.path.join(
         requests_path,
         f"{model_name}_eval_request_*.json",
@@ -207,13 +206,10 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        print("model_result_filepath",model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
-        print("eval_result", eval_result)
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
-        print("eval_result update dynamic", eval_result)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
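Both read_evals.py hunks simply drop ad-hoc print() tracing from get_request_file_for_model and get_raw_eval_results. If that trace output is ever needed again, one option (a sketch under assumed names, not part of this commit) is to route it through Python's standard logging module so it stays silent in the running Space but can be switched on locally:

```python
import logging

# Illustrative logger; neither the name nor this pattern is part of the commit.
logger = logging.getLogger("leaderboard.read_evals")


def load_result(model_result_filepath: str) -> None:
    # Stand-in for the loop body in get_raw_eval_results: the removed
    # print() calls become debug-level records that are silent by default.
    logger.debug("model_result_filepath %s", model_result_filepath)


if __name__ == "__main__":
    # Turn the trace on only while debugging locally.
    logging.basicConfig(level=logging.DEBUG)
    load_result("eval-results/org__model/results.json")
```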
src/tools/plots.py CHANGED
@@ -21,7 +21,6 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     results_df = pd.DataFrame(raw_data)
     results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
-    print("results_df", results_df)
 
     # Step 2: Initialize the scores dictionary
     scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
@@ -39,14 +38,12 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
             current_date = row["date"]
             if task.benchmark == "Average":
                 current_score = np.mean(list(row["results"].values()))
-                print("average", current_score)
             else:
                 if row["results"]:
                     current_score = row["results"][task.benchmark]
                 else:
                     current_score = 0
 
-            print(current_score,current_max, task.benchmark)
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0:
                     scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
@@ -68,14 +65,11 @@ def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
    # Initialize the list to store DataFrames
     dfs = []
-    print("score_df", scores_df)
     # Iterate over the cols and create a new DataFrame for each column
     for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
-        print("col",col)
         d = scores_df[col].reset_index(drop=True)
         d["task"] = col
         dfs.append(d)
-    print("dfs", dfs)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
@@ -103,7 +97,6 @@ def create_metric_plot_obj(
 
     # Filter the DataFrame based on the specified metrics
     df = df[df["task"].isin(metrics)]
-    print("df", df)
     # Filter the human baselines based on the specified metrics
     filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
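For orientation, the code surrounding the removed print() calls in create_scores_df keeps, per benchmark column, the running best score over time and records a new point only when a model beats it. A standalone sketch of that idea on made-up data (the frame layout here is simplified and is not the leaderboard's actual schema):

```python
import pandas as pd

# Toy data: one (model, date, score) row per submission for a single benchmark.
df = pd.DataFrame(
    {
        "model": ["model-a", "model-b", "model-c"],
        "date": pd.to_datetime(["2024-01-01", "2024-02-01", "2024-03-01"], utc=True),
        "score": [41.0, 39.5, 47.2],
    }
).sort_values(by="date")

# Keep only the rows that beat every earlier score, mirroring the
# `if current_score > current_max` branch in create_scores_df.
prev_best = df["score"].cummax().shift(fill_value=float("-inf"))
frontier = df[df["score"] > prev_best]
print(frontier)  # model-a and model-c survive; model-b never led
```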