lixuejing committed · Commit d48da61 · 1 Parent(s): 7148ead
src/about.py CHANGED
@@ -34,13 +34,13 @@ TITLE = """<h1 align="center" id="space-title">FlagEval-VLM Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 
 INTRODUCTION_TEXT = """
-FlagEval-VLM Leaderboard旨在跟踪、排名和评估VLM。本排行榜由FlagEval平台提供相应算力和运行环境。
-评估数据集是全部都是中文数据集以评估中文能力如需查看详情信息,请查阅‘关于’页面。
+欢迎使用FlagEval-VLM Leaderboard
+FlagEval-VLM Leaderboard 旨在跟踪、排名和评估开放式视觉大语言模型(VLM)。本排行榜由FlagEval平台提供相应算力和运行环境。VLM构建了一种基于数据集的能力体系,依据所接入的开源数据集,我们总结出了数学,视觉、图表、通用、文字以及中文等六个能力维度,由此组成一个评测集合。
 如需对模型进行更全面的评测,可以登录 [FlagEval](https://flageval.baai.ac.cn/api/users/providers/hf)平台,体验更加完善的模型评测功能。
 
-The FlagEval-VLM Leaderboard aims to track, rank, and evaluate VLMs. This leaderboard is powered by the FlagEval platform, providing corresponding computational resources and runtime environment.
-The evaluation dataset consists entirely of Chinese data to assess Chinese language proficiency. For more detailed information, please refer to the 'About' page.
-For a more comprehensive evaluation of the model, you can log in to the [FlagEval](https://flageval.baai.ac.cn/) to experience more refined model evaluation functionalities
+Welcome to the FlagEval-VLM Leaderboard!
+The FlagEval-VLM Leaderboard is designed to track, rank and evaluate open Visual Large Language Models (VLMs). This leaderboard is powered by the FlagEval platform, which provides the appropriate arithmetic and runtime environment.
+VLM builds a dataset-based competency system. Based on the accessed open source datasets, we summarize six competency dimensions, including Mathematical, Visual, Graphical, Generic, Textual, and Chinese, to form a collection of assessments.
 
 """
 # Which evaluations are you running? how can people reproduce what you have?
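The new INTRODUCTION_TEXT names six competency dimensions (Mathematical, Visual, Graphical, Generic, Textual, and Chinese). In leaderboards built on this template, such dimensions are usually declared as a task enum elsewhere in src/about.py; the sketch below only illustrates that shape, with hypothetical benchmark keys and metric names that are not taken from this repository:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key used in the per-model result JSON files
    metric: str     # metric reported for that benchmark
    col_name: str   # column label shown on the leaderboard


# Hypothetical mapping of the six competency dimensions to benchmark keys;
# the real definitions in src/about.py may differ.
class Tasks(Enum):
    math = Task("math_vlm", "acc", "Mathematical")
    visual = Task("visual_vlm", "acc", "Visual")
    graphical = Task("chart_vlm", "acc", "Graphical")
    generic = Task("general_vlm", "acc", "Generic")
    textual = Task("ocr_vlm", "acc", "Textual")
    chinese = Task("chinese_vlm", "acc", "Chinese")


if __name__ == "__main__":
    # The displayed benchmark columns would be derived from col_name.
    print([task.value.col_name for task in Tasks])
```

Collections such as BENCHMARK_COLS, which the plotting code in src/tools/plots.py iterates over, are typically built from these col_name values.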
src/leaderboard/read_evals.py CHANGED
@@ -161,7 +161,6 @@ class EvalResult:
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    print("get_request_file_for_model", requests_path, model_name)
     request_files = os.path.join(
         requests_path,
         f"{model_name}_eval_request_*.json",
@@ -207,13 +206,10 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        print("model_result_filepath",model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
-        print("eval_result", eval_result)
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
-        print("eval_result update dynamic", eval_result)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
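Both read_evals.py hunks simply drop ad-hoc print() tracing from get_request_file_for_model and get_raw_eval_results. If that trace output is ever needed again, one option (a sketch under assumed names, not part of this commit) is to route it through Python's standard logging module so it stays silent in the running Space but can be switched on locally:

```python
import logging

# Illustrative logger; neither the name nor this pattern is part of the commit.
logger = logging.getLogger("leaderboard.read_evals")


def load_result(model_result_filepath: str) -> None:
    # Stand-in for the loop body in get_raw_eval_results: the removed
    # print() calls become debug-level records that are silent by default.
    logger.debug("model_result_filepath %s", model_result_filepath)


if __name__ == "__main__":
    # Turn the trace on only while debugging locally.
    logging.basicConfig(level=logging.DEBUG)
    load_result("eval-results/org__model/results.json")
```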
src/tools/plots.py CHANGED
@@ -21,7 +21,6 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     results_df = pd.DataFrame(raw_data)
     results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
-    print("results_df", results_df)
 
     # Step 2: Initialize the scores dictionary
     scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
@@ -39,14 +38,12 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
             current_date = row["date"]
             if task.benchmark == "Average":
                 current_score = np.mean(list(row["results"].values()))
-                print("average", current_score)
             else:
                 if row["results"]:
                     current_score = row["results"][task.benchmark]
                 else:
                     current_score = 0
 
-            print(current_score,current_max, task.benchmark)
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0:
                     scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
@@ -68,14 +65,11 @@ def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
    # Initialize the list to store DataFrames
     dfs = []
-    print("score_df", scores_df)
     # Iterate over the cols and create a new DataFrame for each column
     for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
-        print("col",col)
         d = scores_df[col].reset_index(drop=True)
         d["task"] = col
         dfs.append(d)
-    print("dfs", dfs)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
@@ -103,7 +97,6 @@ def create_metric_plot_obj(
 
     # Filter the DataFrame based on the specified metrics
     df = df[df["task"].isin(metrics)]
-    print("df", df)
     # Filter the human baselines based on the specified metrics
     filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
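For orientation, the code surrounding the removed print() calls in create_scores_df keeps, per benchmark column, the running best score over time and records a new point only when a model beats it. A standalone sketch of that idea on made-up data (the frame layout here is simplified and is not the leaderboard's actual schema):

```python
import pandas as pd

# Toy data: one (model, date, score) row per submission for a single benchmark.
df = pd.DataFrame(
    {
        "model": ["model-a", "model-b", "model-c"],
        "date": pd.to_datetime(["2024-01-01", "2024-02-01", "2024-03-01"], utc=True),
        "score": [41.0, 39.5, 47.2],
    }
).sort_values(by="date")

# Keep only the rows that beat every earlier score, mirroring the
# `if current_score > current_max` branch in create_scores_df.
prev_best = df["score"].cummax().shift(fill_value=float("-inf"))
frontier = df[df["score"] > prev_best]
print(frontier)  # model-a and model-c survive; model-b never led
```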