AIR-Bench Space, commit 1e768ec (1 parent: e93c18c), committed by nan

fix: fix the data loader
app.py CHANGED
@@ -42,7 +42,8 @@ def restart_space():
 # restart_space()
 
 raw_data_qa, original_df_qa = get_leaderboard_df(
-    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_1')
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
+print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
 leaderboard_df = original_df_qa.copy()
 
 # (
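For anyone replaying this commit locally, a minimal smoke test of the updated call might look like the sketch below. It assumes the toy-data paths from src/envs.py exist on disk, and the import location of COLS and QA_BENCHMARK_COLS is a guess based on the standard leaderboard template; this diff does not show where they live.

# Sketch only: local smoke test for the new loader call.
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df
from src.display.utils import COLS, QA_BENCHMARK_COLS  # assumed import path, not shown in this diff

raw_data_qa, original_df_qa = get_leaderboard_df(
    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS,
    task='qa', metric='ndcg_at_3')
assert not original_df_qa.empty, "no rows loaded; check the task/metric filters"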
src/envs.py CHANGED
@@ -17,8 +17,8 @@ RESULTS_REPO = f"{OWNER}/results"
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_requests" # os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_results" #os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/requests" # os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/toys/toydata/results" #os.path.join(CACHE_PATH, "eval-results")
 # EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 # EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
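The hard-coded absolute paths above are local debug overrides; the intended defaults are still visible in the trailing comments. One way to keep that convenience without committing a machine-specific path is an environment-variable toggle. The sketch below is only an illustration, and LOCAL_TOYDATA_DIR is a hypothetical variable, not part of the repo.

import os

CACHE_PATH = os.getenv("HF_HOME", ".")

# Hypothetical override: point LOCAL_TOYDATA_DIR at a local toy-data checkout
# to reproduce this commit's debug setup without editing src/envs.py.
LOCAL_TOYDATA_DIR = os.getenv("LOCAL_TOYDATA_DIR")

if LOCAL_TOYDATA_DIR:
    EVAL_REQUESTS_PATH = os.path.join(LOCAL_TOYDATA_DIR, "requests")
    EVAL_RESULTS_PATH = os.path.join(LOCAL_TOYDATA_DIR, "results")
else:
    # Defaults, taken from the commented-out lines in src/envs.py.
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")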
src/leaderboard/read_evals.py CHANGED
@@ -62,19 +62,22 @@ class FullEvalResult:
             results=result_list
         )
 
-    def to_dict(self, task='qa', metric='ndcg_at_1') -> List:
+    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
         """Convert FullEvalResult to a list of dict compatible with our dataframe UI
         """
         results = defaultdict(dict)
         for eval_result in self.results:
             if eval_result.metric != metric:
+                # print(f'result skipped: {metric} != {eval_result.metric}')
                 continue
             if eval_result.task != task:
+                # print(f'result skipped: {task} != {eval_result.task}')
                 continue
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
             results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
             results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
 
+            print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
                 # add result for each domain, language, and dataset
                 domain = result["domain"]
@@ -136,7 +139,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_demo_")[:-7], reverse=True)
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
@@ -152,9 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
             eval_result.update_with_request_file(requests_path)
             latest_date_str = eval_result.date.replace(":", "-")
             model_result_date_str = model_result_filepath.split('/')[-1
-                ].removeprefix("results_demo_").removesuffix(".json")
+                ].removeprefix("results_").removesuffix(".json")
             if latest_date_str != model_result_date_str:
+                print(f'file skipped: {model_result_filepath}')
                 continue
+            print(f'file loaded: {model_result_filepath}')
             eval_name = eval_result.eval_name
             eval_results[eval_name] = eval_result
 
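The prefix change from results_demo_ to results_ has to agree with both the sort key and the date-matching step, and with the renamed toy fixtures at the bottom of this commit. Below is a worked example of that matching, using one of the renamed filenames; the request-file date is assumed to be an ISO timestamp with colons, which is what the replace(":", "-") call implies.

# Illustration only; the filename comes from the renamed toy fixtures.
# str.removeprefix / str.removesuffix require Python 3.9+.
filename = "results_2023-11-21T18-10-08.json"
model_result_date_str = filename.removeprefix("results_").removesuffix(".json")
# -> "2023-11-21T18-10-08"

# Assumed ISO timestamp from the request file; replacing ":" with "-"
# makes it filename-safe and comparable to the string above.
request_date = "2023-11-21T18:10:08"
latest_date_str = request_date.replace(":", "-")

assert latest_date_str == model_result_date_str  # only the matching result file is kept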
src/populate.py CHANGED
@@ -12,11 +12,14 @@ from typing import Tuple
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(f"raw_data loaded: {len(raw_data)}")
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
 
+    print(f'records loaded: {len(all_data_json)}')
     df = pd.DataFrame.from_records(all_data_json)
+    print(f'dataframe created: {df.shape}')
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
     df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
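As a side note on the averaging step (unchanged by this hunk): intersecting benchmark_cols with the dataframe's actual columns is what keeps benchmarks absent from the data from raising a KeyError, and mean(axis=1) skips NaN, so a model missing some benchmarks still gets an average. A tiny self-contained illustration with made-up column names:

import pandas as pd

# Made-up benchmark columns, purely for illustration.
records = [
    {"eval_name": "model_a", "wiki_en": 0.71, "news_zh": 0.64},
    {"eval_name": "model_b", "wiki_en": 0.69},             # news_zh missing -> NaN
]
benchmark_cols = ["wiki_en", "news_zh", "law_en"]          # law_en absent from the data

df = pd.DataFrame.from_records(records)
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
df["average"] = df[list(_benchmark_cols)].mean(axis=1)     # NaN-skipping row mean
print(df.sort_values(by="average", ascending=False))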
tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} RENAMED
File without changes
tests/toydata/test_results/bge-m3/NoReranker/{results_demo_2023-12-21T18-10-08.json → results_2023-12-21T18-10-08.json} RENAMED
File without changes
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/{results_demo_2023-11-21T18-10-08.json → results_2023-11-21T18-10-08.json} RENAMED
File without changes
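The renames keep the toy fixtures consistent with the loader's new results_ prefix. If more fixtures are added later, a quick check like the sketch below (paths relative to the repo root) would catch a stray results_demo_ file; it is not part of this commit.

import glob
import os

# Every toy result file should use the "results_" prefix expected by read_evals.py.
for path in glob.glob("tests/toydata/test_results/**/*.json", recursive=True):
    name = os.path.basename(path)
    assert name.startswith("results_") and not name.startswith("results_demo_"), path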