pminervini committed on
Commit 1ff551c
2 Parent(s): ba4f18b 7993384

Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main

app.py CHANGED
@@ -56,21 +56,14 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
-def init_space(update_model_type_with_open_llm=True):
+def init_space():
     dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
 
     if socket.gethostname() not in {'neuromancer'}:
         # sync model_type with open-llm-leaderboard
         ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
         ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-    # if EVAL_REQUESTS_PATH_OPEN_LLM == '' then we will not update model_type with open-llm-leaderboard
-    if update_model_type_with_open_llm:
-        from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
-        ui_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-    else:
-        EVAL_REQUESTS_PATH_OPEN_LLM = ""
-
-    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EVAL_REQUESTS_PATH_OPEN_LLM, COLS, BENCHMARK_COLS)
+    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
 
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
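Caller-side, the flag is gone rather than defaulted; a minimal sketch of the new entry point, assuming the same return shape as above (the unpacked variable names are illustrative):

# Before this commit, a caller could opt out of the sync explicitly:
#     init_space(update_model_type_with_open_llm=False)
# Now there is a single flag-free entry point; the model-type sync
# moves to the new cli/sync-open-llm-cli.py below.
dataset_df, original_df, finished_df, running_df, pending_df = init_space()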
cli/sync-open-llm-cli.py ADDED
@@ -0,0 +1,92 @@
+import os
+import json
+import glob
+import time
+
+from tqdm import tqdm
+from huggingface_hub import HfApi, snapshot_download
+from src.backend.manage_requests import EvalRequest
+from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND_SYNC
+from src.envs import QUEUE_REPO, API
+from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
+from src.utils import my_snapshot_download
+
+
+def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
+    # Retry the upload up to 10 times, waiting a minute between attempts
+    for _ in range(10):
+        try:
+            set_eval_request(api=api, json_filepath=json_filepath, hf_repo=hf_repo, local_dir=local_dir)
+            return
+        except Exception:
+            time.sleep(60)
+    return
+
+
+def set_eval_request(api: HfApi, json_filepath: str, hf_repo: str, local_dir: str):
+    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(data))
+
+    api.upload_file(path_or_fileobj=json_filepath, path_in_repo=json_filepath.replace(local_dir, ""),
+                    repo_id=hf_repo, repo_type="dataset")
+
+
+def get_request_file_for_model(data, requests_path):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED and RUNNING"""
+    model_name = data["model"]
+    precision = data["precision"]
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
+
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            if req_content["precision"] == precision.split(".")[-1]:
+                request_file = tmp_request_file
+    return request_file
+
+
+def update_model_type(data, requests_path):
+    open_llm_request_file = get_request_file_for_model(data, requests_path)
+
+    try:
+        with open(open_llm_request_file, "r") as f:
+            open_llm_request = json.load(f)
+        data["model_type"] = open_llm_request["model_type"]
+        return True, data
+    except Exception:
+        return False, data
+
+
+def read_and_write_json_files(directory, requests_path_open_llm):
+    # Walk through the directory
+    for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
+        for file in files:
+            # Check if the file is a JSON file
+            if file.endswith('.json'):
+                file_path = os.path.join(subdir, file)
+                # Open and read the JSON file
+                with open(file_path, 'r') as json_file:
+                    data = json.load(json_file)
+                success, data = update_model_type(data, requests_path_open_llm)
+                if success:
+                    with open(file_path, 'w') as json_file:
+                        json.dump(data, json_file)
+                    my_set_eval_request(api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC)
+
+
+if __name__ == "__main__":
+    my_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, revision="main", local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", max_workers=60)
+    my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC, repo_type="dataset", max_workers=60)
+    read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
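The script is presumably run from the repository root so that the src.* imports resolve. It round-trips small per-model JSON request files; a sketch of the shape those files appear to have, as implied by get_request_file_for_model() and update_model_type() above, with purely illustrative field values:

# Hypothetical eval-request payload; the CLI only reads/writes these fields.
example_request = {
    "model": "some-org/some-model",  # globbed as {model}_eval_request_*.json
    "precision": "float16",          # compared against the open-llm file's "precision"
    "model_type": "pretrained",      # copied over from the open-llm-leaderboard entry
}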
src/backend/envs.py CHANGED
@@ -59,6 +59,7 @@ class Tasks(Enum):
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 DEVICE = "cuda" if torch.cuda.is_available() else 'cpu'
src/leaderboard/read_evals.py CHANGED
@@ -128,18 +128,6 @@ class EvalResult:
         except Exception as e:
             print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}")
 
-    def update_model_type_with_open_llm_request_file(self, open_llm_requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model_open_llm(open_llm_requests_path, self.full_model, self.precision.value.name)
-
-        if request_file:
-            try:
-                with open(request_file, "r") as f:
-                    request = json.load(f)
-                self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
-            except Exception as e:
-                pass
-
     def is_complete(self) -> bool:
         for task in Tasks:
             if task.value.benchmark not in self.results:
@@ -216,10 +204,23 @@ def get_request_file_for_model_open_llm(requests_path, model_name, precision):
                 request_file = tmp_request_file
     return request_file
 
+def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
+    """Finds the relevant request file for the current model and updates info with it"""
+    request_file = get_request_file_for_model_open_llm(open_llm_requests_path, result.full_model, result.precision.value.name)
+
+    if request_file:
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            open_llm_model_type = request.get("model_type", "Unknown")
+            if open_llm_model_type != "Unknown":
+                result.model_type = ModelType.from_str(open_llm_model_type)
+        except Exception as e:
+            pass
+    return result
 
 def get_raw_eval_results(results_path: str,
                          requests_path: str,
-                         requests_path_open_llm: Optional[str] = None,
                          is_backend: bool = False) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
@@ -243,8 +244,6 @@ def get_raw_eval_results(results_path: str,
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
         eval_result.update_with_request_file(requests_path)
-        if requests_path_open_llm is not None:
-            eval_result.update_model_type_with_open_llm_request_file(requests_path_open_llm)
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
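Since the updater is now a module-level function that returns the (possibly modified) result, rather than an EvalResult method mutating self, any remaining caller has to import it and map it over its results explicitly. A minimal sketch of the migrated call pattern, using names from this diff (the loop itself is illustrative):

from src.leaderboard.read_evals import get_raw_eval_results, update_model_type_with_open_llm_request_file

results = get_raw_eval_results(results_path, requests_path)
# Optionally overwrite each result's model_type from the open-llm-leaderboard
# request files; skip when no snapshot path is available.
if requests_path_open_llm:
    results = [update_model_type_with_open_llm_request_file(r, requests_path_open_llm) for r in results]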
src/populate.py CHANGED
@@ -1,13 +1,13 @@
 import json
 import os
-
+from tqdm import tqdm
 import copy
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models
-from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_model_type_with_open_llm_request_file
 
 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
@@ -21,6 +21,9 @@ def get_leaderboard_df(results_path: str,
                        is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
-    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
+    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path)
+    if requests_path_open_llm != "":
+        for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
+            raw_data[result_idx] = update_model_type_with_open_llm_request_file(raw_data[result_idx], requests_path_open_llm)
 
     all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
 
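End to end, the empty string now acts as the opt-out sentinel: the Space passes "" so the UI process never reads the open-llm queue, while a process that has downloaded the open-llm request snapshot can pass that path instead. A hedged sketch of both call sites, with the path and column constants as defined elsewhere in the repository:

# UI process (matches init_space in app.py above): no model-type sync.
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)

# Backend/offline process: sync model types from a local open-llm snapshot.
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EVAL_REQUESTS_PATH_OPEN_LLM, COLS, BENCHMARK_COLS)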