danielz02 committed
Commit 1989939
1 Parent(s): b2202ce

Change repository names

Files changed (6)
  1. .idea/.gitignore +8 -0
  2. .idea/aws.xml +11 -0
  3. app.py +21 -20
  4. src/display/utils.py +14 -8
  5. src/envs.py +5 -5
  6. src/submission/submit.py +5 -2
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/aws.xml ADDED
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="accountSettings">
+    <option name="activeRegion" value="us-east-1" />
+    <option name="recentlyUsedRegions">
+      <list>
+        <option value="us-east-1" />
+      </list>
+    </option>
+  </component>
+</project>
app.py CHANGED
@@ -33,6 +33,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -48,7 +49,6 @@ try:
 except Exception:
     restart_space()
 
-
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
@@ -61,13 +61,13 @@ leaderboard_df = original_df.copy()
 
 # Searching and filtering
 def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
+        hidden_df: pd.DataFrame,
+        columns: list,
+        type_query: list,
+        precision_query: str,
+        size_query: list,
+        show_deleted: bool,
+        query: str,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
@@ -87,7 +87,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     # We use COLS to maintain sorting
     filtered_df = df[
         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
-    ]
+        ]
     return filtered_df
 
 
@@ -111,7 +111,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
+        df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -167,7 +167,7 @@ with demo:
                             value=False, label="Show gated/private/deleted models", interactive=True
                         )
                 with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
+                    # with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -195,13 +195,13 @@ with demo:
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
                     + [AutoEvalColumn.dummy.name]
-                ],
+                    ],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                column_widths=["2%", "33%"]
+                column_widths=["2%", "33%"],
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
@@ -224,7 +224,8 @@ with demo:
                 ],
                 leaderboard_table,
             )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
+                             deleted_models_visibility]:
                 selector.change(
                     update_table,
                     [
@@ -250,8 +251,8 @@ with demo:
 
                 with gr.Column():
                     with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
+                            f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                            open=False,
                     ):
                         with gr.Row():
                             finished_eval_table = gr.components.Dataframe(
@@ -261,8 +262,8 @@ with demo:
                                 row_count=5,
                             )
                     with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
+                            f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                            open=False,
                     ):
                         with gr.Row():
                             running_eval_table = gr.components.Dataframe(
@@ -273,8 +274,8 @@ with demo:
                             )
 
                     with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
+                            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                            open=False,
                     ):
                         with gr.Row():
                             pending_eval_table = gr.components.Dataframe(
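
Note on the @@ -224,7 +224,8 @@ hunk above: every filter widget re-runs the same update_table callback. The snippet below is a minimal, self-contained Gradio sketch of that wiring pattern, not the leaderboard's actual code; the toy dataframe, the choices, and the single-argument callback are illustrative assumptions.

import gradio as gr
import pandas as pd

# Toy data standing in for the leaderboard dataframe (illustrative only).
df = pd.DataFrame({"Model": ["model-a", "model-b", "model-c"],
                   "Type": ["pretrained", "chat", "pretrained"]})

def update_table(type_query: list) -> pd.DataFrame:
    # Keep only rows whose Type is among the checked values.
    return df[df["Type"].isin(type_query)]

with gr.Blocks() as demo:
    filter_columns_type = gr.CheckboxGroup(
        label="Model types", choices=["pretrained", "chat"], value=["pretrained", "chat"]
    )
    leaderboard_table = gr.Dataframe(value=df, interactive=False)
    # app.py loops over several selectors; each one triggers the same callback.
    for selector in [filter_columns_type]:
        selector.change(update_table, [filter_columns_type], leaderboard_table)

if __name__ == "__main__":
    demo.launch()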
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.display.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -21,13 +22,13 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+
 ## Leaderboard columns
-auto_eval_column_dict = []
+auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
+                         ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
+                         ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]]
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
@@ -46,7 +47,8 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
-## For the queue columns in the submission tab
+
+# For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
@@ -56,12 +58,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
-## All the model information that we might need
+
+# All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -86,11 +89,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -112,6 +117,7 @@ class Precision(Enum):
             return Precision.qt_GPTQ
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
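
For context on the make_dataclass call that consumes auto_eval_column_dict above, here is a simplified sketch of the same standard-library pattern: each spec is an (attribute name, annotation, default) triple, and the generated frozen class exposes the defaults as class attributes. ExampleEvalColumn and the two columns are hypothetical names for illustration; the real ColumnContent has more fields and is defined in this file.

from dataclasses import dataclass, field, make_dataclass

# Hypothetical, simplified stand-in for the real ColumnContent above.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True

# Same (attribute_name, annotation, default) shape as auto_eval_column_dict.
column_specs = [
    ("model", ColumnContent, field(default=ColumnContent("Model", "markdown"))),
    ("average", ColumnContent, field(default=ColumnContent("Average ⬆️", "number"))),
]

# make_dataclass turns the spec list into a frozen class whose class attributes
# are the ColumnContent defaults, so later code can read ExampleEvalColumn.model.name.
ExampleEvalColumn = make_dataclass("ExampleEvalColumn", column_specs, frozen=True)

print(ExampleEvalColumn.model.name)    # Model
print(ExampleEvalColumn.average.type)  # number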
src/envs.py CHANGED
@@ -5,12 +5,12 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("TOKEN", None)
 
-OWNER = "demo-leaderboard"
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+OWNER = "AI-Secure"
+REPO_ID = f"{OWNER}/llm-trustworthy-leaderboard"
+QUEUE_REPO = f"{OWNER}/llm-trustworthy-leaderboard-requests"
+RESULTS_REPO = f"{OWNER}/llm-trustworthy-leaderboard-results"
 
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
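
To make the effect of the rename concrete: these constants feed the snapshot_download calls visible in the app.py hunks above. A rough sketch follows, with illustrative argument values rather than lines copied from the repository; in particular, repo_type="dataset" and the local directory are assumptions.

from huggingface_hub import snapshot_download

# After this commit, QUEUE_REPO points at the AI-Secure requests repository.
snapshot_download(
    repo_id="AI-Secure/llm-trustworthy-leaderboard-requests",  # QUEUE_REPO
    local_dir="eval-queue",                                    # EVAL_REQUESTS_PATH
    repo_type="dataset",                                       # assumed repo type
    token=None,                                                # TOKEN in the real Space
)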
src/submission/submit.py CHANGED
@@ -14,6 +14,7 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -45,7 +46,8 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN,
+                                                      test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
@@ -114,5 +116,6 @@ def add_new_eval(
     os.remove(out_path)
 
     return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to "
+        "show in the PENDING list."
     )
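
The reformatted call above unpacks three values from is_model_on_hub, which lives in src/submission/check_validity.py and is not part of this diff. A minimal stand-in with the same call shape, using huggingface_hub.model_info, is sketched below; it is an assumption for illustration, not the project's actual implementation.

from typing import Optional, Tuple

from huggingface_hub import model_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError


def is_model_on_hub(model_name: str, revision: str, token: Optional[str] = None,
                    test_tokenizer: bool = False) -> Tuple[bool, str, Optional[object]]:
    """Return (exists, error_message, extra), matching how the call site unpacks it."""
    try:
        info = model_info(model_name, revision=revision, token=token)
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return False, "was not found on the Hugging Face Hub", None
    except HfHubHTTPError as err:
        return False, f"could not be checked on the Hub ({err})", None
    # test_tokenizer is accepted only for signature compatibility in this sketch;
    # the real helper may additionally try to load the tokenizer.
    return True, "", info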