danielz02
committed on
Change repository names
Browse files
- .idea/.gitignore +8 -0
- .idea/aws.xml +11 -0
- app.py +21 -20
- src/display/utils.py +14 -8
- src/envs.py +5 -5
- src/submission/submit.py +5 -2
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
.idea/aws.xml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="accountSettings">
|
4 |
+
<option name="activeRegion" value="us-east-1" />
|
5 |
+
<option name="recentlyUsedRegions">
|
6 |
+
<list>
|
7 |
+
<option value="us-east-1" />
|
8 |
+
</list>
|
9 |
+
</option>
|
10 |
+
</component>
|
11 |
+
</project>
|
app.py
CHANGED
@@ -33,6 +33,7 @@ from src.submission.submit import add_new_eval
|
|
33 |
def restart_space():
|
34 |
API.restart_space(repo_id=REPO_ID, token=TOKEN)
|
35 |
|
|
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
snapshot_download(
|
@@ -48,7 +49,6 @@ try:
|
|
48 |
except Exception:
|
49 |
restart_space()
|
50 |
|
51 |
-
|
52 |
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
leaderboard_df = original_df.copy()
|
54 |
|
@@ -61,13 +61,13 @@ leaderboard_df = original_df.copy()
|
|
61 |
|
62 |
# Searching and filtering
|
63 |
def update_table(
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
):
|
72 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
73 |
filtered_df = filter_queries(query, filtered_df)
|
@@ -87,7 +87,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
|
87 |
# We use COLS to maintain sorting
|
88 |
filtered_df = df[
|
89 |
always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
|
90 |
-
|
91 |
return filtered_df
|
92 |
|
93 |
|
@@ -111,7 +111,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
|
|
111 |
|
112 |
|
113 |
def filter_models(
|
114 |
-
|
115 |
) -> pd.DataFrame:
|
116 |
# Show all models
|
117 |
if show_deleted:
|
@@ -167,7 +167,7 @@ with demo:
|
|
167 |
value=False, label="Show gated/private/deleted models", interactive=True
|
168 |
)
|
169 |
with gr.Column(min_width=320):
|
170 |
-
#with gr.Box(elem_id="box-filter"):
|
171 |
filter_columns_type = gr.CheckboxGroup(
|
172 |
label="Model types",
|
173 |
choices=[t.to_str() for t in ModelType],
|
@@ -195,13 +195,13 @@ with demo:
|
|
195 |
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
196 |
+ shown_columns.value
|
197 |
+ [AutoEvalColumn.dummy.name]
|
198 |
-
|
199 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
200 |
datatype=TYPES,
|
201 |
elem_id="leaderboard-table",
|
202 |
interactive=False,
|
203 |
visible=True,
|
204 |
-
column_widths=["2%", "33%"]
|
205 |
)
|
206 |
|
207 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
@@ -224,7 +224,8 @@ with demo:
|
|
224 |
],
|
225 |
leaderboard_table,
|
226 |
)
|
227 |
-
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
|
|
|
228 |
selector.change(
|
229 |
update_table,
|
230 |
[
|
@@ -250,8 +251,8 @@ with demo:
|
|
250 |
|
251 |
with gr.Column():
|
252 |
with gr.Accordion(
|
253 |
-
|
254 |
-
|
255 |
):
|
256 |
with gr.Row():
|
257 |
finished_eval_table = gr.components.Dataframe(
|
@@ -261,8 +262,8 @@ with demo:
|
|
261 |
row_count=5,
|
262 |
)
|
263 |
with gr.Accordion(
|
264 |
-
|
265 |
-
|
266 |
):
|
267 |
with gr.Row():
|
268 |
running_eval_table = gr.components.Dataframe(
|
@@ -273,8 +274,8 @@ with demo:
|
|
273 |
)
|
274 |
|
275 |
with gr.Accordion(
|
276 |
-
|
277 |
-
|
278 |
):
|
279 |
with gr.Row():
|
280 |
pending_eval_table = gr.components.Dataframe(
|
|
|
33 |
def restart_space():
|
34 |
API.restart_space(repo_id=REPO_ID, token=TOKEN)
|
35 |
|
36 |
+
|
37 |
try:
|
38 |
print(EVAL_REQUESTS_PATH)
|
39 |
snapshot_download(
|
|
|
49 |
except Exception:
|
50 |
restart_space()
|
51 |
|
|
|
52 |
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
leaderboard_df = original_df.copy()
|
54 |
|
|
|
61 |
|
62 |
# Searching and filtering
|
63 |
def update_table(
|
64 |
+
hidden_df: pd.DataFrame,
|
65 |
+
columns: list,
|
66 |
+
type_query: list,
|
67 |
+
precision_query: str,
|
68 |
+
size_query: list,
|
69 |
+
show_deleted: bool,
|
70 |
+
query: str,
|
71 |
):
|
72 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
73 |
filtered_df = filter_queries(query, filtered_df)
|
|
|
87 |
# We use COLS to maintain sorting
|
88 |
filtered_df = df[
|
89 |
always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
|
90 |
+
]
|
91 |
return filtered_df
|
92 |
|
93 |
|
|
|
111 |
|
112 |
|
113 |
def filter_models(
|
114 |
+
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
|
115 |
) -> pd.DataFrame:
|
116 |
# Show all models
|
117 |
if show_deleted:
|
|
|
167 |
value=False, label="Show gated/private/deleted models", interactive=True
|
168 |
)
|
169 |
with gr.Column(min_width=320):
|
170 |
+
# with gr.Box(elem_id="box-filter"):
|
171 |
filter_columns_type = gr.CheckboxGroup(
|
172 |
label="Model types",
|
173 |
choices=[t.to_str() for t in ModelType],
|
|
|
195 |
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
196 |
+ shown_columns.value
|
197 |
+ [AutoEvalColumn.dummy.name]
|
198 |
+
],
|
199 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
200 |
datatype=TYPES,
|
201 |
elem_id="leaderboard-table",
|
202 |
interactive=False,
|
203 |
visible=True,
|
204 |
+
column_widths=["2%", "33%"]
|
205 |
)
|
206 |
|
207 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
|
|
224 |
],
|
225 |
leaderboard_table,
|
226 |
)
|
227 |
+
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
|
228 |
+
deleted_models_visibility]:
|
229 |
selector.change(
|
230 |
update_table,
|
231 |
[
|
|
|
251 |
|
252 |
with gr.Column():
|
253 |
with gr.Accordion(
|
254 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
255 |
+
open=False,
|
256 |
):
|
257 |
with gr.Row():
|
258 |
finished_eval_table = gr.components.Dataframe(
|
|
|
262 |
row_count=5,
|
263 |
)
|
264 |
with gr.Accordion(
|
265 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
266 |
+
open=False,
|
267 |
):
|
268 |
with gr.Row():
|
269 |
running_eval_table = gr.components.Dataframe(
|
|
|
274 |
)
|
275 |
|
276 |
with gr.Accordion(
|
277 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
278 |
+
open=False,
|
279 |
):
|
280 |
with gr.Row():
|
281 |
pending_eval_table = gr.components.Dataframe(
|
src/display/utils.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
|
|
5 |
|
6 |
from src.display.about import Tasks
|
7 |
|
|
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
10 |
|
@@ -21,13 +22,13 @@ class ColumnContent:
|
|
21 |
never_hidden: bool = False
|
22 |
dummy: bool = False
|
23 |
|
|
|
24 |
## Leaderboard columns
|
25 |
-
auto_eval_column_dict = []
|
|
|
|
|
26 |
# Init
|
27 |
-
|
28 |
-
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
29 |
-
#Scores
|
30 |
-
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
31 |
for task in Tasks:
|
32 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
33 |
# Model information
|
@@ -46,7 +47,8 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
|
|
46 |
# We use make dataclass to dynamically fill the scores from Tasks
|
47 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
48 |
|
49 |
-
|
|
|
50 |
@dataclass(frozen=True)
|
51 |
class EvalQueueColumn: # Queue column
|
52 |
model = ColumnContent("model", "markdown", True)
|
@@ -56,12 +58,13 @@ class EvalQueueColumn: # Queue column
|
|
56 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
57 |
status = ColumnContent("status", "str", True)
|
58 |
|
59 |
-
|
|
|
60 |
@dataclass
|
61 |
class ModelDetails:
|
62 |
name: str
|
63 |
display_name: str = ""
|
64 |
-
symbol: str = ""
|
65 |
|
66 |
|
67 |
class ModelType(Enum):
|
@@ -86,11 +89,13 @@ class ModelType(Enum):
|
|
86 |
return ModelType.IFT
|
87 |
return ModelType.Unknown
|
88 |
|
|
|
89 |
class WeightType(Enum):
|
90 |
Adapter = ModelDetails("Adapter")
|
91 |
Original = ModelDetails("Original")
|
92 |
Delta = ModelDetails("Delta")
|
93 |
|
|
|
94 |
class Precision(Enum):
|
95 |
float16 = ModelDetails("float16")
|
96 |
bfloat16 = ModelDetails("bfloat16")
|
@@ -112,6 +117,7 @@ class Precision(Enum):
|
|
112 |
return Precision.qt_GPTQ
|
113 |
return Precision.Unknown
|
114 |
|
|
|
115 |
# Column selection
|
116 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
117 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
5 |
|
6 |
from src.display.about import Tasks
|
7 |
|
8 |
+
|
9 |
def fields(raw_class):
|
10 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
11 |
|
|
|
22 |
never_hidden: bool = False
|
23 |
dummy: bool = False
|
24 |
|
25 |
+
|
26 |
## Leaderboard columns
|
27 |
+
auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
|
28 |
+
["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
|
29 |
+
["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]]
|
30 |
# Init
|
31 |
+
# Scores
|
|
|
|
|
|
|
32 |
for task in Tasks:
|
33 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
34 |
# Model information
|
|
|
47 |
# We use make dataclass to dynamically fill the scores from Tasks
|
48 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
49 |
|
50 |
+
|
51 |
+
# For the queue columns in the submission tab
|
52 |
@dataclass(frozen=True)
|
53 |
class EvalQueueColumn: # Queue column
|
54 |
model = ColumnContent("model", "markdown", True)
|
|
|
58 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
59 |
status = ColumnContent("status", "str", True)
|
60 |
|
61 |
+
|
62 |
+
# All the model information that we might need
|
63 |
@dataclass
|
64 |
class ModelDetails:
|
65 |
name: str
|
66 |
display_name: str = ""
|
67 |
+
symbol: str = "" # emoji
|
68 |
|
69 |
|
70 |
class ModelType(Enum):
|
|
|
89 |
return ModelType.IFT
|
90 |
return ModelType.Unknown
|
91 |
|
92 |
+
|
93 |
class WeightType(Enum):
|
94 |
Adapter = ModelDetails("Adapter")
|
95 |
Original = ModelDetails("Original")
|
96 |
Delta = ModelDetails("Delta")
|
97 |
|
98 |
+
|
99 |
class Precision(Enum):
|
100 |
float16 = ModelDetails("float16")
|
101 |
bfloat16 = ModelDetails("bfloat16")
|
|
|
117 |
return Precision.qt_GPTQ
|
118 |
return Precision.Unknown
|
119 |
|
120 |
+
|
121 |
# Column selection
|
122 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
123 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
src/envs.py
CHANGED
@@ -5,12 +5,12 @@ from huggingface_hub import HfApi
|
|
5 |
# clone / pull the lmeh eval data
|
6 |
TOKEN = os.environ.get("TOKEN", None)
|
7 |
|
8 |
-
OWNER = "
|
9 |
-
REPO_ID = f"{OWNER}/leaderboard"
|
10 |
-
QUEUE_REPO = f"{OWNER}/requests"
|
11 |
-
RESULTS_REPO = f"{OWNER}/results"
|
12 |
|
13 |
-
CACHE_PATH=os.getenv("HF_HOME", ".")
|
14 |
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
|
|
5 |
# clone / pull the lmeh eval data
|
6 |
TOKEN = os.environ.get("TOKEN", None)
|
7 |
|
8 |
+
OWNER = "AI-Secure"
|
9 |
+
REPO_ID = f"{OWNER}/llm-trustworthy-leaderboard"
|
10 |
+
QUEUE_REPO = f"{OWNER}/llm-trustworthy-leaderboard-requests"
|
11 |
+
RESULTS_REPO = f"{OWNER}/llm-trustworthy-leaderboard-results"
|
12 |
|
13 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
14 |
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
src/submission/submit.py
CHANGED
@@ -14,6 +14,7 @@ from src.submission.check_validity import (
|
|
14 |
REQUESTED_MODELS = None
|
15 |
USERS_TO_SUBMISSION_DATES = None
|
16 |
|
|
|
17 |
def add_new_eval(
|
18 |
model: str,
|
19 |
base_model: str,
|
@@ -45,7 +46,8 @@ def add_new_eval(
|
|
45 |
|
46 |
# Is the model on the hub?
|
47 |
if weight_type in ["Delta", "Adapter"]:
|
48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN,
|
|
|
49 |
if not base_model_on_hub:
|
50 |
return styled_error(f'Base model "{base_model}" {error}')
|
51 |
|
@@ -114,5 +116,6 @@ def add_new_eval(
|
|
114 |
os.remove(out_path)
|
115 |
|
116 |
return styled_message(
|
117 |
-
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to
|
|
|
118 |
)
|
|
|
14 |
REQUESTED_MODELS = None
|
15 |
USERS_TO_SUBMISSION_DATES = None
|
16 |
|
17 |
+
|
18 |
def add_new_eval(
|
19 |
model: str,
|
20 |
base_model: str,
|
|
|
46 |
|
47 |
# Is the model on the hub?
|
48 |
if weight_type in ["Delta", "Adapter"]:
|
49 |
+
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN,
|
50 |
+
test_tokenizer=True)
|
51 |
if not base_model_on_hub:
|
52 |
return styled_error(f'Base model "{base_model}" {error}')
|
53 |
|
|
|
116 |
os.remove(out_path)
|
117 |
|
118 |
return styled_message(
|
119 |
+
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to "
|
120 |
+
"show in the PENDING list."
|
121 |
)
|