Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- app.py +191 -0
- data/eval_board.csv +4 -0
- requirements.txt +4 -0
- src/__pycache__/css_html.cpython-38.pyc +0 -0
- src/__pycache__/demo.cpython-38.pyc +0 -0
- src/__pycache__/utils.cpython-38.pyc +0 -0
- src/css_html.py +79 -0
- src/demo.py +44 -0
- src/utils.py +65 -0
app.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
from src.css_html import custom_css
|
5 |
+
from src.utils import (
|
6 |
+
AutoEvalColumn,
|
7 |
+
fields,
|
8 |
+
make_clickable_names,
|
9 |
+
make_plot_data
|
10 |
+
)
|
11 |
+
from src.demo import (
|
12 |
+
generate,
|
13 |
+
random_examples,
|
14 |
+
)
|
15 |
+
|
16 |
+
|
17 |
+
# Initial value of the "System prompt" textbox in the demo section.
DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
# Upper bound and initial value of the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 512


# Leaderboard table; read once when the module is loaded.
df = pd.read_csv("data/eval_board.csv")

# Column metadata lives in class attributes of AutoEvalColumn. Collect it once
# and derive every list from the same snapshot instead of re-scanning the
# class for each of them.
_ALL_COLUMNS = fields(AutoEvalColumn)
_DEFAULT_COLUMNS = [
    c for c in _ALL_COLUMNS if c.displayed_by_default and not c.hidden
]

# Names / datatypes of every declared column, in declaration order.
COLS = [c.name for c in _ALL_COLUMNS]
TYPES = [c.type for c in _ALL_COLUMNS]
# Same, restricted to the columns that are shown by default and not hidden.
COLS_LITE = [c.name for c in _DEFAULT_COLUMNS]
TYPES_LITE = [c.type for c in _DEFAULT_COLUMNS]
|
28 |
+
|
29 |
+
|
30 |
+
def select_columns(df, columns):
    """Return ``df`` reduced to the model column plus the requested columns.

    Args:
        df: Table to filter; it must contain the model column.
        columns: Names of the optional columns to keep.

    Returns:
        The selected sub-frame. The model column comes first; the remaining
        columns follow the order of ``COLS``, not the order of ``columns``.

    Raises:
        KeyError: If ``df`` has no model column (raised by the pandas lookup).
    """
    always_here_cols = [AutoEvalColumn.model.name]
    requested = set(columns)
    # We use COLS to maintain sorting. Pinned columns are skipped here so that
    # a caller who also lists them in `columns` does not get them twice.
    optional_cols = [
        c
        for c in COLS
        if c in requested and c in df.columns and c not in always_here_cols
    ]
    return df[always_here_cols + optional_cols]
|
39 |
+
|
40 |
+
|
41 |
+
# Keep the plain model names in "pure_name" before "Models" is rewritten into
# HTML links; make_plot_data reads "pure_name" for its "Symbol" column.
df["pure_name"] = df['Models']
df = make_clickable_names(df)
demo = gr.Blocks(css=custom_css)

# NOTE(review): the extracted source carried no indentation, so the nesting of
# the `with` blocks below is a reconstruction — confirm it against the
# deployed layout.
with demo:
    with gr.Row():
        # NOTE(review): the paper link in this text points at the Open LLM
        # Leaderboard Space rather than at the paper — confirm the intended URL.
        gr.Markdown(
            """<div style="text-align: center;"><h1> 🤖ConvRe🤯 <span style='color: #e6b800;'>Leaderboard</span></h1></div>\
<br>\
<p> 🤖ConvRe🤯 is the benchmark proposed in our EMNLP 2023 paper: <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"> An Investigation of LLMs’ Inefficacy in Understanding Converse Relations</a>. It aims to evaluate LLMs' ability on understanding converse relations. Converse relation is defined as the opposite of semantic relation while keeping the surface form of the triple unchanged. For example, the triple (x, has part, y) is interpreted as "x has a part called y" in normal relation, while "y has a part called x" in converse relation 🔁.

The experiments in our paper suggested that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark even for powerful models like GPT-4.
</p>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔢 Data", id=0):
            with gr.Accordion("➡️ See All Columns", open=False):
                # The model column is always displayed, so it is not offered
                # as a choice.
                shown_columns = gr.CheckboxGroup(
                    choices=[c for c in COLS if c != AutoEvalColumn.model.name],
                    value=[c for c in COLS_LITE if c != AutoEvalColumn.model.name],
                    label="",
                    elem_id="column-select",
                    interactive=True,
                )

            # Initial view: the model column followed by the pre-ticked columns.
            visible_columns = [AutoEvalColumn.model.name] + shown_columns.value
            leaderboard_df_re2text = gr.components.Dataframe(
                value=df[visible_columns],
                headers=visible_columns,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
            )

            # Invisible copy of the complete table; it is the data input of
            # the column-selection callback below.
            hidden_leaderboard_df_re2text = gr.components.Dataframe(
                value=df,
                headers=COLS,
                datatype=["str"] * len(COLS),
                visible=False,
            )

            # Re-filter the visible table whenever the ticked columns change.
            shown_columns.change(
                select_columns,
                [hidden_leaderboard_df_re2text, shown_columns],
                leaderboard_df_re2text,
            )

        with gr.TabItem("📊 Plot", id=1):
            with gr.Row():
                # One identically configured line plot per task, side by side.
                for task in ("Re2Text", "Text2Re"):
                    with gr.Column():
                        gr.LinePlot(
                            make_plot_data(df, task=task),
                            x="Setting",
                            y="Accuracy",
                            color="Symbol",
                            title=task,
                            y_lim=[0, 100],
                            x_label_angle=0,
                            height=400,
                            width=500,
                        )

    with gr.Column():
        gr.Markdown(
            """<div style="text-align: center;"><h2> 🤖ConvRe🤯 Demo </h2></div>\
<br>\
""",
            elem_classes="markdown-text",
        )

        output_box = gr.Textbox(lines=10, max_lines=10, label="ChatBot")

        input_box = gr.Textbox(lines=12, max_lines=12, label="Input")

        with gr.Row():
            gr.ClearButton([input_box, output_box])
            # NOTE(review): no click handler is attached to this button in the
            # visible code, and the imported `generate` is never called —
            # presumably still to be wired up.
            gr.Button("Submit")

        with gr.Row():
            # Each button fills the input box with a random benchmark query.
            # The dataset key is passed through an invisible text component
            # because event inputs have to be components.
            re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
            re2text_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-easy", visible=False),
                outputs=input_box,
                api_name="re2text-easy",
            )

            re2text_hard_btn = gr.Button("Random Re2Text Hard Example 🤯")
            re2text_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-hard", visible=False),
                outputs=input_box,
            )

            text2re_easy_btn = gr.Button("Random Text2Re Easy Example 😄")
            text2re_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-easy", visible=False),
                outputs=input_box,
            )

            text2re_hard_btn = gr.Button("Random Text2Re Hard Example 🤯")
            text2re_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-hard", visible=False),
                outputs=input_box,
            )

        with gr.Accordion("Additional Inputs", open=False):
            # These controls are created for display only; nothing in the
            # visible code reads their values.
            gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)

            gr.Slider(
                label="Max new tokens",
                minimum=1,
                maximum=MAX_MAX_NEW_TOKENS,
                step=1,
                value=DEFAULT_MAX_NEW_TOKENS,
            )

            gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=4.0,
                step=0.05,
                value=0,
            )

demo.launch()
|
data/eval_board.csv
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Links
|
2 |
+
llama-2-7b-chat-hf,0,0,0,0,0,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
3 |
+
qwen-7b-chat,0,0,0,0,0,https://huggingface.co/Qwen/Qwen-7B-Chat
|
4 |
+
internlm-7b-chat,0,0,0,0,0,https://huggingface.co/internlm/internlm-chat-7b
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.47.1
|
2 |
+
torch==2.0.0
|
3 |
+
transformers==4.34.0
|
4 |
+
datasets==2.14.5
|
src/__pycache__/css_html.cpython-38.pyc
ADDED
Binary file (1.44 kB). View file
|
|
src/__pycache__/demo.cpython-38.pyc
ADDED
Binary file (1.22 kB). View file
|
|
src/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (2.3 kB). View file
|
|
src/css_html.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
|
2 |
+
# Stylesheet text; app.py passes it to gr.Blocks(css=...).
custom_css = """
#changelog-text {
    font-size: 16px !important;
}

#changelog-text h2 {
    font-size: 18px !important;
}

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
    display: none;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
table td:first-child,
table th:first-child {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
"""
|
src/demo.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
from threading import Thread
|
4 |
+
from typing import Iterable
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from huggingface_hub import HfApi
|
8 |
+
from datasets import load_dataset
|
9 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
10 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
11 |
+
|
12 |
+
|
13 |
+
# Hugging Face access token from the environment (None when HF_TOKEN is unset).
# NOTE(review): TOKEN is not referenced anywhere in the visible code; the
# dataset loads below pass token=True instead — confirm which is intended.
TOKEN = os.environ.get("HF_TOKEN", None)


model_id = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)


# Prompt splits served by random_examples, keyed by "<task>-<difficulty>".
# All four splits are loaded when this module is imported, in the order listed.
type2dataset = {
    key: load_dataset('3B-Group/ConvRe', config, token=True, split=split)
    for key, config, split in (
        ("re2text-easy", "en-re2text", "prompt1"),
        ("re2text-hard", "en-re2text", "prompt4"),
        ("text2re-easy", "en-text2re", "prompt1"),
        ("text2re-hard", "en-text2re", "prompt3"),
    )
}
|
27 |
+
|
28 |
+
# type2dataset = {}
|
29 |
+
|
30 |
+
|
31 |
+
def generate():
    """Return the fixed placeholder reply ``"1"``.

    This is a stub: it takes no input and does not call any model.
    """
    return "1"
|
33 |
+
|
34 |
+
|
35 |
+
def random_examples(dataset_key) -> str:
    """Return the ``query`` field of one randomly chosen example.

    Args:
        dataset_key: Key into ``type2dataset``, e.g. ``"re2text-easy"``.

    Returns:
        The value stored under ``'query'`` in the selected item.

    Raises:
        KeyError: If ``dataset_key`` is not a key of ``type2dataset``.
        ValueError: If the selected dataset is empty (raised by
            ``random.randrange``).
    """
    target_dataset = type2dataset[dataset_key]
    # randrange(n) draws uniformly from 0..n-1, i.e. every valid index.
    item = target_dataset[random.randrange(len(target_dataset))]
    return item['query']
|
42 |
+
|
43 |
+
|
44 |
+
|
src/utils.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
|
5 |
+
@dataclass
class ColumnContent:
    """Metadata describing a single leaderboard column."""

    # Column header; app.py also uses it to select the column from the table.
    name: str
    # Datatype label collected into the table's `datatype` list
    # ("markdown", "number" or "str" in this file).
    type: str
    # True when the column belongs to the default ("lite") column selection.
    displayed_by_default: bool
    # True excludes the column from the default selection.
    hidden: bool = False
|
11 |
+
|
12 |
+
|
13 |
+
def fields(raw_class):
    """Return the values of the non-dunder attributes defined on ``raw_class``.

    Only attributes stored directly in the class's own ``__dict__`` are
    considered (inherited ones are not), and they are returned in definition
    order. A name is skipped when it starts *or* ends with a double
    underscore.

    Args:
        raw_class: The class whose attributes are collected.

    Returns:
        A list with the attribute values (not their names).
    """
    return [
        value
        for name, value in vars(raw_class).items()
        if not (name.startswith("__") or name.endswith("__"))
    ]
|
15 |
+
|
16 |
+
|
17 |
+
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Declaration of the leaderboard columns, one class attribute per column.

    The attributes carry no type annotations, so ``@dataclass`` registers no
    fields for them; they stay plain class attributes that ``fields()`` reads
    from the class ``__dict__`` in the order written here.
    """

    # Model name column, rendered as markdown.
    model = ColumnContent("Models", "markdown", True)
    # Score columns, one per task/difficulty pair, followed by the average.
    re2text_easy = ColumnContent("Re2Text-Easy", "number", True)
    text2re_easy = ColumnContent("Text2Re-Easy", "number", True)
    re2text_hard = ColumnContent("Re2Text-Hard", "number", True)
    text2re_hard = ColumnContent("Text2Re-Hard", "number", True)
    avg = ColumnContent("Avg", "number", True)

    # Model URL column; not part of the default selection.
    link = ColumnContent("Links", "str", False)
|
27 |
+
|
28 |
+
|
29 |
+
def model_hyperlink(link, model_name):
    """Return an HTML anchor that shows ``model_name`` and opens ``link``.

    Both values are inserted into the markup verbatim (no escaping); the
    anchor opens in a new tab (``target="_blank"``).

    Args:
        link: URL placed in the ``href`` attribute.
        model_name: Text shown inside the anchor.

    Returns:
        The anchor element as a string.
    """
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
31 |
+
|
32 |
+
|
33 |
+
def make_clickable_names(df):
    """Replace each value in the "Models" column with a link to its "Links" URL.

    The frame is modified in place and also returned.

    Args:
        df: Frame with a "Models" and a "Links" column.

    Returns:
        The same ``df`` object, with "Models" holding HTML anchors.

    Raises:
        KeyError: If either column is missing.
    """
    # Pair the two columns positionally instead of using a row-wise
    # df.apply(axis=1): this avoids building a Series per row and also works
    # for a frame without rows, where apply does not yield a Series to assign.
    df["Models"] = [
        model_hyperlink(link, name)
        for link, name in zip(df["Links"], df["Models"])
    ]
    return df
|
38 |
+
|
39 |
+
|
40 |
+
def make_plot_data(df, task):
    """Reshape one task's Easy/Hard scores into long format for plotting.

    Every row of ``df`` yields two output rows, the Easy one first and the
    Hard one second.

    Args:
        df: Frame with the columns "pure_name", "<task>-Easy" and
            "<task>-Hard".
        task: Column-name prefix, e.g. "Re2Text".

    Returns:
        A new DataFrame with the columns "Symbol" (the row's "pure_name"),
        "Setting" (the source column name) and "Accuracy" (the score).

    Raises:
        KeyError: If one of the three columns is missing.
    """
    easy_col = f"{task}-Easy"
    hard_col = f"{task}-Hard"

    symbols = []
    settings = []
    accuracies = []
    # Walk the three columns in lockstep rather than looking cells up by index
    # label, so frames with repeated index labels are handled as well.
    for name, easy, hard in zip(df["pure_name"], df[easy_col], df[hard_col]):
        symbols += [name, name]
        settings += [easy_col, hard_col]
        accuracies += [easy, hard]

    return pd.DataFrame(
        {
            "Symbol": symbols,
            "Setting": settings,
            "Accuracy": accuracies,
        }
    )
|
63 |
+
|
64 |
+
|
65 |
+
|