Trayvon committed
Commit 6d33199
1 Parent(s): 96e5d6e

Upload 10 files

app.py CHANGED
@@ -11,6 +11,7 @@ from src.utils import (
 from src.demo import (
     generate,
     random_examples,
+    return_ground_truth,
 )
 
 
@@ -66,14 +67,21 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     with gr.Row():
         gr.Markdown(
-            """<div style="text-align: center;"><h1> 🤖ConvRe🤯 <span style='color: #e6b800;'>Leaderboard</span></h1></div>\
-            <br>\
-            <p> 🤖ConvRe🤯 is the benchmark proposed in our EMNLP 2023 paper: <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"> An Investigation of LLMs’ Inefficacy in Understanding Converse Relations</a>. It aims to evaluate LLMs' ability on understanding converse relations. Converse relation is defined as the opposite of semantic relation while keeping the surface form of the triple unchanged. For example, the triple (x, has part, y) is interpreted as "x has a part called y" in normal relation, while "y has a part called x" in converse relation 🔁.
 
-            The experiments in our paper suggested that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark even for powerful models like GPT-4.
-            </p>""",
+            """<div align="center">
+            <h1>🤖 ConvRe 🤯 <span style='color: #e6b800;'>Leaderboard</span></h1>
+            </div>
+            """,
             elem_classes="markdown-text",
         )
+
+    gr.Markdown("""🤖**ConvRe**🤯 is the benchmark proposed in our EMNLP 2023 main conference paper: [An Investigation of LLMs’ Inefficacy in Understanding Converse Relations]().
+    It aims to evaluate LLMs' ability to understand converse relations.
+    A converse relation is defined as the opposite of a semantic relation while the surface form of the triple is kept unchanged.
+    For example, the triple `(x, has part, y)` is interpreted as "x has a part called y" under the normal relation, but as "y has a part called x" under the converse relation 🔁.
+
+    The experiments in our paper suggest that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark, even powerful models like GPT-4.
+    """, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🔢 Data", id=0):
@@ -145,19 +153,48 @@ with demo:
             )
 
         with gr.TabItem("Submit results 🚀", id=3):
-            gr.Markdown("Submit Here")
+            gr.Markdown("""<div align="center">
+            <h1>Coming Soon ❤️</h1>
+            </div>
+            """)
 
     with gr.Column():
         gr.Markdown(
-            """<div style="text-align: center;"><h2> 🤖ConvRe🤯 Demo </h2></div>\
+            """<div style="text-align: center;"><h1> 🤖ConvRe🤯 Demo (Llama-2-Chat-7B🦙) </h1></div>\
             <br>\
             """,
             elem_classes="markdown-text",
         )
 
-        output_box = gr.Textbox(lines=10, max_lines=10, label="ChatBot")
-
-        input_box = gr.Textbox(lines=12, max_lines=12, label="Input")
+        output_box = gr.Textbox(lines=10, max_lines=10, label="Llama-2-Chat-7B Answer", interactive=False)
+
+        input_box = gr.Textbox(lines=12, max_lines=12, label="User Input")
+
+        ground_truth_display = gr.Textbox("", lines=1, max_lines=1, label="😊Correct Answer😊", interactive=False)
+
+        with gr.Column():
+            with gr.Accordion("Additional Inputs", open=False):
+                sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
+
+                max_new_tokens = gr.Slider(
+                    label="Max new tokens",
+                    minimum=1,
+                    maximum=MAX_MAX_NEW_TOKENS,
+                    step=1,
+                    value=DEFAULT_MAX_NEW_TOKENS,
+                )
+
+                temperature = gr.Slider(
+                    label="Temperature",
+                    minimum=0.1,
+                    maximum=4.0,
+                    step=0.1,
+                    value=0.1,
+                )
 
     with gr.Row():
         re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
@@ -188,28 +225,13 @@ with demo:
         outputs = input_box,
     )
 
-    with gr.Accordion("Additional Inputs", open=False):
-        sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
-
-        gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        )
-
-        gr.Slider(
-            label="Temperature",
-            minimum=0,
-            maximum=4.0,
-            step=0.05,
-            value=0,
-        )
-
     with gr.Row():
         gr.ClearButton([input_box, output_box])
-        submit_btn = gr.Button("Submit")
-        submit_btn.click(generate, inputs=[input_box, sys_prompt], outputs=[output_box])
+        submit_btn = gr.Button("Submit🏃")
+        submit_btn.click(generate, inputs=[input_box, sys_prompt, temperature, max_new_tokens], outputs=[output_box])
+
+        answer_btn = gr.Button("Answer🤔")
+        answer_btn.click(return_ground_truth, inputs=[], outputs=[ground_truth_display])
 
-demo.launch()
+# .queue() already enables queuing; the old launch(enable_queue=True) kwarg is deprecated.
+demo.queue(max_size=32).launch()
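The behavioral change that ties this file together: `generate` is now a Python generator (see the `src/demo.py` diff below), so Gradio streams each yielded string into `output_box`, and generator handlers require the queue, which is why `demo.launch()` becomes `demo.queue(...).launch()`. A minimal self-contained sketch of that wiring, with a hypothetical `fake_generate` stub standing in for the real model call:

```python
import time

import gradio as gr


def fake_generate(prompt: str):
    # Stub for the real model call: repeatedly yield the answer-so-far,
    # the same shape as the streaming generate() in src/demo.py.
    answer = ""
    for word in f"Echo: {prompt}".split():
        answer += word + " "
        time.sleep(0.1)
        yield answer


with gr.Blocks() as demo:
    input_box = gr.Textbox(label="User Input")
    output_box = gr.Textbox(label="Answer", interactive=False)
    gr.Button("Submit").click(fake_generate, inputs=[input_box], outputs=[output_box])

# Generator handlers need the queue enabled; each yield updates the textbox in place.
demo.queue(max_size=32).launch()
```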
data/eval_board.csv CHANGED
@@ -1,7 +1,11 @@
-Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Links
-gpt-4-0314,99.0,94.1,15.6,17.7,56.6,https://openai.com/research/gpt-4
-gpt-3.5-turbo,83.4,60.8,16.0,39.4,49.9,https://chat.openai.com/
-text-davinci-003,85.7,84.0,23.7,34.4,57.0,https://platform.openai.com/docs/models/gpt-3-5
-llama-2-7b-chat-hf,0,0,0,0,0,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-qwen-7b-chat,0,0,0,0,0,https://huggingface.co/Qwen/Qwen-7B-Chat
-internlm-7b-chat,0,0,0,0,0,https://huggingface.co/internlm/internlm-chat-7b
+Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Model Size,Links
+gpt-4-0314,98.7,93.6,16.4,17.1,56.5,unknown,https://openai.com/research/gpt-4
+gpt-3.5-turbo-0301,83.5,60.7,59.0,39.0,60.6,unknown,https://chat.openai.com/
+text-davinci-003,85.4,83.8,55.8,34.8,65.0,175B,https://platform.openai.com/docs/models/gpt-3-5
+claude-instant-1.1,65.7,87.2,52.3,26.2,57.9,unknown,https://www.anthropic.com/index/introducing-claude
+claude-1.3,89.7,82.3,37.3,56.6,66.5,unknown,https://www.anthropic.com/index/introducing-claude
+flan-t5-xxl,79.4,96.8,20.7,4.8,50.4,11B,https://huggingface.co/google/flan-t5-xxl
+flan-t5-xl,91.5,90.6,7.9,17.8,52.0,3B,https://huggingface.co/google/flan-t5-xl
+flan-t5-large,71.5,77.3,26.2,29.6,51.2,780M,https://huggingface.co/google/flan-t5-large
+flan-t5-base,84.6,51.2,17.0,50.2,50.8,250M,https://huggingface.co/google/flan-t5-base
+flan-t5-small,51.8,50.1,46.5,49.5,49.5,60M,https://huggingface.co/google/flan-t5-small
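For context, a hedged sketch of how the leaderboard presumably consumes this CSV; the actual loading code lives in `app.py`/`src/utils.py` and is not part of this commit's visible diff, and the `pandas` usage here is an assumption:

```python
import pandas as pd

# Column order must stay in sync with AutoEvalColumn in src/utils.py,
# including the new "Model Size" column added in this commit.
df = pd.read_csv("data/eval_board.csv")

# A leaderboard typically sorts best-average-first.
df = df.sort_values("Avg", ascending=False).reset_index(drop=True)
print(df[["Models", "Avg", "Model Size"]])
```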
src/__pycache__/css_html.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/css_html.cpython-38.pyc and b/src/__pycache__/css_html.cpython-38.pyc differ
 
src/__pycache__/demo.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/demo.cpython-38.pyc and b/src/__pycache__/demo.cpython-38.pyc differ
 
src/__pycache__/utils.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/utils.cpython-38.pyc and b/src/__pycache__/utils.cpython-38.pyc differ
 
src/css_html.py CHANGED
@@ -12,6 +12,10 @@ custom_css = """
     font-size: 16px !important;
 }
 
+#answer-text {
+    font-size: 28px !important;
+}
+
 #models-to-add-text {
     font-size: 18px !important;
 }
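The new `#answer-text` rule only takes effect once some component declares `elem_id="answer-text"`, and the `ground_truth_display` textbox in this commit's `app.py` does not. Assuming the rule is intended for that box, a minimal runnable sketch of the hookup:

```python
import gradio as gr

# The css string stands in for custom_css from src/css_html.py.
with gr.Blocks(css="#answer-text { font-size: 28px !important; }") as demo:
    ground_truth_display = gr.Textbox(
        "",
        lines=1,
        max_lines=1,
        label="😊Correct Answer😊",
        interactive=False,
        elem_id="answer-text",  # must match the CSS selector for the rule to apply
    )

demo.launch()
```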
src/demo.py CHANGED
@@ -6,12 +6,12 @@ from typing import Iterable
 import torch
 from huggingface_hub import HfApi
 from datasets import load_dataset
-from transformers import T5Tokenizer, T5ForConditionalGeneration
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 
 
-TOKEN = os.environ.get("HF_TOKEN", None)
+ground_truth = ""
 
+TOKEN = os.environ.get("HF_TOKEN", None)
 
 type2dataset = {
     "re2text-easy": load_dataset('3B-Group/ConvRe', "en-re2text", token=TOKEN, split="prompt1"),
@@ -24,10 +24,15 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=TOKEN)
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, token=TOKEN, device_map="auto").eval()
 
+
+# model_id = "google/flan-t5-base"
+# tokenizer = T5Tokenizer.from_pretrained(model_id)
+# model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
 # type2dataset = {}
 
 
-def generate(input_text, sys_prompt) -> str:
+def generate(input_text, sys_prompt, temperature, max_new_tokens) -> Iterable[str]:
     sys_prompt = f'''[INST] <<SYS>>
 {sys_prompt}
 <</SYS>>
@@ -35,24 +40,44 @@ def generate(input_text, sys_prompt) -> str:
 '''
     input_str = sys_prompt + input_text + " [/INST]"
 
-    input_ids = tokenizer(input_str, return_tensors="pt").input_ids.to('cuda')
-    outputs = model.generate(input_ids, max_length=512)
+    # Despite the name, this holds the full tokenizer output (input_ids and attention_mask).
+    input_ids = tokenizer(input_str, return_tensors="pt").to('cuda')
 
-    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
 
-    result = result.split(' [/INST]')
-    result = result[0] + '\n\n' + result[1]
+    generate_kwargs = dict(
+        input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=float(temperature),
+    )
+    # generate() blocks until done, so run it in a background thread and stream tokens out.
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
 
-    return result
+    # Pull the generated text from the streamer, and update the model output.
+    model_output = ""
+    for new_text in streamer:
+        model_output += new_text
+        yield model_output
+    return model_output
 
 
 def random_examples(dataset_key) -> str:
     # target_dataset = type2dataset[f"{task.lower()}-{type.lower()}"]
     target_dataset = type2dataset[dataset_key]
 
     idx = random.randint(0, len(target_dataset) - 1)
     item = target_dataset[idx]
+
+    # Remember the gold answer so the Answer button can reveal it later.
+    global ground_truth
+    ground_truth = item['answer']
+
     return item['query']
 
 
+def return_ground_truth() -> str:
+    correct_answer = ground_truth
+    return correct_answer

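Two notes on the streaming rewrite above. The new code calls `Thread`, so `from threading import Thread` must already exist above the first hunk (it sits outside the diff context; if it does not, it needs to be added). Also, `ground_truth` is a module-level global shared by all visitors, so with concurrent users the Answer🤔 button may reveal the answer to someone else's freshly drawn example; per-session state (`gr.State`) would avoid that. The thread-plus-streamer pattern itself, reduced to a self-contained sketch (the tiny model here is a stand-in so the snippet runs without the gated Llama-2 checkpoint):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Stand-in model; the Space uses meta-llama/Llama-2-7b-chat-hf (gated).
model_id = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).eval()

inputs = tokenizer("The triple (x, has part, y) means", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# model.generate() blocks until finished, so it runs in a worker thread
# while the main thread consumes decoded text pieces from the streamer.
thread = Thread(
    target=model.generate,
    kwargs=dict(inputs, streamer=streamer, max_new_tokens=20,
                do_sample=True, temperature=0.7),
)
thread.start()
for piece in streamer:
    print(piece, end="", flush=True)
thread.join()
```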
src/utils.py CHANGED
@@ -22,6 +22,7 @@ class AutoEvalColumn:  # Auto evals column
     re2text_hard = ColumnContent("Re2Text-Hard", "number", True)
     text2re_hard = ColumnContent("Text2Re-Hard", "number", True)
     avg = ColumnContent("Avg", "number", True)
+    model_size = ColumnContent("Model Size", "markdown", True)
 
     link = ColumnContent("Links", "str", False)
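`ColumnContent` itself is defined elsewhere in `src/utils.py` and is not shown in this diff. Judging from the positional calls, it is presumably a small dataclass along these lines; the field names below are guesses, and only the call shape `ColumnContent("Model Size", "markdown", True)` is confirmed by the diff:

```python
from dataclasses import dataclass


@dataclass
class ColumnContent:
    name: str        # header text shown in the leaderboard table (hypothetical field name)
    type: str        # gradio Dataframe datatype: "str", "number", "markdown", ...
    displayed: bool  # whether the column is visible by default
```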