Yeyito committed on
Commit 4e4454b • 1 Parent(s): ca8085c

gsm8k fix, queue, ref_model column

app.py CHANGED

@@ -5,6 +5,7 @@ import sys
import time
import pandas as pd
from threading import Thread
+ import numpy as np

# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
@@ -23,6 +24,8 @@ from src.utils import (
    make_clickable_names,
    styled_error,
    styled_message,
+     EVAL_COLS,
+     EVAL_TYPES
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
@@ -33,7 +36,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
# CONFIGURATION:
ref_model = "huggyllama/llama-7b"
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
- modelQueue = []
+ modelQueue = pd.read_csv('data/queue.csv').values.tolist()
+ print(modelQueue)

def restart_space(): # The dumbest update function to ever exist; I'm sobbing in tears, as I've tried to make gradio update the leaderboard literally any other way.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -84,7 +88,7 @@ def worker_thread():
    global modelQueue, server
    while True:
        for submission in modelQueue:
-             #evaluate(submission[0],submission[1].split(" ")[0])
+             #evaluate(submission[1],submission[0].split(" ")[0])
            #modelQueue.pop(modelQueue.index(submission))

            # Uncomment those lines in order to begin testing; I test these models outside of this space and later commit the results back.
@@ -98,7 +102,12 @@ def worker_thread():

def queue(model,model_type):
    global modelQueue
-     modelQueue.append([model,model_type])
+     modelQueue.append([model_type,model])
+
+     file_path = "data/queue.csv"
+     with open(file_path, "a") as f:
+         f.write(f"\n{model_type},{model}")
+         f.close()
    print(f"QUEUE:\n{modelQueue}")

@@ -269,6 +278,15 @@ with demo:
        "## 📤 Submit a model here:", elem_classes="markdown-text"
    )
    with gr.Column():
+         with gr.Column():
+             with gr.Accordion(
+                 f"⏳ Evaluation Queue ({len(modelQueue)})",
+                 open=False,
+             ):
+                 with gr.Row():
+                     finished_eval_table = gr.components.Dataframe(
+                         value=pd.DataFrame(modelQueue, columns=['Type','Model']),
+                     )
        with gr.Row():
            model_name = gr.Textbox(label="Model name")
            revision_name = gr.Textbox(
@@ -288,7 +306,7 @@ with demo:
            interactive=True,
        )
        model_type = gr.Dropdown(
-             choices=["🟢 base", "🔶 instruction-tuned"],
+             choices=["🟢 base", "🔶 finetuned"],
            label="Model type",
            multiselect=False,
            value=None,
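The net effect of the queue changes above: submissions are now persisted to data/queue.csv and reloaded at startup, so the queue survives Space restarts. A minimal sketch of that round trip, assuming data/queue.csv exists with a Type,Model header (load_queue and enqueue are illustrative names, not functions in app.py):

    import pandas as pd

    def load_queue(path="data/queue.csv"):
        # Rebuild the in-memory queue from disk on startup.
        return pd.read_csv(path).values.tolist()

    def enqueue(model, model_type, path="data/queue.csv"):
        # Record the submission on disk as well as in memory.
        with open(path, "a") as f:
            f.write(f"\n{model_type},{model}")
        return [model_type, model]

    modelQueue = load_queue()
    modelQueue.append(enqueue("org/some-model", "🔶 finetuned"))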
data/code_eval_board.csv CHANGED

@@ -1,8 +1,8 @@
- T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K
- 🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0
- 🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0
- 🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44
- 🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91
- 🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95
- 🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99
- 🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96
+ T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Reference Model
+ 🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0,huggyllama/llama-7b
+ 🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0,huggyllama/llama-7b
+ 🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44,huggyllama/llama-7b
+ 🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91,huggyllama/llama-7b
+ 🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95,huggyllama/llama-7b
+ 🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99,huggyllama/llama-7b
+ 🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96,huggyllama/llama-7b
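Every existing row gets the same reference model, which makes the new column easy to sanity-check. A quick hypothetical snippet (not part of the commit):

    import pandas as pd

    board = pd.read_csv("data/code_eval_board.csv")
    # Every row should report the configured reference model.
    assert (board["Reference Model"] == "huggyllama/llama-7b").all()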
data/queue.csv ADDED

@@ -0,0 +1,17 @@
+ Type,Model
+ 🔶 finetuned, AIDC-ai-business/Marcoroni-7B-v3
+ 🔶 finetuned, openchat/openchat_3.5
+ 🔶 finetuned, teknium/OpenHermes-2.5-Mistral-7B
+ 🔶 finetuned, WizardLM/WizardMath-7B-V1.1
+ 🔶 finetuned, Intel/neural-chat-7b-v3-3
+ 🔶 finetuned, mistralai/Mistral-7B-Instruct-v0.2
+ 🔶 finetuned, ehartford/dolphin-2.1-mistral-7b
+ 🔶 finetuned, HuggingFaceH4/zephyr-7b-beta
+ 🔶 finetuned, berkeley-nest/Starling-LM-7B-alpha
+ 🔶 finetuned, Open-Orca/Mistral-7B-OpenOrca
+ 🔶 finetuned, amazon/MistralLite
+ 🔶 finetuned, meta-math/MetaMath-Mistral-7B
+ 🔶 finetuned, microsoft/Orca-2-7b
+ 🔶 finetuned, 01-ai/Yi-6B-200K
+ 🔶 finetuned, Yhyu13/LMCocktail-10.7B-v1
+ 🔶 finetuned, openchat/openchat-3.5-1210
detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/eval.py CHANGED

@@ -147,8 +147,7 @@ def process_gsm8k(data):
    new_data = []
    for ex in data:
        new_ex = {}
-         #label = ;;
-         output = ex["answer"]
+         output = ex["answer"].split('####')[0].strip()
        new_ex["output"] = output
        new_ex["input"] = ex["question"] + " " + output
        new_data.append(new_ex)
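For context: a GSM8K answer string ends with a "#### <final answer>" marker after the worked solution, so the fix above keeps only the reasoning text before the marker. A toy illustration (the answer string is invented):

    answer = "Natalia sold 48 / 2 = 24 clips in May. 48 + 24 = 72 in total. #### 72"
    # Keep only the worked solution, dropping the final-answer marker.
    output = answer.split('####')[0].strip()
    print(output)  # Natalia sold 48 / 2 = 24 clips in May. 48 + 24 = 72 in total.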
detect-pretrain-code-contamination/src/run.py CHANGED

@@ -44,7 +44,10 @@ def load_model(name1):
    if name1 not in models:
        model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
        model1.eval()
-         tokenizer1 = AutoTokenizer.from_pretrained(name1)
+         if "mistral" in name1.lower(): # Load the default Mistral tokenizer, as some tokenizers don't work out of the box.
+             tokenizer1 = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+         else:
+             tokenizer1 = AutoTokenizer.from_pretrained(name1)

        tokenizer1.pad_token = tokenizer1.eos_token
        models[name1] = model1
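The same fallback, factored into a standalone helper for clarity. A sketch only (load_tokenizer is an illustrative name, not a function in run.py):

    from transformers import AutoTokenizer

    def load_tokenizer(name: str):
        # Mistral-derived finetunes sometimes ship tokenizers that fail to
        # load, so fall back to the base Mistral tokenizer for those models.
        if "mistral" in name.lower():
            return AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
        return AutoTokenizer.from_pretrained(name)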
src/__pycache__/css_html.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/css_html.cpython-311.pyc and b/src/__pycache__/css_html.cpython-311.pyc differ
 
src/__pycache__/envs.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/envs.cpython-311.pyc and b/src/__pycache__/envs.cpython-311.pyc differ
 
src/__pycache__/text_content.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/text_content.cpython-311.pyc and b/src/__pycache__/text_content.cpython-311.pyc differ
 
src/__pycache__/utils.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/utils.cpython-311.pyc and b/src/__pycache__/utils.cpython-311.pyc differ
 
src/utils.py CHANGED

@@ -31,6 +31,7 @@ class AutoEvalColumn: # Auto evals column
    Winogrande = ColumnContent("Winogrande", "number", True)
    GSM8K = ColumnContent("GSM8K", "number", True)
    dummy = ColumnContent("Models", "str", True)
+     ref_model = ColumnContent("Reference Model", "str", True)


def model_hyperlink(link, model_name):
@@ -77,3 +78,15 @@ def is_model_on_hub(model_name: str, revision: str) -> bool:
    except Exception as e:
        print(f"Could not get the model config from the hub: {e}")
        return False, "was not found on hub!"
+
+ @dataclass(frozen=True)
+ class EvalQueueColumn: # Queue column
+     model = ColumnContent("model", "markdown", True)
+     revision = ColumnContent("revision", "str", True)
+     private = ColumnContent("private", "bool", True)
+     precision = ColumnContent("precision", "str", True)
+     weight_type = ColumnContent("weight_type", "str", "Original")
+     status = ColumnContent("status", "str", True)
+
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
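A rough sketch of how this column plumbing works, assuming fields() here is the leaderboard-style helper that collects the ColumnContent class attributes (this repo's actual helper may differ):

    from dataclasses import dataclass

    @dataclass
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool

    def fields(raw_class):
        # Collect the non-dunder class attributes (the ColumnContent instances).
        return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

    @dataclass(frozen=True)
    class EvalQueueColumn:
        model = ColumnContent("model", "markdown", True)
        status = ColumnContent("status", "str", True)

    EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]   # ['model', 'status']
    EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]  # ['markdown', 'str']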