Yeyito committed on
Commit 4e4454b • 1 Parent(s): ca8085c

gsm8k fix, queue, ref_model column

app.py CHANGED

@@ -5,6 +5,7 @@ import sys
import time
import pandas as pd
from threading import Thread
+ import numpy as np

# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
@@ -23,6 +24,8 @@ from src.utils import (
    make_clickable_names,
    styled_error,
    styled_message,
+     EVAL_COLS,
+     EVAL_TYPES
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
@@ -33,7 +36,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
# CONFIGURATION:
ref_model = "huggyllama/llama-7b"
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
- modelQueue = []
+ modelQueue = pd.read_csv('data/queue.csv').values.tolist()
+ print(modelQueue)

def restart_space(): # The dumbest update function to ever exist; I'm sobbing in tears, as I've tried to make gradio update the leaderboard literally any other way.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -84,7 +88,7 @@ def worker_thread():
    global modelQueue, server
    while True:
        for submission in modelQueue:
-             #evaluate(submission[0],submission[1].split(" ")[0])
+             #evaluate(submission[1],submission[0].split(" ")[0])
            #modelQueue.pop(modelQueue.index(submission))

            # Uncomment those lines in order to begin testing; I test these models outside of this space and later commit the results back.
@@ -98,7 +102,12 @@ def worker_thread():

def queue(model,model_type):
    global modelQueue
-     modelQueue.append([model,model_type])
+     modelQueue.append([model_type,model])
+
+     file_path = "data/queue.csv"
+     with open(file_path, "a") as f:
+         f.write(f"\n{model_type},{model}")
+         f.close()
    print(f"QUEUE:\n{modelQueue}")

@@ -269,6 +278,15 @@ with demo:
        "## 📤 Submit a model here:", elem_classes="markdown-text"
    )
    with gr.Column():
+         with gr.Column():
+             with gr.Accordion(
+                 f"⏳ Evaluation Queue ({len(modelQueue)})",
+                 open=False,
+             ):
+                 with gr.Row():
+                     finished_eval_table = gr.components.Dataframe(
+                         value=pd.DataFrame(modelQueue, columns=['Type','Model']),
+                     )
        with gr.Row():
            model_name = gr.Textbox(label="Model name")
            revision_name = gr.Textbox(
@@ -288,7 +306,7 @@ with demo:
            interactive=True,
        )
        model_type = gr.Dropdown(
-             choices=["🟢 base", "🔶 instruction-tuned"],
+             choices=["🟢 base", "🔶 finetuned"],
            label="Model type",
            multiselect=False,
            value=None,
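The net effect of the queue changes above: submissions are now persisted to data/queue.csv and reloaded at startup, so the queue survives Space restarts. A minimal sketch of that round trip, assuming data/queue.csv exists with a Type,Model header (load_queue and enqueue are illustrative names, not functions in app.py):

    import pandas as pd

    def load_queue(path="data/queue.csv"):
        # Rebuild the in-memory queue from disk on startup.
        return pd.read_csv(path).values.tolist()

    def enqueue(model, model_type, path="data/queue.csv"):
        # Record the submission on disk as well as in memory.
        with open(path, "a") as f:
            f.write(f"\n{model_type},{model}")
        return [model_type, model]

    modelQueue = load_queue()
    modelQueue.append(enqueue("org/some-model", "🔶 finetuned"))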
data/code_eval_board.csv CHANGED

@@ -1,8 +1,8 @@
- T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K
- 🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0
- 🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0
- 🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44
- 🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91
- 🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95
- 🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99
- 🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96
+ T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Reference Model
+ 🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0,huggyllama/llama-7b
+ 🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0,huggyllama/llama-7b
+ 🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44,huggyllama/llama-7b
+ 🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91,huggyllama/llama-7b
+ 🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95,huggyllama/llama-7b
+ 🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99,huggyllama/llama-7b
+ 🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96,huggyllama/llama-7b
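Every existing row gets the same reference model, which makes the new column easy to sanity-check. A quick hypothetical snippet (not part of the commit):

    import pandas as pd

    board = pd.read_csv("data/code_eval_board.csv")
    # Every row should report the configured reference model.
    assert (board["Reference Model"] == "huggyllama/llama-7b").all()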
data/queue.csv ADDED

@@ -0,0 +1,17 @@
+ Type,Model
+ 🔶 finetuned, AIDC-ai-business/Marcoroni-7B-v3
+ 🔶 finetuned, openchat/openchat_3.5
+ 🔶 finetuned, teknium/OpenHermes-2.5-Mistral-7B
+ 🔶 finetuned, WizardLM/WizardMath-7B-V1.1
+ 🔶 finetuned, Intel/neural-chat-7b-v3-3
+ 🔶 finetuned, mistralai/Mistral-7B-Instruct-v0.2
+ 🔶 finetuned, ehartford/dolphin-2.1-mistral-7b
+ 🔶 finetuned, HuggingFaceH4/zephyr-7b-beta
+ 🔶 finetuned, berkeley-nest/Starling-LM-7B-alpha
+ 🔶 finetuned, Open-Orca/Mistral-7B-OpenOrca
+ 🔶 finetuned, amazon/MistralLite
+ 🔶 finetuned, meta-math/MetaMath-Mistral-7B
+ 🔶 finetuned, microsoft/Orca-2-7b
+ 🔶 finetuned, 01-ai/Yi-6B-200K
+ 🔶 finetuned, Yhyu13/LMCocktail-10.7B-v1
+ 🔶 finetuned, openchat/openchat-3.5-1210
detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc differ
 
detect-pretrain-code-contamination/src/eval.py CHANGED

@@ -147,8 +147,7 @@ def process_gsm8k(data):
    new_data = []
    for ex in data:
        new_ex = {}
-         #label = ;;
-         output = ex["answer"]
+         output = ex["answer"].split('####')[0].strip()
        new_ex["output"] = output
        new_ex["input"] = ex["question"] + " " + output
        new_data.append(new_ex)
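For context: a GSM8K answer string ends with a "#### <final answer>" marker after the worked solution, so the fix above keeps only the reasoning text before the marker. A toy illustration (the answer string is invented):

    answer = "Natalia sold 48 / 2 = 24 clips in May. 48 + 24 = 72 in total. #### 72"
    # Keep only the worked solution, dropping the final-answer marker.
    output = answer.split('####')[0].strip()
    print(output)  # Natalia sold 48 / 2 = 24 clips in May. 48 + 24 = 72 in total.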
detect-pretrain-code-contamination/src/run.py CHANGED

@@ -44,7 +44,10 @@ def load_model(name1):
    if name1 not in models:
        model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
        model1.eval()
-         tokenizer1 = AutoTokenizer.from_pretrained(name1)
+         if "mistral" in name1.lower(): # Load the default Mistral tokenizer, as some tokenizers don't work out of the box.
+             tokenizer1 = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+         else:
+             tokenizer1 = AutoTokenizer.from_pretrained(name1)

        tokenizer1.pad_token = tokenizer1.eos_token
        models[name1] = model1
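The same fallback, factored into a standalone helper for clarity. A sketch only (load_tokenizer is an illustrative name, not a function in run.py):

    from transformers import AutoTokenizer

    def load_tokenizer(name: str):
        # Mistral-derived finetunes sometimes ship tokenizers that fail to
        # load, so fall back to the base Mistral tokenizer for those models.
        if "mistral" in name.lower():
            return AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
        return AutoTokenizer.from_pretrained(name)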
src/__pycache__/css_html.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/css_html.cpython-311.pyc and b/src/__pycache__/css_html.cpython-311.pyc differ
 
src/__pycache__/envs.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/envs.cpython-311.pyc and b/src/__pycache__/envs.cpython-311.pyc differ
 
src/__pycache__/text_content.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/text_content.cpython-311.pyc and b/src/__pycache__/text_content.cpython-311.pyc differ
 
src/__pycache__/utils.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/utils.cpython-311.pyc and b/src/__pycache__/utils.cpython-311.pyc differ
 
src/utils.py CHANGED

@@ -31,6 +31,7 @@ class AutoEvalColumn: # Auto evals column
    Winogrande = ColumnContent("Winogrande", "number", True)
    GSM8K = ColumnContent("GSM8K", "number", True)
    dummy = ColumnContent("Models", "str", True)
+     ref_model = ColumnContent("Reference Model", "str", True)


def model_hyperlink(link, model_name):
@@ -77,3 +78,15 @@ def is_model_on_hub(model_name: str, revision: str) -> bool:
    except Exception as e:
        print(f"Could not get the model config from the hub: {e}")
        return False, "was not found on hub!"
+
+ @dataclass(frozen=True)
+ class EvalQueueColumn: # Queue column
+     model = ColumnContent("model", "markdown", True)
+     revision = ColumnContent("revision", "str", True)
+     private = ColumnContent("private", "bool", True)
+     precision = ColumnContent("precision", "str", True)
+     weight_type = ColumnContent("weight_type", "str", "Original")
+     status = ColumnContent("status", "str", True)
+
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
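A rough sketch of how this column plumbing works, assuming fields() here is the leaderboard-style helper that collects the ColumnContent class attributes (this repo's actual helper may differ):

    from dataclasses import dataclass

    @dataclass
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool

    def fields(raw_class):
        # Collect the non-dunder class attributes (the ColumnContent instances).
        return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

    @dataclass(frozen=True)
    class EvalQueueColumn:
        model = ColumnContent("model", "markdown", True)
        status = ColumnContent("status", "str", True)

    EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]   # ['model', 'status']
    EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]  # ['markdown', 'str']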