Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

dh-mc committed on Sep 20, 2024

Commit

a8683cf

1 Parent(s): 0baa6cc

counted few-shot prompts for all models

Browse files

Files changed (5) hide show

data/all_model_token_counts.csv +85 -0
data/openai_metrics.csv +4 -1
llm_toolkit/logical_reasoning_utils.py +11 -4
notebooks/04b_OpenAI-Models_analysis.ipynb +0 -0
notebooks/06b_Open-Source-Models_analysis.ipynb +0 -0

data/all_model_token_counts.csv ADDED Viewed

	@@ -0,0 +1,85 @@

+model_name,num_shots,max,min,mean,std
+Mistral-7B-v0.3-Chinese-Chat,0,928,694,799.354,15.567384660085061
+internlm2_5-7b-chat,0,511,426,461.91766666666666,7.767732430462529
+internlm2_5-7b-chat-1m,0,511,426,461.91766666666666,7.767732430462529
+Qwen2-7B-Instruct,0,517,426,465.33866666666665,8.617118029244592
+Llama3.1-8B-Chinese-Chat,0,652,512,571.091,9.115687078710652
+internlm2_5-20b-chat,0,511,426,461.91766666666666,7.767732430462529
+Llama3.1-70B-Chinese-Chat,0,652,512,571.091,9.115687078710652
+Qwen2-72B-Instruct,0,517,426,465.33866666666665,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,5,2573,2339,2444.354,15.567384660085061
+internlm2_5-7b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
+internlm2_5-7b-chat-1m,5,1351,1266,1301.9176666666667,7.767732430462529
+Qwen2-7B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
+Llama3.1-8B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
+internlm2_5-20b-chat,5,1351,1266,1301.9176666666667,7.767732430462529
+Llama3.1-70B-Chinese-Chat,5,1818,1678,1737.091,9.115687078710652
+Qwen2-72B-Instruct,5,1381,1290,1329.3386666666668,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,10,4119,3885,3990.354,15.567384660085061
+internlm2_5-7b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
+internlm2_5-7b-chat-1m,10,2245,2160,2195.9176666666667,7.767732430462529
+Qwen2-7B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
+Llama3.1-8B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
+internlm2_5-20b-chat,10,2245,2160,2195.9176666666667,7.767732430462529
+Llama3.1-70B-Chinese-Chat,10,2914,2774,2833.091,9.115687078710652
+Qwen2-72B-Instruct,10,2289,2198,2237.3386666666665,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,20,7392,7158,7263.354,15.567384660085061
+internlm2_5-7b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
+internlm2_5-7b-chat-1m,20,4065,3980,4015.9176666666667,7.767732430462529
+Qwen2-7B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
+Llama3.1-8B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
+internlm2_5-20b-chat,20,4065,3980,4015.9176666666667,7.767732430462529
+Llama3.1-70B-Chinese-Chat,20,5283,5143,5202.091,9.115687078710652
+Qwen2-72B-Instruct,20,4176,4085,4124.3386666666665,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,30,10804,10570,10675.354,15.567384660085061
+internlm2_5-7b-chat,30,5903,5818,5853.917666666666,7.767732430462529
+internlm2_5-7b-chat-1m,30,5903,5818,5853.917666666666,7.767732430462529
+Qwen2-7B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
+Llama3.1-8B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
+internlm2_5-20b-chat,30,5903,5818,5853.917666666666,7.767732430462529
+Llama3.1-70B-Chinese-Chat,30,7768,7628,7687.091,9.115687078710652
+Qwen2-72B-Instruct,30,6107,6016,6055.3386666666665,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,40,14152,13918,14023.354,15.567384660085061
+internlm2_5-7b-chat,40,7709,7624,7659.917666666666,7.767732430462529
+internlm2_5-7b-chat-1m,40,7709,7624,7659.917666666666,7.767732430462529
+Qwen2-7B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
+Llama3.1-8B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
+internlm2_5-20b-chat,40,7709,7624,7659.917666666666,7.767732430462529
+Llama3.1-70B-Chinese-Chat,40,10217,10077,10136.091,9.115687078710652
+Qwen2-72B-Instruct,40,8010,7919,7958.3386666666665,8.617118029244592
+Mistral-7B-v0.3-Chinese-Chat,50,17588,17354,17459.354,15.567384660085061
+internlm2_5-7b-chat,50,9561,9476,9511.917666666666,7.767732430462529
+internlm2_5-7b-chat-1m,50,9561,9476,9511.917666666666,7.767732430462529
+Qwen2-7B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
+Llama3.1-8B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
+internlm2_5-20b-chat,50,9561,9476,9511.917666666666,7.767732430462529
+Llama3.1-70B-Chinese-Chat,50,12719,12579,12638.091,9.115687078710652
+Qwen2-72B-Instruct,50,9961,9870,9909.338666666667,8.617118029244592
+gpt-4o,0,606,464,524.8063333333333,10.057594723695004
+gpt-4o-mini,0,606,464,524.8063333333333,10.057594723695004
+o1-preview,0,925,682,797.5953333333333,16.41724967580933
+o1-mini,0,925,682,797.5953333333333,16.41724967580933
+gpt-4o,5,1711,1569,1629.8063333333334,10.057594723695004
+gpt-4o-mini,5,1711,1569,1629.8063333333334,10.057594723695004
+o1-preview,5,2649,2406,2521.595333333333,16.41724967580933
+o1-mini,5,2649,2406,2521.595333333333,16.41724967580933
+gpt-4o,10,2781,2639,2699.8063333333334,10.057594723695004
+gpt-4o-mini,10,2781,2639,2699.8063333333334,10.057594723695004
+o1-preview,10,4282,4039,4154.595333333334,16.41724967580933
+o1-mini,10,4282,4039,4154.595333333334,16.41724967580933
+gpt-4o,20,5110,4968,5028.806333333333,10.057594723695004
+gpt-4o-mini,20,5110,4968,5028.806333333333,10.057594723695004
+o1-preview,20,7781,7538,7653.595333333334,16.41724967580933
+o1-mini,20,7781,7538,7653.595333333334,16.41724967580933
+gpt-4o,30,7572,7430,7490.806333333333,10.057594723695004
+gpt-4o-mini,30,7572,7430,7490.806333333333,10.057594723695004
+o1-preview,30,11459,11216,11331.595333333333,16.41724967580933
+o1-mini,30,11459,11216,11331.595333333333,16.41724967580933
+gpt-4o,40,9998,9856,9916.806333333334,10.057594723695004
+gpt-4o-mini,40,9998,9856,9916.806333333334,10.057594723695004
+o1-preview,40,15061,14818,14933.595333333333,16.41724967580933
+o1-mini,40,15061,14818,14933.595333333333,16.41724967580933
+gpt-4o,50,12476,12334,12394.806333333334,10.057594723695004
+gpt-4o-mini,50,12476,12334,12394.806333333334,10.057594723695004
+o1-preview,50,18760,18517,18632.595333333335,16.41724967580933
+o1-mini,50,18760,18517,18632.595333333335,16.41724967580933

data/openai_metrics.csv CHANGED Viewed

@@ -14,7 +14,10 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
 50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
 0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
 10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
 0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
 10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
-10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996

 40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
 50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
 0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
+5,o1-mini,o1-mini/shots-05,0.724,0.7905045610386181,0.724,0.7482963122126776,0.9966666666666667
 10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
+20,o1-mini,o1-mini/shots-20,0.7343333333333333,0.786101455887261,0.7343333333333333,0.7535300565051624,0.9946666666666667
 0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
+5,o1-preview,o1-preview/shots-05,0.7313333333333333,0.7878283093765627,0.7313333333333333,0.7535489719321234,0.979
 10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
+20,o1-preview,o1-preview/shots-20,0.7443333333333333,0.7911442834260676,0.7443333333333333,0.7625144090816939,0.9853333333333333

llm_toolkit/logical_reasoning_utils.py CHANGED Viewed

@@ -269,6 +269,7 @@ def load_logical_reasoning_dataset(
     chinese_prompt=True,
     test_data=None,
     num_shots=0,
 ):
     postfix = "" if chinese_prompt else "_en"
     train_data_file = data_path + f"/train{postfix}.csv"
@@ -319,10 +320,16 @@ def load_logical_reasoning_dataset(
                 texts.append(prompt + output + tokenizer.eos_token if output else "")
             return {"train_text": texts, "prompt": prompts}
-        datasets = datasets.map(
-            formatting_prompts_func,
-            batched=True,
-        )
     print(datasets)
     return datasets

     chinese_prompt=True,
     test_data=None,
     num_shots=0,
+    format_test_only=False,
 ):
     postfix = "" if chinese_prompt else "_en"
     train_data_file = data_path + f"/train{postfix}.csv"
                 texts.append(prompt + output + tokenizer.eos_token if output else "")
             return {"train_text": texts, "prompt": prompts}
+        if format_test_only:
+            datasets["test"] = datasets["test"].map(
+                formatting_prompts_func,
+                batched=True,
+            )
+        else:
+            datasets = datasets.map(
+                formatting_prompts_func,
+                batched=True,
+            )
     print(datasets)
     return datasets

notebooks/04b_OpenAI-Models_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/06b_Open-Source-Models_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff