Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Community

dh-mc committed on Sep 12, 2024

Commit

468b88d

1 Parent(s): 6e932d8

completed eval/analysis

Browse files

Files changed (20) hide show

data/Llama3.1-70B-Chinese-Chat_metrics.csv +12 -12
data/Llama3.1-8B-Chinese-Chat_metrics.csv +12 -12
data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv +12 -12
data/Qwen2-72B-Instruct_metrics.csv +12 -8
data/Qwen2-72B-Instruct_results.csv +0 -0
data/Qwen2-7B-Instruct_metrics.csv +12 -12
data/internlm2_5-7b-chat-1m_metrics.csv +12 -12
data/openai_metrics.csv +15 -0
data/openai_results.csv +0 -0
llm_toolkit/eval_openai.py +7 -1
llm_toolkit/logical_reasoning_utils.py +167 -32
notebooks/00_Data Analysis.ipynb +0 -0
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb +0 -0
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb +0 -0
notebooks/02b_Qwen2-72B-Instruct_analysis.ipynb +0 -0
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb +0 -0
notebooks/03b_Llama3.1-70B-Chinese-Chat_analysis.ipynb +0 -0
notebooks/04_Few-shot_Prompting_OpenAI.ipynb +0 -0
notebooks/04b_OpenAI-Models_analysis.ipynb +0 -0

data/Llama3.1-70B-Chinese-Chat_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat_torch.bfloat16_4bit_lf,0.7636666666666667,0.7806653325131986,0.7636666666666667,0.7525813484548423,0.009666666666666667
-0.2,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-35_torch.bfloat16_4bit_lf,0.778,0.8148707737020212,0.778,0.7910805488003003,0.9996666666666667
-0.4,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-70_torch.bfloat16_4bit_lf,0.7306666666666667,0.8145782271710159,0.7306666666666667,0.7624724104697406,1.0
-0.6,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-105_torch.bfloat16_4bit_lf,0.7193333333333334,0.8213567226911125,0.7193333333333334,0.7560702640626931,1.0
-0.8,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-140_torch.bfloat16_4bit_lf,0.7563333333333333,0.826789897753756,0.7563333333333333,0.7815164366677209,1.0
-1.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-175_torch.bfloat16_4bit_lf,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
-1.2,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-210_torch.bfloat16_4bit_lf,0.7326666666666667,0.8265345821998035,0.7326666666666667,0.7644418492070342,1.0
-1.4,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-245_torch.bfloat16_4bit_lf,0.7556666666666667,0.8258994609525315,0.7556666666666667,0.7820405339757727,1.0
-1.6,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-280_torch.bfloat16_4bit_lf,0.757,0.8264461657684251,0.757,0.7834496144681513,1.0
-1.8,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-315_torch.bfloat16_4bit_lf,0.7546666666666667,0.8277723752096544,0.7546666666666667,0.7823584779069335,1.0
-2.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-350_torch.bfloat16_4bit_lf,0.7496666666666667,0.8282310230333227,0.7496666666666667,0.7791947625361637,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat_torch.bfloat16_4bit_lf,0.7636666666666667,0.7806653325131986,0.7636666666666667,0.7525813484548423,0.009666666666666667
+0.2,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-35_torch.bfloat16_4bit_lf,0.778,0.8148707737020212,0.778,0.7910805488003003,0.9996666666666667
+0.4,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-70_torch.bfloat16_4bit_lf,0.7306666666666667,0.8145782271710159,0.7306666666666667,0.7624724104697406,1.0
+0.6,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-105_torch.bfloat16_4bit_lf,0.7193333333333334,0.8213567226911125,0.7193333333333334,0.7560702640626931,1.0
+0.8,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-140_torch.bfloat16_4bit_lf,0.7563333333333333,0.826789897753756,0.7563333333333333,0.7815164366677209,1.0
+1.0,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-175_torch.bfloat16_4bit_lf,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
+1.2,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-210_torch.bfloat16_4bit_lf,0.7326666666666667,0.8265345821998035,0.7326666666666667,0.7644418492070342,1.0
+1.4,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-245_torch.bfloat16_4bit_lf,0.7556666666666667,0.8258994609525315,0.7556666666666667,0.7820405339757727,1.0
+1.6,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-280_torch.bfloat16_4bit_lf,0.757,0.8264461657684251,0.757,0.7834496144681513,1.0
+1.8,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-315_torch.bfloat16_4bit_lf,0.7546666666666667,0.8277723752096544,0.7546666666666667,0.7823584779069335,1.0
+2.0,Llama3.1-70B-Chinese-Chat,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-350_torch.bfloat16_4bit_lf,0.7496666666666667,0.8282310230333227,0.7496666666666667,0.7791947625361637,1.0

data/Llama3.1-8B-Chinese-Chat_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.707,0.7631091217915184,0.707,0.7243940517731183,0.3923333333333333
-0.2,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.709,0.7987219597893886,0.709,0.7427961200958145,1.0
-0.4,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.7163333333333334,0.8058657875960304,0.7163333333333334,0.7487811196109319,0.9993333333333333
-0.6,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6996666666666667,0.802722482275839,0.6996666666666667,0.7370938556711591,1.0
-0.8,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7716666666666666,0.8092193821623755,0.7716666666666666,0.7864287269398251,1.0
-1.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-175_torch.float16_lf,0.78,0.810582723471486,0.78,0.7924651054056209,1.0
-1.2,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7313333333333333,0.8157783263996798,0.7313333333333333,0.7628807622782868,1.0
-1.4,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-245_torch.float16_lf,0.751,0.8125856808988221,0.751,0.7745416635653988,1.0
-1.6,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-280_torch.float16_lf,0.739,0.8097375095673094,0.739,0.7662329023371559,1.0
-1.8,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-315_torch.float16_lf,0.7236666666666667,0.8145530585912838,0.7236666666666667,0.7580428816095297,1.0
-2.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-350_torch.float16_lf,0.7293333333333333,0.8151184301713545,0.7293333333333333,0.7616699266814145,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.707,0.7631091217915184,0.707,0.7243940517731183,0.3923333333333333
+0.2,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.709,0.7987219597893886,0.709,0.7427961200958145,1.0
+0.4,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.7163333333333334,0.8058657875960304,0.7163333333333334,0.7487811196109319,0.9993333333333333
+0.6,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6996666666666667,0.802722482275839,0.6996666666666667,0.7370938556711591,1.0
+0.8,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7716666666666666,0.8092193821623755,0.7716666666666666,0.7864287269398251,1.0
+1.0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-175_torch.float16_lf,0.78,0.810582723471486,0.78,0.7924651054056209,1.0
+1.2,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7313333333333333,0.8157783263996798,0.7313333333333333,0.7628807622782868,1.0
+1.4,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-245_torch.float16_lf,0.751,0.8125856808988221,0.751,0.7745416635653988,1.0
+1.6,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-280_torch.float16_lf,0.739,0.8097375095673094,0.739,0.7662329023371559,1.0
+1.8,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-315_torch.float16_lf,0.7236666666666667,0.8145530585912838,0.7236666666666667,0.7580428816095297,1.0
+2.0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-350_torch.float16_lf,0.7293333333333333,0.8151184301713545,0.7293333333333333,0.7616699266814145,1.0

data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat_torch.float16_lf,0.7113333333333334,0.70220546362905,0.7113333333333334,0.6894974942637364,0.004
-0.2,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-35_torch.float16_lf,0.702,0.7932731014186957,0.702,0.7342714734731689,1.0
-0.4,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-70_torch.float16_lf,0.742,0.78982949223512,0.742,0.7536681109811127,1.0
-0.6,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6596666666666666,0.7923396753604393,0.6596666666666666,0.7067542301676931,1.0
-0.8,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7146666666666667,0.7861341885687435,0.7146666666666667,0.7404677278137267,1.0
-1.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-175_torch.float16_lf,0.7326666666666667,0.7876867721932461,0.7326666666666667,0.7471869515031995,1.0
-1.2,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7016666666666667,0.7903119228393193,0.7016666666666667,0.7348708822385348,1.0
-1.4,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-245_torch.float16_lf,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
-1.6,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-280_torch.float16_lf,0.7156666666666667,0.7846106674095725,0.7156666666666667,0.7410042005708856,1.0
-1.8,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-315_torch.float16_lf,0.6916666666666667,0.7864256994491394,0.6916666666666667,0.7257499426487266,1.0
-2.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-350_torch.float16_lf,0.6976666666666667,0.7889443494370009,0.6976666666666667,0.7307996137659796,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat_torch.float16_lf,0.7113333333333334,0.70220546362905,0.7113333333333334,0.6894974942637364,0.004
+0.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-35_torch.float16_lf,0.702,0.7932731014186957,0.702,0.7342714734731689,1.0
+0.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-70_torch.float16_lf,0.742,0.78982949223512,0.742,0.7536681109811127,1.0
+0.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6596666666666666,0.7923396753604393,0.6596666666666666,0.7067542301676931,1.0
+0.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7146666666666667,0.7861341885687435,0.7146666666666667,0.7404677278137267,1.0
+1.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-175_torch.float16_lf,0.7326666666666667,0.7876867721932461,0.7326666666666667,0.7471869515031995,1.0
+1.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7016666666666667,0.7903119228393193,0.7016666666666667,0.7348708822385348,1.0
+1.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-245_torch.float16_lf,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
+1.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-280_torch.float16_lf,0.7156666666666667,0.7846106674095725,0.7156666666666667,0.7410042005708856,1.0
+1.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-315_torch.float16_lf,0.6916666666666667,0.7864256994491394,0.6916666666666667,0.7257499426487266,1.0
+2.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-350_torch.float16_lf,0.6976666666666667,0.7889443494370009,0.6976666666666667,0.7307996137659796,1.0

data/Qwen2-72B-Instruct_metrics.csv CHANGED Viewed

@@ -1,8 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Qwen/Qwen2-72B-Instruct_torch.bfloat16_4bit_lf,0.7473333333333333,0.804122252986722,0.7473333333333333,0.7607828719113865,0.9773333333333334
-0.2,Qwen/Qwen2-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7583333333333333,0.8199928526815756,0.7583333333333333,0.782751089787442,1.0
-0.4,Qwen/Qwen2-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.7366666666666667,0.8224865755517643,0.7366666666666667,0.7700627366337021,1.0
-0.6,Qwen/Qwen2-72B-Instruct/checkpoint-105_torch.bfloat16_4bit_lf,0.757,0.8253824826209251,0.757,0.784000409833628,1.0
-0.8,Qwen/Qwen2-72B-Instruct/checkpoint-140_torch.bfloat16_4bit_lf,0.7893333333333333,0.8229104753645825,0.7893333333333333,0.8033124955993173,1.0
-1.0,Qwen/Qwen2-72B-Instruct/checkpoint-175_torch.bfloat16_4bit_lf,0.7376666666666667,0.8243654864769323,0.7376666666666667,0.7699617360961548,1.0
-1.2,Qwen/Qwen2-72B-Instruct/checkpoint-210_torch.bfloat16_4bit_lf,0.763,0.8318882808702871,0.763,0.7901075708186186,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct_torch.bfloat16_4bit_lf,0.7516666666666667,0.7949378981748352,0.7516666666666667,0.7572499605227642,0.9773333333333334
+0.2,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7583333333333333,0.8199928526815756,0.7583333333333333,0.782751089787442,1.0
+0.4,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.7366666666666667,0.8224865755517643,0.7366666666666667,0.7700627366337021,1.0
+0.6,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-105_torch.bfloat16_4bit_lf,0.757,0.8253824826209251,0.757,0.784000409833628,1.0
+0.8,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-140_torch.bfloat16_4bit_lf,0.7893333333333333,0.8229104753645825,0.7893333333333333,0.8033124955993173,1.0
+1.0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-175_torch.bfloat16_4bit_lf,0.7376666666666667,0.8243654864769323,0.7376666666666667,0.7699617360961548,1.0
+1.2,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-210_torch.bfloat16_4bit_lf,0.763,0.8318882808702871,0.763,0.7901075708186186,1.0
+1.4,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-245_torch.bfloat16_4bit_lf,0.7656666666666667,0.8288272203240518,0.7656666666666667,0.790627109330698,1.0
+1.6,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-280_torch.bfloat16_4bit_lf,0.7693333333333333,0.8292798021666021,0.7693333333333333,0.7930169589012503,1.0
+1.8,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-315_torch.bfloat16_4bit_lf,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
+2.0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/checkpoint-350_torch.bfloat16_4bit_lf,0.7736666666666666,0.8330147983140184,0.7736666666666666,0.7973657072550873,1.0

data/Qwen2-72B-Instruct_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Qwen2-7B-Instruct_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.6203333333333333,0.7554720257311661,0.6203333333333333,0.6731632664545455,0.9973333333333333
-0.2,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058,0.9996666666666667
-0.4,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
-0.6,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848,1.0
-0.8,Qwen/Qwen2-7B-Instruct/checkpoint-140_torch.float16_lf,0.725,0.7952719247171957,0.725,0.7476238017654298,1.0
-1.0,Qwen/Qwen2-7B-Instruct/checkpoint-175_torch.float16_lf,0.6756666666666666,0.7810148934939715,0.6756666666666666,0.708653993277772,1.0
-1.2,Qwen/Qwen2-7B-Instruct/checkpoint-210_torch.float16_lf,0.7013333333333334,0.7969562600853992,0.7013333333333334,0.7362679665494508,1.0
-1.4,Qwen/Qwen2-7B-Instruct/checkpoint-245_torch.float16_lf,0.7326666666666667,0.7922538479314682,0.7326666666666667,0.755402136631717,0.9996666666666667
-1.6,Qwen/Qwen2-7B-Instruct/checkpoint-280_torch.float16_lf,0.6983333333333334,0.785127298428753,0.6983333333333334,0.7292251109166867,1.0
-1.8,Qwen/Qwen2-7B-Instruct/checkpoint-315_torch.float16_lf,0.6783333333333333,0.785390767631834,0.6783333333333333,0.7164131321837346,1.0
-2.0,Qwen/Qwen2-7B-Instruct/checkpoint-350_torch.float16_lf,0.689,0.7929715746898984,0.689,0.7259993126510194,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.6203333333333333,0.7554720257311661,0.6203333333333333,0.6731632664545455,0.9973333333333333
+0.2,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058,0.9996666666666667
+0.4,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
+0.6,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848,1.0
+0.8,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-140_torch.float16_lf,0.725,0.7952719247171957,0.725,0.7476238017654298,1.0
+1.0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-175_torch.float16_lf,0.6756666666666666,0.7810148934939715,0.6756666666666666,0.708653993277772,1.0
+1.2,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-210_torch.float16_lf,0.7013333333333334,0.7969562600853992,0.7013333333333334,0.7362679665494508,1.0
+1.4,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-245_torch.float16_lf,0.7326666666666667,0.7922538479314682,0.7326666666666667,0.755402136631717,0.9996666666666667
+1.6,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-280_torch.float16_lf,0.6983333333333334,0.785127298428753,0.6983333333333334,0.7292251109166867,1.0
+1.8,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-315_torch.float16_lf,0.6783333333333333,0.785390767631834,0.6783333333333333,0.7164131321837346,1.0
+2.0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-350_torch.float16_lf,0.689,0.7929715746898984,0.689,0.7259993126510194,1.0

data/internlm2_5-7b-chat-1m_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.5106666666666667,0.743213901498142,0.5106666666666667,0.5357333853323308,1.0
-0.2,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659,1.0
-0.4,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081,1.0
-0.6,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912,1.0
-0.8,internlm/internlm2_5-7b-chat-1m/checkpoint-140_torch.bfloat16_lf,0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
-1.0,internlm/internlm2_5-7b-chat-1m/checkpoint-175_torch.bfloat16_lf,0.7676666666666667,0.8108441731715863,0.7676666666666667,0.7843187816704813,1.0
-1.2,internlm/internlm2_5-7b-chat-1m/checkpoint-210_torch.bfloat16_lf,0.7736666666666666,0.8091671780923799,0.7736666666666666,0.7876874850235454,1.0
-1.4,internlm/internlm2_5-7b-chat-1m/checkpoint-245_torch.bfloat16_lf,0.7623333333333333,0.8062291602218205,0.7623333333333333,0.777669094563925,1.0
-1.6,internlm/internlm2_5-7b-chat-1m/checkpoint-280_torch.bfloat16_lf,0.7553333333333333,0.8086197936829652,0.7553333333333333,0.7755588811428297,1.0
-1.8,internlm/internlm2_5-7b-chat-1m/checkpoint-315_torch.bfloat16_lf,0.748,0.8171996792797457,0.748,0.773990849396903,1.0
-2.0,internlm/internlm2_5-7b-chat-1m/checkpoint-350_torch.bfloat16_lf,0.756,0.8126875394266148,0.756,0.7777812522863184,1.0

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.5106666666666667,0.743213901498142,0.5106666666666667,0.5357333853323308,1.0
+0.2,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659,1.0
+0.4,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081,1.0
+0.6,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912,1.0
+0.8,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-140_torch.bfloat16_lf,0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
+1.0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-175_torch.bfloat16_lf,0.7676666666666667,0.8108441731715863,0.7676666666666667,0.7843187816704813,1.0
+1.2,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-210_torch.bfloat16_lf,0.7736666666666666,0.8091671780923799,0.7736666666666666,0.7876874850235454,1.0
+1.4,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-245_torch.bfloat16_lf,0.7623333333333333,0.8062291602218205,0.7623333333333333,0.777669094563925,1.0
+1.6,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-280_torch.bfloat16_lf,0.7553333333333333,0.8086197936829652,0.7553333333333333,0.7755588811428297,1.0
+1.8,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-315_torch.bfloat16_lf,0.748,0.8171996792797457,0.748,0.773990849396903,1.0
+2.0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-350_torch.bfloat16_lf,0.756,0.8126875394266148,0.756,0.7777812522863184,1.0

data/openai_metrics.csv ADDED Viewed

	@@ -0,0 +1,15 @@

+shots,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0,gpt-4o-mini,0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,0.9916666666666667
+5,gpt-4o-mini,0.7176666666666667,0.7767294185987051,0.7176666666666667,0.7181068311028772,0.9996666666666667
+10,gpt-4o-mini,0.6793333333333333,0.7728086050218999,0.6793333333333333,0.6916749681933937,0.9983333333333333
+20,gpt-4o-mini,0.6623333333333333,0.7686706009175459,0.6623333333333333,0.6798015109939115,0.998
+30,gpt-4o-mini,0.6873333333333334,0.7684209723431035,0.6873333333333334,0.6913018667081989,0.999
+40,gpt-4o-mini,0.6923333333333334,0.7639874967862498,0.6923333333333334,0.6924934068935911,0.9986666666666667
+50,gpt-4o-mini,0.717,0.7692638634416518,0.717,0.7105227254860433,0.9993333333333333
+0,gpt-4o,0.782,0.8204048322982596,0.782,0.7953019682198627,0.066
+5,gpt-4o,0.7873333333333333,0.8230974205170392,0.7873333333333333,0.8000290527498529,0.998
+10,gpt-4o,0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
+20,gpt-4o,0.7816666666666666,0.8204541793856629,0.7816666666666666,0.7967017169880498,0.9993333333333333
+30,gpt-4o,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
+40,gpt-4o,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
+50,gpt-4o,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333

data/openai_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

llm_toolkit/eval_openai.py CHANGED Viewed

@@ -57,7 +57,13 @@ def evaluate_model_with_num_shots(
     for num_shots in range_num_shots:
         print(f"*** Evaluating with num_shots: {num_shots}")
-        predictions = eval_openai(eval_dataset, model=model_name, max_new_tokens=max_new_tokens)
         model_name_with_shorts = (
             result_column_name
             if result_column_name

     for num_shots in range_num_shots:
         print(f"*** Evaluating with num_shots: {num_shots}")
+        predictions = eval_openai(
+            eval_dataset,
+            model=model_name,
+            max_new_tokens=max_new_tokens,
+            num_shots=num_shots,
+            train_dataset=datasets["train"].to_pandas(),
+        )
         model_name_with_shorts = (
             result_column_name
             if result_column_name

llm_toolkit/logical_reasoning_utils.py CHANGED Viewed

@@ -3,14 +3,21 @@ import re
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 from matplotlib import rcParams
 from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-from tqdm import tqdm
 print(f"loading {__file__}")
@@ -61,17 +68,16 @@ P2 = """你是一个情景猜谜游戏的主持人。游戏规则如下：
 请严格按照这些规则回答参与者提出的问题。
-**谜面:** {}
-**谜底:** {}
-**参与者提出的问题:** {}
 """
 P2_en = """You are the host of a situational guessing game. The rules of the game are as follows:
 1. Participants will receive a riddle that describes a simple yet difficult to understand event.
-2. The host knows the answer, which is the solution to the riddle.
 3. Participants can ask any closed-ended questions to uncover the truth of the event.
 4. For each question, the host will respond with one of the following five options based on the actual situation: Yes, No, Unimportant, Correct answer, or Incorrect questioning. The criteria for each response are as follows:
    - If the riddle and answer can provide an answer to the question, respond with: Yes or No
@@ -82,14 +88,35 @@ P2_en = """You are the host of a situational guessing game. The rules of the gam
 Please strictly follow these rules when answering the participant's questions.
-**Riddle:** {}
-**Answer:** {}
-**Participant's question:** {}
 """
-system_prompt = "You are an expert in logical reasoning."
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
@@ -98,6 +125,40 @@ def get_prompt_template(using_p1=True, chinese_prompt=True):
         return P2 if chinese_prompt else P2_en
 def extract_answer(text, debug=False):
     if text and isinstance(text, str):
         # Remove the begin and end tokens
@@ -121,7 +182,9 @@ def extract_answer(text, debug=False):
             print("--------\nstep 3:", text)
         text = text.split(".")[0].strip()
         text = text.split("。")[0].strip()
         if debug:
             print("--------\nstep 4:", text)
@@ -186,7 +249,9 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
             df = dataset
         else:
             df = dataset.to_pandas()
-        df.drop(columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore")
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
@@ -329,7 +394,7 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
         df["backup"] = df[column_name]
         df[column_name] = df[column_name].apply(preprocess_func)
-    plt.figure(figsize=(12, 6))
     df[column_name].value_counts().plot(kind="bar")
     # add values on top of bars
     for i, v in enumerate(df[column_name].value_counts()):
@@ -342,6 +407,7 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
     rcParams["font.family"] = font_family
     if preprocess_func:
         df[column_name] = df["backup"]
         df.drop(columns=["backup"], inplace=True)
@@ -351,16 +417,22 @@ def calc_metrics_for_col(df, col):
     return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
-def get_metrics_df(df):
     perf_df = pd.DataFrame(
-        columns=["epoch", "model", "accuracy", "precision", "recall", "f1"]
     )
     for i, col in enumerate(df.columns[5:]):
         metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
-            "epoch": i / 5,
-            "model": col,
         }
         new_model_metrics.update(metrics)
         # Convert the dictionary to a DataFrame and concatenate it with the existing DataFrame
@@ -371,51 +443,61 @@ def get_metrics_df(df):
     return perf_df
-def plot_metrics(perf_df, model_name):
-    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
     # Ensure the lengths of perf_df["epoch"], perf_df["accuracy"], and perf_df["f1"] are the same
     min_length = min(
-        len(perf_df["epoch"]), len(perf_df["accuracy"]), len(perf_df["f1"])
     )
     perf_df = perf_df.iloc[:min_length]
     # Plot accuracy and f1 on the same chart with different markers
-    ax.plot(perf_df["epoch"], perf_df["accuracy"], marker="o", label="Accuracy")
     ax.plot(
-        perf_df["epoch"], perf_df["f1"], marker="s", label="F1 Score"
     )  # Square marker for F1 Score
     # Add values on top of points
     for i in range(min_length):
         ax.annotate(
             f"{perf_df['accuracy'].iloc[i]*100:.2f}%",
-            (perf_df["epoch"].iloc[i], perf_df["accuracy"].iloc[i]),
             ha="center",
             va="bottom",  # Move accuracy numbers below the points
             xytext=(0, -15),
             textcoords="offset points",
             fontsize=10,
         )
         ax.annotate(
             f"{perf_df['f1'].iloc[i]*100:.2f}%",
-            (perf_df["epoch"].iloc[i], perf_df["f1"].iloc[i]),
             ha="center",
             va="top",  # Move F1 score numbers above the points
             xytext=(0, 15),  # Offset by 15 points vertically
             textcoords="offset points",
             fontsize=10,
         )
     # Set y-axis limit
-    # ax.set_ylim(0.49, 0.825)
     # Add title and labels
-    ax.set_xlabel("Epoch (0: base model, 0.2 - 2: fine-tuned models)")
     ax.set_ylabel("Accuracy and F1 Score")
-    # Set x-axis grid spacing to 0.2
-    ax.xaxis.set_major_locator(MultipleLocator(0.2))
     ax.set_title(f"Performance Analysis Across Checkpoints for the {model_name} Model")
     # Rotate x labels
@@ -460,13 +542,66 @@ def reasoning_with_openai(
     return response.content
-def eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300):
-    user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
     total = len(eval_dataset)
     predictions = []
     for i in tqdm(range(total)):
-        output = reasoning_with_openai(eval_dataset.iloc[i], user_prompt,model=model, max_tokens=max_new_tokens)
         predictions.append(output)
     return predictions

 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
+from tqdm import tqdm
 import seaborn as sns
 import matplotlib.pyplot as plt
 from matplotlib import rcParams
 from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
+from sklearn.metrics import (
+    accuracy_score,
+    precision_score,
+    recall_score,
+    f1_score,
+    confusion_matrix,
+)
 print(f"loading {__file__}")
 请严格按照这些规则回答参与者提出的问题。
+谜面: {}
+谜底: {}
+参与者提出的问题: {}
+回答:
 """
 P2_en = """You are the host of a situational guessing game. The rules of the game are as follows:
 1. Participants will receive a riddle that describes a simple yet difficult to understand event.
+2. The host knows the truth, which is the solution to the riddle.
 3. Participants can ask any closed-ended questions to uncover the truth of the event.
 4. For each question, the host will respond with one of the following five options based on the actual situation: Yes, No, Unimportant, Correct answer, or Incorrect questioning. The criteria for each response are as follows:
    - If the riddle and answer can provide an answer to the question, respond with: Yes or No
 Please strictly follow these rules when answering the participant's questions.
+Riddle: {}
+Truth: {}
+Participant's question: {}
+"""
+system_prompt = "You are an expert in logical reasoning."
+P2_few_shot = """你是一个情景猜谜游戏的主持人。游戏规则如下：
+1. 参与者会得到一个谜面，谜面会描述一个简单又难以理解的事件。
+2. 主持人知道谜底，谜底是谜面的答案。
+3. 参与者可以询问任何封闭式问题来找寻事件的真相。
+4. 对于每个问题，主持人将根据实际情况回答以下五个选项之一：是、不是、不重要、回答正确、问法错误。各回答的判断标准如下：
+   - 若谜面和谜底能找到问题的答案，回答：是或者不是
+   - 若谜面和谜底不能直接或者间接推断出问题的答案，回答：不重要
+   - 若参与者提问不是一个封闭式问题或者问题难以理解，回答：问法错误
+   - 若参与者提问基本还原了谜底真相，回答：回答正确
+5. 回答中不能添加任何其它信息，也不能省略选项中的任何一个字。例如，不可以把“不是”省略成“不”。
+请严格按照这些规则回答参与者提出的问题。
+示例输入和输出:
+{examples}
+谜面: {}
+谜底: {}
+参与者提出的问题: {}
+回答:
 """
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
         return P2 if chinese_prompt else P2_en
def get_few_shot_prompt_template(num_shots, train_dataset, debug=False):
    """Build the few-shot prompt by embedding labelled training examples.

    Examples are chosen round-robin over the distinct labels while scanning
    the training rows forward, so every pick comes from a later row than the
    previous one. The collected text replaces the ``{examples}`` marker in
    ``P2_few_shot``; the remaining ``{}`` placeholders are left in place.

    Args:
        num_shots: Number of examples to embed. ``0`` returns the zero-shot
            template from ``get_prompt_template`` instead.
        train_dataset: DataFrame-like object (column access and ``.iloc`` are
            used) providing "puzzle", "truth", "text" and "label" columns.
        debug: When true, print the shot count, the labels and the prompt.

    Returns:
        The prompt template string.

    Raises:
        ValueError: If the training data has no labels, or runs out of rows
            before ``num_shots`` examples have been collected.
    """
    if num_shots == 0:
        return get_prompt_template(using_p1=False, chinese_prompt=True)

    labels = train_dataset["label"].unique()
    if debug:
        print("num_shots:", num_shots)
        print("labels:", labels)

    # Without any label the round-robin below would never make progress.
    if num_shots > 0 and len(labels) == 0:
        raise ValueError("train_dataset has no labels to draw examples from")

    # Scan labels positionally so the scan agrees with the `.iloc` row fetch
    # even when the frame carries a non-default index (filtered, shuffled...).
    label_values = train_dataset["label"].to_numpy()
    total_rows = len(label_values)

    examples = []
    index = 0
    remaining = num_shots
    while remaining > 0:
        for label in labels:
            while index < total_rows and label_values[index] != label:
                index += 1
            if index >= total_rows:
                raise ValueError(
                    f"train_dataset exhausted after {len(examples)} of "
                    f"{num_shots} requested few-shot examples"
                )
            row = train_dataset.iloc[index]
            examples.append(f"""谜面: {row["puzzle"]}
谜底: {row["truth"]}
参与者提出的问题: {row["text"]}
回答: {row["label"]}
""")
            # Step past the chosen row so it can never be selected twice.
            index += 1
            remaining -= 1
            if remaining == 0:
                break

    prompt = P2_few_shot.replace("{examples}", "".join(examples))
    if debug:
        print("P2_few_shot:", prompt)
    return prompt
 def extract_answer(text, debug=False):
     if text and isinstance(text, str):
         # Remove the begin and end tokens
             print("--------\nstep 3:", text)
         text = text.split(".")[0].strip()
+        text = text.split("\n")[0].strip()
         text = text.split("。")[0].strip()
+        text = text.replace("回答: ", "").strip()
         if debug:
             print("--------\nstep 4:", text)
             df = dataset
         else:
             df = dataset.to_pandas()
+        df.drop(
+            columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore"
+        )
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
         df["backup"] = df[column_name]
         df[column_name] = df[column_name].apply(preprocess_func)
+    plt.figure(figsize=(8, 4))
     df[column_name].value_counts().plot(kind="bar")
     # add values on top of bars
     for i, v in enumerate(df[column_name].value_counts()):
     rcParams["font.family"] = font_family
     if preprocess_func:
+        plot_confusion_matrix(df["label"], df[column_name])
         df[column_name] = df["backup"]
         df.drop(columns=["backup"], inplace=True)
     return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
+def get_metrics_df(df, variant="epoch"):
     perf_df = pd.DataFrame(
+        columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
     )
     for i, col in enumerate(df.columns[5:]):
         metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
+            variant: i / 5 if variant == "epoch" else i + 1,
+            "model": col if "/" not in col else col.split("/")[1].split("_torch")[0],
+            "run": col,
         }
+        if variant == "shots":
+            parts = col.split("/shots-")
+            new_model_metrics["shots"] = int(parts[1])
+            new_model_metrics["model"] = parts[0]
         new_model_metrics.update(metrics)
         # Convert the dictionary to a DataFrame and concatenate it with the existing DataFrame
     return perf_df
+def plot_metrics(perf_df, model_name, variant="epoch", offset=0.01):
+    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
+    perf_df = perf_df[perf_df["model"] == model_name]
     # Ensure the lengths of perf_df["epoch"], perf_df["accuracy"], and perf_df["f1"] are the same
     min_length = min(
+        len(perf_df[variant]), len(perf_df["accuracy"]), len(perf_df["f1"])
     )
     perf_df = perf_df.iloc[:min_length]
     # Plot accuracy and f1 on the same chart with different markers
     ax.plot(
+        perf_df[variant], perf_df["accuracy"], marker="o", label="Accuracy", color="r"
+    )
+    ax.plot(
+        perf_df[variant], perf_df["f1"], marker="s", label="F1 Score", color="b"
     )  # Square marker for F1 Score
     # Add values on top of points
     for i in range(min_length):
+        print(f"{perf_df[variant].iloc[i]}: {perf_df['run'].iloc[i]}")
         ax.annotate(
             f"{perf_df['accuracy'].iloc[i]*100:.2f}%",
+            (perf_df[variant].iloc[i], perf_df["accuracy"].iloc[i]),
             ha="center",
             va="bottom",  # Move accuracy numbers below the points
             xytext=(0, -15),
             textcoords="offset points",
             fontsize=10,
+            color="r",
         )
         ax.annotate(
             f"{perf_df['f1'].iloc[i]*100:.2f}%",
+            (perf_df[variant].iloc[i], perf_df["f1"].iloc[i]),
             ha="center",
             va="top",  # Move F1 score numbers above the points
             xytext=(0, 15),  # Offset by 15 points vertically
             textcoords="offset points",
             fontsize=10,
+            color="b",
         )
     # Set y-axis limit
+    ylimits = ax.get_ylim()
+    ax.set_ylim(ylimits[0] - offset, ylimits[1] + offset)
     # Add title and labels
+    ax.set_xlabel(
+        "Epoch (0: base model, 0.2 - 2: fine-tuned models)"
+        if variant == "epoch"
+        else "Number of Shots"
+    )
     ax.set_ylabel("Accuracy and F1 Score")
+    ax.xaxis.set_major_locator(MultipleLocator(0.2 if variant == "epoch" else 5))
     ax.set_title(f"Performance Analysis Across Checkpoints for the {model_name} Model")
     # Rotate x labels
     return response.content
def eval_openai(
    eval_dataset,
    model="gpt-4o-mini",
    max_new_tokens=300,
    num_shots=0,
    train_dataset=None,
):
    """Collect one model response per row of ``eval_dataset``.

    The prompt is the zero-shot template when ``num_shots`` is 0, otherwise
    the few-shot template built from ``train_dataset``. The chosen prompt is
    printed once, then each row is passed to ``reasoning_with_openai``.

    Args:
        eval_dataset: Object supporting ``len()`` and ``.iloc[i]`` row access.
        model: Model name forwarded to ``reasoning_with_openai``.
        max_new_tokens: Forwarded as ``max_tokens``.
        num_shots: Number of few-shot examples; 0 selects the plain template.
        train_dataset: Source of few-shot examples; only used when
            ``num_shots`` is non-zero.

    Returns:
        List of outputs from ``reasoning_with_openai``, in row order.
    """
    if num_shots == 0:
        user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
    else:
        user_prompt = get_few_shot_prompt_template(num_shots, train_dataset)
    print("user_prompt:", user_prompt)

    row_count = len(eval_dataset)
    return [
        reasoning_with_openai(
            eval_dataset.iloc[pos],
            user_prompt,
            model=model,
            max_tokens=max_new_tokens,
        )
        for pos in tqdm(range(row_count))
    ]
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Show a row-normalised confusion-matrix heatmap.

    Raw predictions are first passed through ``extract_answer``. The matrix
    is restricted to the labels that occur in ``y_true``, and each row is
    divided by the number of true samples of that label. A row therefore
    sums to less than 1 when some predictions for that label fall outside
    the known label set.

    Args:
        y_true: Ground-truth labels.
        y_pred: Raw model outputs, one per entry of ``y_true``.
        title: Title placed above the heatmap.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    saved_font_family = rcParams["font.family"]
    # Switch to the STHeiti font so Chinese class labels render.
    rcParams["font.family"] = "STHeiti"
    # Keep the minus sign renderable while the Chinese font is active.
    rcParams["axes.unicode_minus"] = False
    try:
        labels, support = np.unique(y_true, return_counts=True)
        y_pred = [extract_answer(text) for text in y_pred]

        # Pin the matrix to the true labels: otherwise any prediction outside
        # that set adds rows/columns, so the tick labels no longer line up
        # and the extra all-zero rows divide to NaN.
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        cm = cm.astype("float") / support[:, np.newaxis]

        _, ax = plt.subplots(figsize=(8, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt=".4f",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            ax=ax,
        )
        ax.set_title(title)
        ax.set_xlabel("Predicted labels")
        ax.set_ylabel("True labels")
        plt.show()
    finally:
        # Restore the caller's font even when plotting fails.
        rcParams["font.family"] = saved_font_family
def majority_vote(r1, r2, r3):
    """Return the winner of a three-way vote.

    If the first and third votes agree, that value wins. In every other case
    the second vote is returned: it either belongs to a two-vote majority or
    serves as the fallback when all three votes differ.
    """
    return r1 if r1 == r3 else r2

notebooks/00_Data Analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02b_Qwen2-72B-Instruct_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/03b_Llama3.1-70B-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/04_Few-shot_Prompting_OpenAI.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/04b_OpenAI-Models_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff