open source LLM results almost done

Files changed:
- data/Llama3.1-70B-Chinese-Chat_metrics.csv +12 -0
- data/Llama3.1-8B-Chinese-Chat_metrics.csv +12 -7
- data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv +12 -0
- data/Qwen2-72B-Instruct_metrics.csv +8 -8
- data/Qwen2-7B-Instruct_metrics.csv +12 -12
- data/internlm2_5-7b-chat-1m_metrics.csv +12 -12
- llm_toolkit/logical_reasoning_utils.py +42 -26
- notebooks/00_Data Analysis.ipynb +0 -0
- notebooks/01_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
- notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
- notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb +0 -0
- notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb +0 -0
- notebooks/02b_Qwen2-72B-Instruct_analysis.ipynb +0 -0
- notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb +0 -0
- notebooks/03b_Llama3.1-70B-Chinese-Chat_analysis.ipynb +0 -0
data/Llama3.1-70B-Chinese-Chat_metrics.csv
ADDED
@@ -0,0 +1,12 @@
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat_torch.bfloat16_4bit_lf,0.7636666666666667,0.7806653325131986,0.7636666666666667,0.7525813484548423,0.009666666666666667
+0.2,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-35_torch.bfloat16_4bit_lf,0.778,0.8148707737020212,0.778,0.7910805488003003,0.9996666666666667
+0.4,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-70_torch.bfloat16_4bit_lf,0.7306666666666667,0.8145782271710159,0.7306666666666667,0.7624724104697406,1.0
+0.6,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-105_torch.bfloat16_4bit_lf,0.7193333333333334,0.8213567226911125,0.7193333333333334,0.7560702640626931,1.0
+0.8,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-140_torch.bfloat16_4bit_lf,0.7563333333333333,0.826789897753756,0.7563333333333333,0.7815164366677209,1.0
+1.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-175_torch.bfloat16_4bit_lf,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
+1.2,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-210_torch.bfloat16_4bit_lf,0.7326666666666667,0.8265345821998035,0.7326666666666667,0.7644418492070342,1.0
+1.4,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-245_torch.bfloat16_4bit_lf,0.7556666666666667,0.8258994609525315,0.7556666666666667,0.7820405339757727,1.0
+1.6,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-280_torch.bfloat16_4bit_lf,0.757,0.8264461657684251,0.757,0.7834496144681513,1.0
+1.8,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-315_torch.bfloat16_4bit_lf,0.7546666666666667,0.8277723752096544,0.7546666666666667,0.7823584779069335,1.0
+2.0,shenzhi-wang/Llama3.1-70B-Chinese-Chat/checkpoint-350_torch.bfloat16_4bit_lf,0.7496666666666667,0.8282310230333227,0.7496666666666667,0.7791947625361637,1.0
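All of the metrics files in this commit share the schema above, so they can be loaded and compared directly. Below is a minimal sketch, assuming pandas and matplotlib are installed and the repository layout matches the file list at the top:

import matplotlib.pyplot as plt
import pandas as pd

# One row per evaluated checkpoint: epoch 0.0 is the untuned base model,
# then one checkpoint every 0.2 epochs of fine-tuning.
df = pd.read_csv("data/Llama3.1-70B-Chinese-Chat_metrics.csv")

df.plot(x="epoch", y=["accuracy", "f1", "ratio_valid_classifications"], marker="o")
plt.title("Llama3.1-70B-Chinese-Chat metrics by fine-tuning epoch")
plt.ylabel("score")
plt.show()

(The plotted F1 peak, roughly 0.808 at epoch 1.0, matches the checkpoint-175 row above.)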
data/Llama3.1-8B-Chinese-Chat_metrics.csv
CHANGED
@@ -1,7 +1,12 @@
-epoch,model,accuracy,precision,recall,f1
-0.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.
-0.2,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.
-0.4,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.
-0.6,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.
-0.8,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-140_torch.float16_lf,0.
-1.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-175_torch.float16_lf,0.
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.707,0.7631091217915184,0.707,0.7243940517731183,0.3923333333333333
+0.2,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.709,0.7987219597893886,0.709,0.7427961200958145,1.0
+0.4,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.7163333333333334,0.8058657875960304,0.7163333333333334,0.7487811196109319,0.9993333333333333
+0.6,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6996666666666667,0.802722482275839,0.6996666666666667,0.7370938556711591,1.0
+0.8,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7716666666666666,0.8092193821623755,0.7716666666666666,0.7864287269398251,1.0
+1.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-175_torch.float16_lf,0.78,0.810582723471486,0.78,0.7924651054056209,1.0
+1.2,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7313333333333333,0.8157783263996798,0.7313333333333333,0.7628807622782868,1.0
+1.4,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-245_torch.float16_lf,0.751,0.8125856808988221,0.751,0.7745416635653988,1.0
+1.6,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-280_torch.float16_lf,0.739,0.8097375095673094,0.739,0.7662329023371559,1.0
+1.8,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-315_torch.float16_lf,0.7236666666666667,0.8145530585912838,0.7236666666666667,0.7580428816095297,1.0
+2.0,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-350_torch.float16_lf,0.7293333333333333,0.8151184301713545,0.7293333333333333,0.7616699266814145,1.0
data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv
ADDED
@@ -0,0 +1,12 @@
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat_torch.float16_lf,0.7113333333333334,0.70220546362905,0.7113333333333334,0.6894974942637364,0.004
+0.2,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-35_torch.float16_lf,0.702,0.7932731014186957,0.702,0.7342714734731689,1.0
+0.4,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-70_torch.float16_lf,0.742,0.78982949223512,0.742,0.7536681109811127,1.0
+0.6,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6596666666666666,0.7923396753604393,0.6596666666666666,0.7067542301676931,1.0
+0.8,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7146666666666667,0.7861341885687435,0.7146666666666667,0.7404677278137267,1.0
+1.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-175_torch.float16_lf,0.7326666666666667,0.7876867721932461,0.7326666666666667,0.7471869515031995,1.0
+1.2,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7016666666666667,0.7903119228393193,0.7016666666666667,0.7348708822385348,1.0
+1.4,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-245_torch.float16_lf,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
+1.6,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-280_torch.float16_lf,0.7156666666666667,0.7846106674095725,0.7156666666666667,0.7410042005708856,1.0
+1.8,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-315_torch.float16_lf,0.6916666666666667,0.7864256994491394,0.6916666666666667,0.7257499426487266,1.0
+2.0,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-350_torch.float16_lf,0.6976666666666667,0.7889443494370009,0.6976666666666667,0.7307996137659796,1.0
data/Qwen2-72B-Instruct_metrics.csv
CHANGED
@@ -1,8 +1,8 @@
-epoch,model,accuracy,precision,recall,f1
-0.0,Qwen/Qwen2-72B-Instruct_torch.bfloat16_4bit_lf,0.7473333333333333,0.804122252986722,0.7473333333333333,0.7607828719113865
-0.2,Qwen/Qwen2-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7583333333333333,0.8199928526815756,0.7583333333333333,0.782751089787442
-0.4,Qwen/Qwen2-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.7366666666666667,0.8224865755517643,0.7366666666666667,0.7700627366337021
-0.6,Qwen/Qwen2-72B-Instruct/checkpoint-105_torch.bfloat16_4bit_lf,0.757,0.8253824826209251,0.757,0.784000409833628
-0.8,Qwen/Qwen2-72B-Instruct/checkpoint-140_torch.bfloat16_4bit_lf,0.7893333333333333,0.8229104753645825,0.7893333333333333,0.8033124955993173
-1.0,Qwen/Qwen2-72B-Instruct/checkpoint-175_torch.bfloat16_4bit_lf,0.7376666666666667,0.8243654864769323,0.7376666666666667,0.7699617360961548
-1.2,Qwen/Qwen2-72B-Instruct/checkpoint-210_torch.bfloat16_4bit_lf,0.763,0.8318882808702871,0.763,0.7901075708186186
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen/Qwen2-72B-Instruct_torch.bfloat16_4bit_lf,0.7473333333333333,0.804122252986722,0.7473333333333333,0.7607828719113865,0.9773333333333334
+0.2,Qwen/Qwen2-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7583333333333333,0.8199928526815756,0.7583333333333333,0.782751089787442,1.0
+0.4,Qwen/Qwen2-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.7366666666666667,0.8224865755517643,0.7366666666666667,0.7700627366337021,1.0
+0.6,Qwen/Qwen2-72B-Instruct/checkpoint-105_torch.bfloat16_4bit_lf,0.757,0.8253824826209251,0.757,0.784000409833628,1.0
+0.8,Qwen/Qwen2-72B-Instruct/checkpoint-140_torch.bfloat16_4bit_lf,0.7893333333333333,0.8229104753645825,0.7893333333333333,0.8033124955993173,1.0
+1.0,Qwen/Qwen2-72B-Instruct/checkpoint-175_torch.bfloat16_4bit_lf,0.7376666666666667,0.8243654864769323,0.7376666666666667,0.7699617360961548,1.0
+1.2,Qwen/Qwen2-72B-Instruct/checkpoint-210_torch.bfloat16_4bit_lf,0.763,0.8318882808702871,0.763,0.7901075708186186,1.0
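Note that the 72B table has only eight rows (+8 -8; its run stops at epoch 1.2), so cross-model comparisons are safest keyed off each model's best checkpoint rather than a fixed epoch. A short sketch under the same pandas assumptions as above:

import glob

import pandas as pd

# Rank models by the weighted F1 of their best checkpoint.
for path in sorted(glob.glob("data/*_metrics.csv")):
    df = pd.read_csv(path)
    best = df.loc[df["f1"].idxmax()]
    print(f'{best["f1"]:.4f} at epoch {best["epoch"]}  {best["model"]}')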
data/Qwen2-7B-Instruct_metrics.csv
CHANGED
@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1
-0.0,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.
-0.2,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058
-0.4,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183
-0.6,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848
-0.8,Qwen/Qwen2-7B-Instruct/checkpoint-140_torch.float16_lf,0.725,0.7952719247171957,0.725,0.7476238017654298
-1.0,Qwen/Qwen2-7B-Instruct/checkpoint-175_torch.float16_lf,0.6756666666666666,0.7810148934939715,0.6756666666666666,0.708653993277772
-1.2,Qwen/Qwen2-7B-Instruct/checkpoint-210_torch.float16_lf,0.7013333333333334,0.7969562600853992,0.7013333333333334,0.7362679665494508
-1.4,Qwen/Qwen2-7B-Instruct/checkpoint-245_torch.float16_lf,0.7326666666666667,0.7922538479314682,0.7326666666666667,0.755402136631717
-1.6,Qwen/Qwen2-7B-Instruct/checkpoint-280_torch.float16_lf,0.6983333333333334,0.785127298428753,0.6983333333333334,0.7292251109166867
-1.8,Qwen/Qwen2-7B-Instruct/checkpoint-315_torch.float16_lf,0.6783333333333333,0.785390767631834,0.6783333333333333,0.7164131321837346
-2.0,Qwen/Qwen2-7B-Instruct/checkpoint-350_torch.float16_lf,0.689,0.7929715746898984,0.689,0.7259993126510194
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.6203333333333333,0.7554720257311661,0.6203333333333333,0.6731632664545455,0.9973333333333333
+0.2,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058,0.9996666666666667
+0.4,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
+0.6,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848,1.0
+0.8,Qwen/Qwen2-7B-Instruct/checkpoint-140_torch.float16_lf,0.725,0.7952719247171957,0.725,0.7476238017654298,1.0
+1.0,Qwen/Qwen2-7B-Instruct/checkpoint-175_torch.float16_lf,0.6756666666666666,0.7810148934939715,0.6756666666666666,0.708653993277772,1.0
+1.2,Qwen/Qwen2-7B-Instruct/checkpoint-210_torch.float16_lf,0.7013333333333334,0.7969562600853992,0.7013333333333334,0.7362679665494508,1.0
+1.4,Qwen/Qwen2-7B-Instruct/checkpoint-245_torch.float16_lf,0.7326666666666667,0.7922538479314682,0.7326666666666667,0.755402136631717,0.9996666666666667
+1.6,Qwen/Qwen2-7B-Instruct/checkpoint-280_torch.float16_lf,0.6983333333333334,0.785127298428753,0.6983333333333334,0.7292251109166867,1.0
+1.8,Qwen/Qwen2-7B-Instruct/checkpoint-315_torch.float16_lf,0.6783333333333333,0.785390767631834,0.6783333333333333,0.7164131321837346,1.0
+2.0,Qwen/Qwen2-7B-Instruct/checkpoint-350_torch.float16_lf,0.689,0.7929715746898984,0.689,0.7259993126510194,1.0
data/internlm2_5-7b-chat-1m_metrics.csv
CHANGED
@@ -1,12 +1,12 @@
-epoch,model,accuracy,precision,recall,f1
-0.0,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.5106666666666667,0.743213901498142,0.5106666666666667,0.5357333853323308
-0.2,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659
-0.4,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081
-0.6,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912
-0.8,internlm/internlm2_5-7b-chat-1m/checkpoint-140_torch.bfloat16_lf,0.803,0.8031411888150441,0.803,0.8028064320197301
-1.0,internlm/internlm2_5-7b-chat-1m/checkpoint-175_torch.bfloat16_lf,0.7676666666666667,0.8108441731715863,0.7676666666666667,0.7843187816704813
-1.2,internlm/internlm2_5-7b-chat-1m/checkpoint-210_torch.bfloat16_lf,0.7736666666666666,0.8091671780923799,0.7736666666666666,0.7876874850235454
-1.4,internlm/internlm2_5-7b-chat-1m/checkpoint-245_torch.bfloat16_lf,0.7623333333333333,0.8062291602218205,0.7623333333333333,0.777669094563925
-1.6,internlm/internlm2_5-7b-chat-1m/checkpoint-280_torch.bfloat16_lf,0.7553333333333333,0.8086197936829652,0.7553333333333333,0.7755588811428297
-1.8,internlm/internlm2_5-7b-chat-1m/checkpoint-315_torch.bfloat16_lf,0.748,0.8171996792797457,0.748,0.773990849396903
-2.0,internlm/internlm2_5-7b-chat-1m/checkpoint-350_torch.bfloat16_lf,0.756,0.8126875394266148,0.756,0.7777812522863184
+epoch,model,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.5106666666666667,0.743213901498142,0.5106666666666667,0.5357333853323308,1.0
+0.2,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659,1.0
+0.4,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081,1.0
+0.6,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912,1.0
+0.8,internlm/internlm2_5-7b-chat-1m/checkpoint-140_torch.bfloat16_lf,0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
+1.0,internlm/internlm2_5-7b-chat-1m/checkpoint-175_torch.bfloat16_lf,0.7676666666666667,0.8108441731715863,0.7676666666666667,0.7843187816704813,1.0
+1.2,internlm/internlm2_5-7b-chat-1m/checkpoint-210_torch.bfloat16_lf,0.7736666666666666,0.8091671780923799,0.7736666666666666,0.7876874850235454,1.0
+1.4,internlm/internlm2_5-7b-chat-1m/checkpoint-245_torch.bfloat16_lf,0.7623333333333333,0.8062291602218205,0.7623333333333333,0.777669094563925,1.0
+1.6,internlm/internlm2_5-7b-chat-1m/checkpoint-280_torch.bfloat16_lf,0.7553333333333333,0.8086197936829652,0.7553333333333333,0.7755588811428297,1.0
+1.8,internlm/internlm2_5-7b-chat-1m/checkpoint-315_torch.bfloat16_lf,0.748,0.8171996792797457,0.748,0.773990849396903,1.0
+2.0,internlm/internlm2_5-7b-chat-1m/checkpoint-350_torch.bfloat16_lf,0.756,0.8126875394266148,0.756,0.7777812522863184,1.0
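Across all six tables, the newly added ratio_valid_classifications column is the one to watch: it records how often the raw model output was already one of the known labels. The untuned models at epoch 0.0 vary wildly (about 0.01 for Llama3.1-70B, 0.004 for Mistral-7B-v0.3, 0.39 for Llama3.1-8B, but 0.98-1.0 for the Qwen2 and internlm2.5 models), while virtually every fine-tuned checkpoint reaches 1.0. A small sketch, again assuming pandas and the data/ layout above, pulls the base-model row out of each file:

import glob

import pandas as pd

# How often did each *untuned* model emit a directly parseable label?
for path in sorted(glob.glob("data/*_metrics.csv")):
    df = pd.read_csv(path)
    base = df.loc[df["epoch"] == 0.0].iloc[0]
    print(f'{base["ratio_valid_classifications"]:.4f}  {base["model"]}')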
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -95,7 +95,7 @@ def get_prompt_template(using_p1=True, chinese_prompt=True):
 
 
 def extract_answer(text, debug=False):
-    if text:
+    if text and isinstance(text, str):
         # Remove the begin and end tokens
         text = re.sub(
             r".*?(assistant|\[/INST\]).+?\b",
@@ -117,6 +117,7 @@ def extract_answer(text, debug=False):
             print("--------\nstep 3:", text)
 
         text = text.split(".")[0].strip()
+        text = text.split("。")[0].strip()
         if debug:
             print("--------\nstep 4:", text)
 
@@ -129,7 +130,9 @@ def extract_answer(text, debug=False):
         if debug:
             print("--------\nstep 5:", text)
 
-    return text.strip()
+        return text.strip()
+
+    return ""
 
 
 def calc_metrics(references, predictions, debug=False):
@@ -137,16 +140,33 @@ def calc_metrics(references, predictions, debug=False):
         predictions
     ), f"lengths are difference: {len(references)} != {len(predictions)}"
 
+    labels = np.unique(references)
+    valid_classifications = [1 if p in labels else 0 for p in predictions]
+
     predictions = [extract_answer(text) for text in predictions]
 
-    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
-    accuracy = sum(correct) / len(references)
+    accuracy = accuracy_score(references, predictions)
 
     results = {"accuracy": accuracy}
     if debug:
-        incorrect_ids = [i for i,
+        incorrect_ids = [i for i, p in enumerate(predictions) if p != references[i]]
         results["incorrect_ids"] = incorrect_ids
 
+    precision = precision_score(
+        references, predictions, average="weighted", labels=labels
+    )
+    results["precision"] = float(precision)
+
+    recall = recall_score(references, predictions, average="weighted", labels=labels)
+    results["recall"] = float(recall)
+
+    f1 = f1_score(references, predictions, average="weighted", labels=labels)
+    results["f1"] = float(f1)
+
+    results["ratio_valid_classifications"] = sum(valid_classifications) / len(
+        valid_classifications
+    )
+
     return results
 
 
@@ -240,7 +260,7 @@ def get_metrics(df):
     rouge_l = []
     all_metrics = []
     for col in df.columns[2:]:
-        metrics = calc_metrics(df["
+        metrics = calc_metrics(df["label"], df[col], debug=True)
         print(f"{col}: {metrics}")
 
         accuracy.append(metrics["accuracy"])
@@ -290,38 +310,37 @@ def load_alpaca_data(data_path, using_p1=True, use_english_datasets=False):
     return df_alpaca
 
 
-def plot_value_counts(df,
+def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=None):
     font_family = rcParams["font.family"]
     # Set the font to SimHei to support Chinese characters
     rcParams["font.family"] = "STHeiti"
     rcParams["axes.unicode_minus"] = (
         False  # This is to support the minus sign in Chinese.
     )
+    if preprocess_func:
+        df["backup"] = df[column_name]
+        df[column_name] = df[column_name].apply(preprocess_func)
 
     plt.figure(figsize=(12, 6))
-    df[
+    df[column_name].value_counts().plot(kind="bar")
     # add values on top of bars
-    for i, v in enumerate(df[
-        plt.text(i, v +
+    for i, v in enumerate(df[column_name].value_counts()):
+        plt.text(i, v + offset, str(v), ha="center")
 
-    plt.xlabel(title or
+    plt.xlabel(title or column_name)
 
     plt.show()
 
     rcParams["font.family"] = font_family
 
+    if preprocess_func:
+        df[column_name] = df["backup"]
+        df.drop(columns=["backup"], inplace=True)
+
 
-def calc_metrics_for_col(df, col):
-    y_true = df["label"]
-    y_pred = df[col]
-    labels = np.unique(y_true)
 
-    accuracy = accuracy_score(y_true, y_pred)
-    precision = precision_score(y_true, y_pred, average="weighted", labels=labels)
-    recall = recall_score(y_true, y_pred, average="weighted", labels=labels)
-    f1 = f1_score(y_true, y_pred, average="weighted", labels=labels)
-
-    return accuracy, float(precision), float(recall), float(f1)
+def calc_metrics_for_col(df, col):
+    metrics = calc_metrics(df["label"], df[col], debug=True)
+    return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
 
 
 def get_metrics_df(df):
@@ -329,15 +348,12 @@ def get_metrics_df(df):
         columns=["epoch", "model", "accuracy", "precision", "recall", "f1"]
     )
     for i, col in enumerate(df.columns[5:]):
-        accuracy, precision, recall, f1 = calc_metrics_for_col(df, col)
+        metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
             "epoch": i / 5,
             "model": col,
-            "accuracy": accuracy,
-            "precision": precision,
-            "recall": recall,
-            "f1": f1,
         }
+        new_model_metrics.update(metrics)
 
         # Convert the dictionary to a DataFrame and concatenate it with the existing DataFrame
         perf_df = pd.concat(
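To summarize the calc_metrics change for readers skimming the diff: precision, recall, and F1 are now computed with scikit-learn (weighted over the reference label set), and ratio_valid_classifications is computed on the raw outputs before extract_answer runs, so it measures format compliance separately from answer quality. That is why a base model can post 0.76 accuracy with a 0.0097 valid ratio: extract_answer still salvages answers from free-form text. Below is a self-contained sketch of the resulting logic, assuming scikit-learn and NumPy are available and omitting the extract_answer cleanup (which depends on each model's chat template):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def calc_metrics(references, predictions, debug=False):
    assert len(references) == len(predictions)

    labels = np.unique(references)
    # Share of raw outputs that are already an exact label (format compliance).
    valid = [1 if p in labels else 0 for p in predictions]

    results = {
        "accuracy": accuracy_score(references, predictions),
        "precision": float(
            precision_score(references, predictions, average="weighted", labels=labels)
        ),
        "recall": float(
            recall_score(references, predictions, average="weighted", labels=labels)
        ),
        "f1": float(
            f1_score(references, predictions, average="weighted", labels=labels)
        ),
        "ratio_valid_classifications": sum(valid) / len(valid),
    }
    if debug:
        results["incorrect_ids"] = [
            i for i, p in enumerate(predictions) if p != references[i]
        ]
    return results


# Toy check: two of three predictions correct, all three are valid labels.
print(calc_metrics(["A", "B", "A"], ["A", "B", "B"], debug=True))

Keeping the valid ratio out of accuracy is a useful design choice: it distinguishes "cannot follow the output format" failures from genuine reasoning errors, which the per-checkpoint CSVs above make visible at epoch 0.0.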
notebooks/00_Data Analysis.ipynb
CHANGED
The diff for this file is too large to render.

notebooks/01_internlm2_5-7b-chat-1m_analysis.ipynb
DELETED
The diff for this file is too large to render.

notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb
ADDED
The diff for this file is too large to render.

notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb
ADDED
The diff for this file is too large to render.

notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb
CHANGED
The diff for this file is too large to render.

notebooks/02b_Qwen2-72B-Instruct_analysis.ipynb
CHANGED
The diff for this file is too large to render.

notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb
CHANGED
The diff for this file is too large to render.

notebooks/03b_Llama3.1-70B-Chinese-Chat_analysis.ipynb
ADDED
The diff for this file is too large to render.