dh-mc committed on
Commit
d2150e8
·
1 Parent(s): 545719f

try 5-shot for open-source models

Browse files
data/best_metrics.csv CHANGED
@@ -7,6 +7,7 @@ index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
7
  6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
8
  7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
9
  8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
10
- 9,gpt-4o-mini (10-shot),gpt-4o-mini (10-shot),0.6793333333333333,0.7728086050218999,0.6793333333333333,0.6916749681933937,0.9996666666666667
11
  10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
12
  11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
 
 
7
  6,Llama3.1-70B-Chinese-Chat,Llama3.1-70B-Chinese-Chat,0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
8
  7,Qwen2-72B-Instruct,Qwen2-72B-Instruct,0.784,0.8354349234761956,0.784,0.804194683154365,1.0
9
  8,Ensemble Model,Ensemble Model,0.8193333333333334,0.8407464756633664,0.8193333333333334,0.828054127213081,1.0
10
+ 9,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
11
  10,o1-mini (10-shot),o1-mini (10-shot),0.725,0.7892485648334764,0.725,0.7485623974683336,1.0
12
  11,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
13
+ 12,o1-preview (10-shot),o1-preview (10-shot),0.749,0.7964482186234537,0.749,0.7677316493549238,1.0
data/best_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/openai_metrics.csv CHANGED
@@ -13,5 +13,8 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
13
  30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
14
  40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
15
  50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
 
16
  10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
 
 
17
  10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
 
13
  30,gpt-4o,gpt-4o/shots-30,0.7886666666666666,0.8260847852316618,0.7886666666666666,0.8030949295928699,0.999
14
  40,gpt-4o,gpt-4o/shots-40,0.784,0.8233509309291644,0.784,0.7993336791122846,0.9973333333333333
15
  50,gpt-4o,gpt-4o/shots-50,0.787,0.8234800466218334,0.787,0.8013530974301947,0.9993333333333333
16
+ 0,o1-mini,o1-mini/shots-00,0.7083333333333334,0.7848098266888749,0.7083333333333334,0.7377068425566796,0.999
17
  10,o1-mini,o1-mini/shots-10,0.725,0.7892485648334764,0.725,0.7485623974683336,0.9943333333333333
18
+ 0,o1-preview,o1-preview/shots-00,0.721,0.7849371317342158,0.721,0.7451207069815194,0.998
19
+ 10,o1-preview,o1-preview/shots-10,0.749,0.7964482186234537,0.749,0.7677316493549238,0.9873333333333333
20
  10,gpt-4o-mini_batch,gpt-4o-mini_batch/shots-10,0.6576666666666666,0.7689201800674901,0.6576666666666666,0.6748319385295091,0.996
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-176/merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-264/merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/qwen2_7b/lora/sft_4bit/checkpoint-88/merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/qwen2_7b/lora/sft_4bit/merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
llm_toolkit/eval_shots.py CHANGED
@@ -99,7 +99,7 @@ def evaluate_model_with_num_shots(
99
  model_name,
100
  data_path,
101
  start_num_shots=0,
102
- range_num_shots=[0, 10],
103
  batch_size=1,
104
  max_new_tokens=2048,
105
  device="cuda",
 
99
  model_name,
100
  data_path,
101
  start_num_shots=0,
102
+ range_num_shots=[0, 5],
103
  batch_size=1,
104
  max_new_tokens=2048,
105
  device="cuda",
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -429,11 +429,15 @@ def get_metrics_df(df, variant="epoch"):
429
  perf_df = pd.DataFrame(
430
  columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
431
  )
432
- columns = [
433
- col
434
- for col in df.columns[5:]
435
- if variant in col or variant == "epoch" and "_torch." in col
436
- ]
 
 
 
 
437
  print("columns:", columns)
438
  for i, col in enumerate(columns):
439
  metrics = calc_metrics(df["label"], df[col], debug=False)
@@ -445,7 +449,8 @@ def get_metrics_df(df, variant="epoch"):
445
  if variant == "shots":
446
  parts = col.split("/shots-")
447
  new_model_metrics["shots"] = int(parts[1])
448
- # new_model_metrics["model"] = parts[0]
 
449
 
450
  new_model_metrics.update(metrics)
451
 
 
429
  perf_df = pd.DataFrame(
430
  columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
431
  )
432
+ columns = (
433
+ df.columns[5:]
434
+ if variant == "index"
435
+ else [
436
+ col
437
+ for col in df.columns[5:]
438
+ if variant in col or variant == "epoch" and "_torch." in col
439
+ ]
440
+ )
441
  print("columns:", columns)
442
  for i, col in enumerate(columns):
443
  metrics = calc_metrics(df["label"], df[col], debug=False)
 
449
  if variant == "shots":
450
  parts = col.split("/shots-")
451
  new_model_metrics["shots"] = int(parts[1])
452
+ if variant in new_model_metrics["model"]:
453
+ new_model_metrics["model"] = parts[0]
454
 
455
  new_model_metrics.update(metrics)
456
 
notebooks/00_Data Analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/04b_OpenAI-Models_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/eval-mgtv-shots_4bit.sh CHANGED
@@ -13,7 +13,7 @@ cat /etc/os-release
13
  lscpu
14
  grep MemTotal /proc/meminfo
15
 
16
- export START_NUM_SHOTS=10
17
 
18
  #$BASEDIR/scripts/eval-mgtv-internlm-20b.sh
19
 
 
13
  lscpu
14
  grep MemTotal /proc/meminfo
15
 
16
+ export START_NUM_SHOTS=5
17
 
18
  #$BASEDIR/scripts/eval-mgtv-internlm-20b.sh
19