dh-mc committed

Commit cf912f1 · 1 Parent(s): 397a2fa

ready for few shots eval

llm_toolkit/eval.py DELETED
@@ -1,67 +0,0 @@
- import os
- import sys
- import torch
- from dotenv import find_dotenv, load_dotenv
-
- found_dotenv = find_dotenv(".env")
-
- if len(found_dotenv) == 0:
-     found_dotenv = find_dotenv(".env.example")
- print(f"loading env vars from: {found_dotenv}")
- load_dotenv(found_dotenv, override=False)
-
- path = os.path.dirname(found_dotenv)
- print(f"Adding {path} to sys.path")
- sys.path.append(path)
-
- from llm_toolkit.translation_engine import *
- from llm_toolkit.translation_utils import *
-
- model_name = os.getenv("MODEL_NAME")
- adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
- load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
- data_path = os.getenv("DATA_PATH")
- results_path = os.getenv("RESULTS_PATH")
-
- print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- model, tokenizer = load_model(
-     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
- )
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- datasets = load_translation_dataset(data_path, tokenizer)
-
- print("Evaluating model: " + model_name)
- predictions = eval_model(model, tokenizer, datasets["test"])
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- if adapter_name_or_path is not None:
-     model_name += "_" + adapter_name_or_path.split("/")[-1]
-
- save_results(
-     model_name,
-     results_path,
-     datasets["test"],
-     predictions,
-     debug=True,
- )
-
- metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
- print(metrics)
llm_toolkit/eval_lf.py DELETED
@@ -1,110 +0,0 @@
- import os
- import sys
- import torch
- from dotenv import find_dotenv, load_dotenv
- from llamafactory.chat import ChatModel
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
- found_dotenv = find_dotenv(".env")
-
- if len(found_dotenv) == 0:
-     found_dotenv = find_dotenv(".env.example")
- print(f"loading env vars from: {found_dotenv}")
- load_dotenv(found_dotenv, override=False)
-
- path = os.path.dirname(found_dotenv)
- print(f"Adding {path} to sys.path")
- sys.path.append(path)
-
- from llm_toolkit.translation_utils import *
-
- model_name = os.getenv("MODEL_NAME")
- adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
- load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
- data_path = os.getenv("DATA_PATH")
- results_path = os.getenv("RESULTS_PATH")
-
- print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
-
-
- def load_model(
-     model_name,
-     max_seq_length=2048,
-     dtype=torch.bfloat16,
-     load_in_4bit=False,
-     adapter_name_or_path=None,
- ):
-     print(f"loading model: {model_name}")
-
-     if adapter_name_or_path:
-         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
-
-         args = dict(
-             model_name_or_path=model_name,
-             adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-             template=template,  # same to the one in training
-             finetuning_type="lora",  # same to the one in training
-             quantization_bit=4 if load_in_4bit else None,  # load 4-bit quantized model
-         )
-         chat_model = ChatModel(args)
-         return chat_model.engine.model, chat_model.engine.tokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=load_in_4bit,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_use_double_quant=False,
-         bnb_4bit_compute_dtype=dtype,
-     )
-
-     model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         quantization_config=bnb_config,
-         torch_dtype=dtype,
-         trust_remote_code=True,
-         device_map="auto",
-     )
-
-     return model, tokenizer
-
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(1) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- model, tokenizer = load_model(
-     model_name, load_in_4bit=load_in_4bit, adapter_name_or_path=adapter_name_or_path
- )
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- datasets = load_translation_dataset(data_path, tokenizer)
-
- print("Evaluating model: " + model_name)
- predictions = eval_model(model, tokenizer, datasets["test"])
-
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
- print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
- print(f"{start_gpu_memory} GB of memory reserved.")
-
- if adapter_name_or_path is not None:
-     model_name += "_" + adapter_name_or_path.split("/")[-1]
-
- save_results(
-     model_name,
-     results_path,
-     datasets["test"],
-     predictions,
-     debug=True,
- )
-
- metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
- print(metrics)
llm_toolkit/eval_shots.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import sys
+ import torch
+ from dotenv import find_dotenv, load_dotenv
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from llm_toolkit.llm_utils import *
+ from llm_toolkit.logical_reasoning_utils import *
+
+ device = check_gpu()
+ is_cuda = torch.cuda.is_available()
+
+ model_name = os.getenv("MODEL_NAME")
+ adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
+ load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
+ data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
+ results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
+ use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
+ batch_size = int(os.getenv("BATCH_SIZE", 1))
+ using_llama_factory = os.getenv("USING_LLAMA_FACTORY") == "true"
+ max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
+ start_num_shots = int(os.getenv("START_NUM_SHOTS", 0))
+
+ print(
+     model_name,
+     adapter_name_or_path,
+     load_in_4bit,
+     data_path,
+     results_path,
+     max_new_tokens,
+     batch_size,
+ )
+
+ dtype = (
+     torch.float32
+     if os.getenv("USE_FLOAT32_FOR_INFERENCE") == "true"
+     else (
+         torch.bfloat16
+         if os.getenv("USE_BF16_FOR_INFERENCE") == "true"
+         else torch.float16
+     )
+ )
+
+ if is_cuda:
+     torch.cuda.empty_cache()
+     gpu_stats = torch.cuda.get_device_properties(0)
+     start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+     max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+     print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+     print(f"{start_gpu_memory} GB of memory reserved.")
+
+ torch.cuda.empty_cache()
+
+ model, tokenizer = load_model(
+     model_name,
+     load_in_4bit=load_in_4bit,
+     adapter_name_or_path=adapter_name_or_path,
+     using_llama_factory=using_llama_factory,
+     dtype=dtype,
+ )
+
+ if is_cuda:
+     gpu_stats = torch.cuda.get_device_properties(0)
+     start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+     max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+     print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+     print(f"{start_gpu_memory} GB of memory reserved.")
+
+
+ def on_num_shots_step_completed(model_name, dataset, predictions):
+     save_results(
+         model_name,
+         results_path,
+         dataset,
+         predictions,
+     )
+
+     metrics = calc_metrics(dataset["label"], predictions, debug=True)
+     print(f"{model_name} metrics: {metrics}")
+
+
+ if adapter_name_or_path is not None:
+     model_name += "/" + adapter_name_or_path.split("/")[-1]
+
+
+ def evaluate_model_with_num_shots(
+     model,
+     tokenizer,
+     model_name,
+     data_path,
+     start_num_shots=0,
+     range_num_shots=[10],
+     batch_size=1,
+     max_new_tokens=2048,
+     device="cuda",
+ ):
+     print(f"Evaluating model: {model_name} on {device}")
+
+     for num_shots in range_num_shots:
+         if num_shots < start_num_shots:
+             continue
+
+         print(f"*** Evaluating with num_shots: {num_shots}")
+
+         datasets = load_logical_reasoning_dataset(
+             data_path,
+             tokenizer=tokenizer,
+             chinese_prompt=not use_english_datasets,
+             using_p1=False,
+         )
+         if len(sys.argv) > 1:
+             num = int(sys.argv[1])
+             if num > 0:
+                 print(f"--- evaluating {num} entries")
+                 datasets["test"] = datasets["test"].select(range(num))
+
+         print_row_details(datasets["test"].to_pandas(), indices=[0, -1])
+
+         predictions = eval_model(
+             model,
+             tokenizer,
+             datasets["test"],
+             device=device,
+             batch_size=batch_size,
+             max_new_tokens=max_new_tokens,
+         )
+
+         model_name_with_rp = f"{model_name}/shots-{num_shots:02d}"
+
+         try:
+             on_num_shots_step_completed(
+                 model_name_with_rp,
+                 datasets["test"],
+                 predictions,
+             )
+         except Exception as e:
+             print(e)
+
+
+ evaluate_model_with_num_shots(
+     model,
+     tokenizer,
+     model_name,
+     data_path,
+     batch_size=batch_size,
+     max_new_tokens=max_new_tokens,
+     device=device,
+     start_num_shots=start_num_shots,
+ )
+
+ if is_cuda:
+     gpu_stats = torch.cuda.get_device_properties(0)
+     start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+     max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+     print(f"(3) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+     print(f"{start_gpu_memory} GB of memory reserved.")
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -294,7 +294,7 @@ def load_logical_reasoning_dataset(
 
     model_name = os.getenv("MODEL_NAME")
 
-    if "mistral" in model_name.lower() or "gemma" in model_name.lower():
+    if "gemma" in model_name.lower():
         messages = messages[1:]
 
     texts = []
notebooks/05_Few-shot_Prompting_Anthropic.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/05b_Anthropic-Models_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/eval-mgtv-shots.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/..
+ echo Current Directory:
+ pwd
+
+ BASEDIR=`pwd`
+
+ nvidia-smi
+ uname -a
+ cat /etc/os-release
+ lscpu
+ grep MemTotal /proc/meminfo
+
+ $BASEDIR/scripts/eval-shots.sh shenzhi-wang Mistral-7B-v0.3-Chinese-Chat
+
+ $BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat
+
+ $BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat-1m
+
+ $BASEDIR/scripts/eval-shots.sh Qwen Qwen2-7B-Instruct
+
+ $BASEDIR/scripts/eval-shots.sh shenzhi-wang Llama3.1-8B-Chinese-Chat
scripts/eval-shots.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/sh
+
+ BASEDIR=$(dirname "$0")
+ cd $BASEDIR/..
+ echo Current Directory:
+ pwd
+
+ export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
+ export RESIZE_TOKEN_EMBEDDINGS=true
+ export USING_LLAMA_FACTORY=true
+ export USING_P1_PROMPT_TEMPLATE=false
+ export LOAD_IN_4BIT=false
+
+ export ORG_NAME=$1
+ export MODEL=$2
+ export MODEL_NAME=$ORG_NAME/$MODEL
+
+ export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
+
+ echo Evaluating $MODEL_NAME with few-shot learning
+ python llm_toolkit/eval_shots.py
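
eval-shots.sh wraps that entry point for a single model: it takes the Hugging Face org name and model name as its two positional arguments, builds MODEL_NAME from them, points LOGICAL_REASONING_RESULTS_PATH at data/<MODEL>_results.csv, and then runs llm_toolkit/eval_shots.py; eval-mgtv-shots.sh simply chains such calls for several models. A typical standalone call, mirroring the ones in eval-mgtv-shots.sh, would be:

    ./scripts/eval-shots.sh Qwen Qwen2-7B-Instruct    # results written to data/Qwen2-7B-Instruct_results.csv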