YKKY committed on
Commit
81f0fb3
·
1 Parent(s): d09b361

add sample and test

Browse files
Files changed (4) hide show
  1. README.md +9 -0
  2. sample.py +34 -0
  3. task100.csv +0 -0
  4. task100.py +55 -0
README.md CHANGED
@@ -8,5 +8,14 @@ ref: https://huggingface.co/elyza/ELYZA-japanese-Llama-2-13b-fast-instruct/blob/
8
  量子化の詳細はconfig.jsonを参照してください。
9
  コードはquantize.pyです。
10
 
 
 
 
 
 
 
 
 
 
11
  ## LICENSE
12
  LICENSE: LLAMA2
 
8
  量子化の詳細はconfig.jsonを参照してください。
9
  コードはquantize.pyです。
10
 
11
+
12
+ ## Test
13
+ elyza/ELYZA-tasks-100 の実行結果
14
+ → task100.csv
15
+
16
+ 実行時間
17
+ 開始: 2024-05-09 03:49:30.779783
18
+ 終了: 2024-05-09 03:56:56.380691
19
+
20
  ## LICENSE
21
  LICENSE: LLAMA2
sample.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Sample inference script for the quantized ELYZA-japanese-Llama-2 model.
# Builds one Llama-2 chat-format prompt and prints the model's reply.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Llama-2 chat prompt delimiters: instruction wrappers and system-prompt markers.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"
text = "仕事の熱意を取り戻すためのアイデアを5つ挙げてください。"

# Load the model/tokenizer from the current directory (this repo's root).
model_name = "./"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    device_map="auto"
)
model.eval()

# Assemble: <bos>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
prompt = "{bos_token}{b_inst} {system}{prompt} {e_inst} ".format(
    bos_token=tokenizer.bos_token,
    b_inst=B_INST,
    system=f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}",
    prompt=text,
    e_inst=E_INST,
)
token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
# Inference only — disable autograd bookkeeping (same pattern task100.py uses).
with torch.no_grad():
    output_ids = model.generate(
        token_ids.to(model.device),
        max_new_tokens=256,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens (drop the prompt prefix).
output = tokenizer.decode(output_ids.tolist()[0][token_ids.size(1) :], skip_special_tokens=True)
print(output)
task100.csv ADDED
The diff for this file is too large to render. See raw diff
 
task100.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import datetime
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
5
+
6
# Llama-2 chat-format delimiters: instruction wrappers and system-prompt
# markers used when building each prompt in main().
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# Default system prompt (Japanese): "You are a sincere and excellent Japanese assistant."
DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"
9
+
10
+
11
def main():
    """Run the elyza/ELYZA-tasks-100 benchmark against the local model.

    Loads the model/tokenizer from the current directory, generates an
    answer for every task in the dataset, stores each answer in a column
    named after the model path, and writes the test split to
    ``preds/<model>.csv``. Start/end timestamps are printed around the
    generation loop.
    """
    model_name = "./"
    ds = load_dataset("elyza/ELYZA-tasks-100", revision="1.0.0")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=True,
        device_map="auto"
    )
    model.eval()

    def pred(example):
        # Build the Llama-2 chat prompt:
        # <bos>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{task input}[/INST]
        prompt = "{bos_token}{b_inst} {system}{prompt}{e_inst} ".format(
            bos_token=tokenizer.bos_token,
            b_inst=B_INST,
            system=f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}",
            prompt=example["input"],
            e_inst=E_INST,
        )

        token_ids = tokenizer.encode(
            prompt, add_special_tokens=False, return_tensors="pt"
        )

        # Inference only — no gradient bookkeeping.
        with torch.no_grad():
            output_ids = model.generate(
                token_ids.to(model.device),
                max_new_tokens=1200,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens (drop the prompt prefix).
        output = tokenizer.decode(
            output_ids.tolist()[0][token_ids.size(1) :], skip_special_tokens=True
        )
        example[model_name] = output
        return example

    print(datetime.datetime.now())  # start time
    ds = ds.map(pred, batched=False)
    print(datetime.datetime.now())  # end time
    # to_csv raises FileNotFoundError if the target directory is missing,
    # which would discard the (long) generation run — create it up front.
    os.makedirs("preds", exist_ok=True)
    ds["test"].to_csv(f"preds/{model_name.replace('/', '-')}.csv", index=False)
52
+
53
+
54
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()