YKKY
committed on
Commit
·
81f0fb3
1
Parent(s):
d09b361
add sample and test
Browse files- README.md +9 -0
- sample.py +34 -0
- task100.csv +0 -0
- task100.py +55 -0
README.md
CHANGED
@@ -8,5 +8,14 @@ ref: https://huggingface.co/elyza/ELYZA-japanese-Llama-2-13b-fast-instruct/blob/
|
|
8 |
量子化の詳細はconfig.jsonを参照してください。
|
9 |
コードはquantize.pyです。
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
## LICENSE
|
12 |
LICENSE: LLAMA2
|
|
|
8 |
量子化の詳細はconfig.jsonを参照してください。
|
9 |
コードはquantize.pyです。
|
10 |
|
11 |
+
|
12 |
+
## Test
|
13 |
+
elyza/ELYZA-tasks-100 の実行結果
|
14 |
+
→ task100.csv
|
15 |
+
|
16 |
+
実行時間
|
17 |
+
開始: 2024-05-09 03:49:30.779783
|
18 |
+
終了: 2024-05-09 03:56:56.380691
|
19 |
+
|
20 |
## LICENSE
|
21 |
LICENSE: LLAMA2
|
sample.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Minimal inference sample for the locally quantized Llama-2 model.

Loads the model from the current directory, builds a Llama-2 style
instruction prompt, and prints the generated answer for one fixed
Japanese question.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Llama-2 chat prompt delimiters.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"
text = "仕事の熱意を取り戻すためのアイデアを5つ挙げてください。"

# Model weights are expected to live next to this script.
model_name = "./"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    device_map="auto",
)
model.eval()

prompt = "{bos_token}{b_inst} {system}{prompt} {e_inst} ".format(
    bos_token=tokenizer.bos_token,
    b_inst=B_INST,
    system=f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}",
    prompt=text,
    e_inst=E_INST,
)
# BOS is already embedded in the formatted prompt, so don't add special tokens.
token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
# Fix: inference only — disable autograd so generation does not track
# gradients (matches the torch.no_grad() usage in task100.py's pred()).
with torch.no_grad():
    output_ids = model.generate(
        token_ids.to(model.device),
        max_new_tokens=256,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
# Slice off the prompt tokens; decode only the newly generated continuation.
output = tokenizer.decode(output_ids.tolist()[0][token_ids.size(1):], skip_special_tokens=True)
print(output)
|
task100.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
task100.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Run the ELYZA-tasks-100 benchmark against the local model and save a CSV."""
import datetime
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Llama-2 chat prompt delimiters.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "あなたは誠実で優秀な日本人のアシスタントです。"


def main():
    """Generate an answer for every ELYZA-tasks-100 example and dump a CSV.

    Prints a timestamp before and after the mapping step so the total
    runtime can be recorded (as in the README's Test section).
    """
    # Model weights are expected to live next to this script.
    model_name = "./"
    ds = load_dataset("elyza/ELYZA-tasks-100", revision="1.0.0")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=True,
        device_map="auto",
    )
    model.eval()

    def pred(example):
        """Map function: generate a completion for one dataset example."""
        # NOTE(review): unlike sample.py, there is no space before {e_inst}
        # here; kept as-is because changing the prompt would change outputs.
        prompt = "{bos_token}{b_inst} {system}{prompt}{e_inst} ".format(
            bos_token=tokenizer.bos_token,
            b_inst=B_INST,
            system=f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}",
            prompt=example["input"],
            e_inst=E_INST,
        )

        # BOS is already embedded in the prompt; don't add special tokens.
        token_ids = tokenizer.encode(
            prompt, add_special_tokens=False, return_tensors="pt"
        )

        # Inference only: no gradient tracking during generation.
        with torch.no_grad():
            output_ids = model.generate(
                token_ids.to(model.device),
                max_new_tokens=1200,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        # Slice off the prompt tokens; decode only the generated continuation.
        output = tokenizer.decode(
            output_ids.tolist()[0][token_ids.size(1):], skip_special_tokens=True
        )
        # Store the prediction in a column named after the model path.
        example[model_name] = output
        return example

    print(datetime.datetime.now())
    ds = ds.map(pred, batched=False)
    print(datetime.datetime.now())
    # Fix: to_csv raises FileNotFoundError when the preds/ directory is
    # missing — create it first so a long benchmark run can't fail at the end.
    os.makedirs("preds", exist_ok=True)
    ds["test"].to_csv(f"preds/{model_name.replace('/', '-')}.csv", index=False)


if __name__ == "__main__":
    main()
|