Spaces:
Runtime error
Runtime error
[20230821121100]
Browse files- .gitignore +6 -0
- README.md +2 -2
- examples/exercises/firefly_bloom_1b4/1.train_model.py +150 -0
- examples/exercises/firefly_bloom_1b4/2.test_sft_model.py +81 -0
- examples/exercises/firefly_bloom_1b4/ds_z3_config.json +46 -0
- examples/exercises/firefly_bloom_1b4/run.sh +192 -0
- examples/exercises/firefly_bloom_1b4/stop.sh +4 -0
- main.py +121 -0
- project_settings.py +20 -0
- requirements.txt +11 -0
- toolbox/json/__init__.py +6 -0
- toolbox/json/misc.py +63 -0
- toolbox/os/__init__.py +6 -0
- toolbox/os/environment.py +114 -0
- toolbox/os/other.py +9 -0
- trained_models/bloom-1b4-sft/config.json +33 -0
- trained_models/bloom-1b4-sft/pytorch_model.bin +3 -0
- trained_models/bloom-1b4-sft/special_tokens_map.json +6 -0
- trained_models/bloom-1b4-sft/tokenizer.json +0 -0
- trained_models/bloom-1b4-sft/tokenizer_config.json +11 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
.git/
|
3 |
+
.idea/
|
4 |
+
|
5 |
+
**/flagged/
|
6 |
+
**/__pycache__/
|
README.md
CHANGED
@@ -4,8 +4,8 @@ emoji: 🐠
|
|
4 |
colorFrom: red
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
-
app_file:
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.20.1
|
8 |
+
app_file: main.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
examples/exercises/firefly_bloom_1b4/1.train_model.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import os
|
5 |
+
import platform
|
6 |
+
|
7 |
+
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
8 |
+
|
9 |
+
from datasets import Dataset, DatasetDict, load_dataset
|
10 |
+
from transformers.data.data_collator import DataCollatorForLanguageModeling
|
11 |
+
from transformers import BloomTokenizerFast, BloomForCausalLM
|
12 |
+
from transformers.trainer import Trainer
|
13 |
+
from transformers.training_args import TrainingArguments
|
14 |
+
|
15 |
+
|
16 |
+
def get_args():
    """Parse command-line arguments for SFT training of a Bloom causal-LM.

    Defaults mirror the original exercise setup (local Windows paths, the
    firefly JSONL dataset); override them on the command line elsewhere.

    Returns:
        argparse.Namespace with all training options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_file",
        # default='firefly-train-1.1M.jsonl',
        default="D:/programmer/nlp_datasets/firefly-train-1.1M.jsonl",
        type=str
    )
    parser.add_argument(
        "--pretrained_model_name_or_path",
        # default='YeungNLP/bloom-1b4-zh',
        default="D:/programmer/nlp_pretrained_model/bloom-1b7",
        type=str,
    )
    parser.add_argument("--cache_dir", default="cache_dir", type=str)

    parser.add_argument("--output_dir", default="serialization_dir", type=str)
    parser.add_argument("--overwrite_output_dir", action="store_true")
    parser.add_argument("--evaluation_strategy", default="no", choices=["no", "steps", "epoch"], type=str)
    parser.add_argument("--per_device_train_batch_size", default=4, type=int)
    parser.add_argument("--gradient_accumulation_steps", default=4, type=int)
    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--weight_decay", default=0, type=float)
    parser.add_argument("--max_grad_norm", default=1.0, type=float)
    parser.add_argument("--num_train_epochs", default=3.0, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--lr_scheduler_type", default="cosine", type=str)
    parser.add_argument("--warmup_ratio", default=0.0, type=float)
    parser.add_argument("--warmup_steps", default=3000, type=int)
    parser.add_argument("--logging_steps", default=300, type=int)
    parser.add_argument("--save_strategy", default="steps", type=str)
    parser.add_argument("--save_steps", default=500, type=int)
    parser.add_argument("--save_total_limit", default=3, type=int)
    parser.add_argument("--no_cuda", action="store_true")
    # bugfix: was type=str, which made a user-supplied --seed a string.
    parser.add_argument("--seed", default=3407, type=int, help="https://arxiv.org/abs/2109.08203")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--half_precision_backend", default="auto", type=str)
    parser.add_argument("--dataloader_num_workers", default=5, type=int)
    # store_false: tqdm is enabled unless this flag is passed.
    parser.add_argument("--disable_tqdm", action="store_false")
    parser.add_argument("--remove_unused_columns", action="store_false")
    # parser.add_argument("--deepspeed", default="ds_z3_config.json", type=str)
    parser.add_argument("--deepspeed", default=None, type=str)
    parser.add_argument("--optim", default="adamw_hf", type=str)
    parser.add_argument("--report_to", default="tensorboard", type=str)
    parser.add_argument("--resume_from_checkpoint", default="file_dir/serialization_dir/checkpoint-103000", type=str)
    parser.add_argument("--gradient_checkpointing", action="store_true")
    # parser.add_argument("--gradient_checkpointing", action="store_false")

    parser.add_argument("--truncate_longer_samples", action="store_true")
    parser.add_argument("--max_seq_length", default=512, type=int)

    args = parser.parse_args()
    return args
|
69 |
+
|
70 |
+
|
71 |
+
def main():
    """Fine-tune a Bloom causal-LM on a firefly-style JSONL dataset.

    Pipeline: parse args -> load the JSONL dataset -> load tokenizer/model ->
    tokenize every example -> train with the Hugging Face Trainer.
    """
    args = get_args()

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.cache_dir, exist_ok=True)

    # dataset: a single "train" split loaded from the JSONL file(s).
    dataset_dict = DatasetDict()
    train_data_files = [args.train_file]
    dataset_dict["train"] = load_dataset(
        path="json", data_files=[str(file) for file in train_data_files]
    )["train"]
    print(dataset_dict)

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.pretrained_model_name_or_path)
    model = BloomForCausalLM.from_pretrained(args.pretrained_model_name_or_path)

    def encode_with_truncation(examples):
        # Called once per example (map(batched=False)): join the prompt and
        # the answer into the firefly format "<s>{input}</s></s>{target}</s>"
        # and tokenize with truncation at max_seq_length.
        input_ = examples.pop("input")
        target_ = examples.pop("target")
        text = "<s>{input}</s></s>{target}</s>".format(input=input_, target=target_)
        result = tokenizer(
            text,
            truncation=True,
            # padding='max_length',
            max_length=args.max_seq_length,
            return_special_tokens_mask=True
        )
        return result

    train_dataset = dataset_dict["train"].map(
        encode_with_truncation,
        batched=False,
        keep_in_memory=False,
        # num_proc is disabled on Windows — presumably because multiprocess
        # map of this closure fails under spawn; TODO confirm.
        num_proc=None if platform.system() == "Windows" else os.cpu_count(),
        cache_file_name=os.path.join(args.cache_dir, "train.cache")
    )
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    print("Train Dataset Examples Batch Number: {}".format(len(train_dataset)))

    # training: causal-LM collation (mlm=False shifts labels from input_ids).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    # NOTE(review): several parsed options (weight_decay, max_grad_norm,
    # warmup_ratio, save_strategy, optim, dataloader_num_workers, seed,
    # disable_tqdm, remove_unused_columns, truncate_longer_samples) are never
    # forwarded to TrainingArguments, so their CLI values have no effect —
    # confirm whether that is intentional.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=args.overwrite_output_dir,
        evaluation_strategy=args.evaluation_strategy,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        num_train_epochs=args.num_train_epochs,
        max_steps=args.max_steps,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_steps=args.warmup_steps,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        no_cuda=args.no_cuda,
        fp16=args.fp16,
        half_precision_backend=args.half_precision_backend,
        # deepspeed=args.deepspeed,
        report_to=args.report_to,
        resume_from_checkpoint=args.resume_from_checkpoint,
        gradient_checkpointing=args.gradient_checkpointing,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    # NOTE(review): train() is called without resume_from_checkpoint; setting
    # it only on TrainingArguments may not resume — verify against the
    # transformers Trainer.train signature.
    trainer.train()
    return


if __name__ == '__main__':
    main()
|
examples/exercises/firefly_bloom_1b4/2.test_sft_model.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
|
7 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
8 |
+
sys.path.append(os.path.join(pwd, '../../../'))
|
9 |
+
|
10 |
+
import torch
|
11 |
+
from transformers import BloomTokenizerFast, BloomForCausalLM
|
12 |
+
|
13 |
+
from project_settings import project_path
|
14 |
+
|
15 |
+
|
16 |
+
def get_args():
    """Parse CLI options for interactively testing the SFT model.

    Usage:
        python3 2.test_sft_model.py --trained_model_path /data/tianxing/PycharmProjects/Transformers/trained_models/bloom-396m-sft
        python3 2.test_sft_model.py --trained_model_path /data/tianxing/PycharmProjects/Transformers/trained_models/bloom-1b4-sft

    Reference:
        https://huggingface.co/YeungNLP/firefly-bloom-1b4

    Example prompts:
        将下面句子翻译成现代文:\n石中央又生一树,高百余尺,条干偃阴为五色,翠叶如盘,花径尺余,色深碧,蕊深红,异香成烟,著物霏霏。

        实体识别: 1949年10月1日,人们在北京天安门广场参加开国大典。

        把这句话翻译成英文: 1949年10月1日,人们在北京天安门广场参加开国大典。

        晚上睡不着该怎么办. 请给点详细的介绍.

        将下面的句子翻译成文言文:结婚率下降, 离婚率暴增, 生育率下降, 人民焦虑迷茫, 到底是谁的错.

        对联:厌烟沿檐烟燕眼. (污雾舞坞寤梧芜).

        写一首咏雪的古诗, 标题为 "沁园春, 雪".
    """
    arg_parser = argparse.ArgumentParser()
    # default='YeungNLP/bloom-1b4-zh',
    default_model_dir = (project_path / "trained_models/bloom-1b4-sft").as_posix()
    arg_parser.add_argument(
        '--trained_model_path',
        default=default_model_dir,
        type=str,
    )
    arg_parser.add_argument('--device', default='auto', type=str)
    return arg_parser.parse_args()
|
51 |
+
|
52 |
+
|
53 |
+
def main():
    """Interactive REPL: read a prompt from stdin, generate a reply with the
    SFT Bloom model, print it, and loop forever (exit via Ctrl-C/EOF)."""
    args = get_args()

    # Resolve 'auto' to cuda when available, otherwise cpu.
    if args.device == 'auto':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.trained_model_path)
    model = BloomForCausalLM.from_pretrained(args.trained_model_path)

    model.eval()
    model = model.to(device)
    text = input('User:')
    while True:
        # Wrap the user prompt in the firefly training format "<s>{..}</s></s>".
        text = '<s>{}</s></s>'.format(text)
        input_ids = tokenizer(text, return_tensors="pt").input_ids
        input_ids = input_ids.to(device)
        outputs = model.generate(input_ids, max_new_tokens=200, do_sample=True, top_p=0.85, temperature=0.35,
                                 repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id)
        rets = tokenizer.batch_decode(outputs)
        # Remove the echoed prompt and sentence separators from the decoded text.
        output = rets[0].strip().replace(text, "").replace('</s>', "")
        print("LLM:{}".format(output))
        text = input('User:')


if __name__ == '__main__':
    main()
|
examples/exercises/firefly_bloom_1b4/ds_z3_config.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gradient_accumulation_steps": "auto",
|
3 |
+
"gradient_clipping": "auto",
|
4 |
+
"steps_per_print": 200,
|
5 |
+
"train_batch_size": "auto",
|
6 |
+
"train_micro_batch_size_per_gpu": "auto",
|
7 |
+
"wall_clock_breakdown": false,
|
8 |
+
|
9 |
+
"optimizer": {
|
10 |
+
"type": "Adam",
|
11 |
+
"params": {
|
12 |
+
"lr": "auto",
|
13 |
+
"betas": "auto",
|
14 |
+
"eps": "auto",
|
15 |
+
"weight_decay": "auto"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"fp16": {
|
19 |
+
"enabled": "auto",
|
20 |
+
"loss_scale": 0,
|
21 |
+
"loss_scale_window": 1000,
|
22 |
+
"initial_scale_power": 16,
|
23 |
+
"hysteresis": 2,
|
24 |
+
"min_loss_scale": 1
|
25 |
+
},
|
26 |
+
"zero_optimization": {
|
27 |
+
"stage": 3,
|
28 |
+
"overlap_comm": true,
|
29 |
+
"contiguous_gradients": true,
|
30 |
+
"sub_group_size": 1e9,
|
31 |
+
"reduce_bucket_size": "auto",
|
32 |
+
"stage3_prefetch_bucket_size": "auto",
|
33 |
+
"stage3_param_persistence_threshold": "auto",
|
34 |
+
"stage3_max_live_parameters": 1e9,
|
35 |
+
"stage3_max_reuse_distance": 1e9,
|
36 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
37 |
+
},
|
38 |
+
"scheduler": {
|
39 |
+
"type": "WarmupLR",
|
40 |
+
"params": {
|
41 |
+
"warmup_min_lr": "auto",
|
42 |
+
"warmup_max_lr": "auto",
|
43 |
+
"warmup_num_steps": "auto"
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
examples/exercises/firefly_bloom_1b4/run.sh
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
|
3 |
+
# sh run.sh --stage -1 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft
|
4 |
+
# sh run.sh --stage -1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
|
5 |
+
# sh run.sh --stage 1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
|
6 |
+
# sh run.sh --stage 2 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft
|
7 |
+
|
8 |
+
# sh run.sh --stage 1 --stop_stage 1 --system_version windows --pretrained_model_name bloom-1b4-zh
|
9 |
+
|
10 |
+
# params
|
11 |
+
system_version="windows";
|
12 |
+
verbose=true;
|
13 |
+
stage=0 # start from 0 if you need to start from data preparation
|
14 |
+
stop_stage=5
|
15 |
+
pretrained_model_supplier=YeungNLP
|
16 |
+
|
17 |
+
#pretrained_model_name=bloom-396m-zh
|
18 |
+
#pretrained_model_name=bloom-820m-zh
|
19 |
+
pretrained_model_name=bloom-1b4-zh
|
20 |
+
|
21 |
+
final_model_name=final_model_name
|
22 |
+
|
23 |
+
|
24 |
+
patience=0
|
25 |
+
|
26 |
+
|
27 |
+
# parse options
|
28 |
+
# parse options: every "--foo-bar value" pair overwrites the shell variable
# foo_bar, which must already have been declared above with a default.
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      # bugfix: was old_value="(eval echo \$$name)" — a literal string, never
      # "true"/"false", so the boolean check below could not trigger. Use
      # command substitution to read the variable's current value.
      old_value="$(eval echo \\$$name)";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done
|
55 |
+
|
56 |
+
|
57 |
+
$verbose && echo "system_version: ${system_version}"
|
58 |
+
|
59 |
+
work_dir="$(pwd)"
|
60 |
+
file_dir="${work_dir}/file_dir"
|
61 |
+
cache_dir="${file_dir}/cache_dir"
|
62 |
+
serialization_dir="${file_dir}/serialization_dir"
|
63 |
+
|
64 |
+
data_dir="/data/tianxing/PycharmProjects/datasets/firefly_train_1_1m"
|
65 |
+
pretrained_models_dir="${work_dir}/../../../pretrained_models/huggingface/${pretrained_model_supplier}"
|
66 |
+
final_model_dir="${work_dir}/../../../trained_models/${final_model_name}";
|
67 |
+
|
68 |
+
mkdir -p "${file_dir}"
|
69 |
+
mkdir -p "${cache_dir}"
|
70 |
+
mkdir -p "${serialization_dir}"
|
71 |
+
mkdir -p "${data_dir}"
|
72 |
+
mkdir -p "${pretrained_models_dir}"
|
73 |
+
mkdir -p "${final_model_dir}"
|
74 |
+
|
75 |
+
export PYTHONPATH="${work_dir}/../../.."
|
76 |
+
|
77 |
+
if [ $system_version == "windows" ]; then
|
78 |
+
alias python3='C:/Users/tianx/PycharmProjects/virtualenv/Transformers/Scripts/python.exe'
|
79 |
+
elif [ $system_version == "centos" ]; then
|
80 |
+
# conda activate Transformers
|
81 |
+
alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
|
82 |
+
elif [ $system_version == "ubuntu" ]; then
|
83 |
+
# conda activate Transformers
|
84 |
+
alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
|
85 |
+
fi
|
86 |
+
|
87 |
+
|
88 |
+
# Print the name of the newest checkpoint-N directory in ${serialization_dir}
# that is at least $1 (patience) steps older than the latest checkpoint;
# prints an empty string when no checkpoint qualifies.
function search_best_ckpt() {
  patience="$1";

  cd "${serialization_dir}" || exit 1
  # bugfix: grep takes a regex, so "checkpoint-*" also matched "checkpoin";
  # anchor on the literal directory-name prefix instead.
  last_epoch=$(ls . | \
  grep "^checkpoint-" | \
  awk -F'[-]' '{print$2}' | \
  sort -n | \
  awk 'END {print}')

  target_dir=
  if [ -n "${last_epoch}" ]; then
    target_epoch=$((last_epoch - patience))

    # bugfix: removed the stray trailing ':' (a Python-ism) after the command
    # substitution — it was appended to the last word of the list and broke
    # the integer comparison for that checkpoint id.
    for epoch_idx in $(ls . | grep "^checkpoint-" | awk -F'[-]' '{print$2}' | sort -nr)
    do
      if [ "${epoch_idx}" -le "${target_epoch}" ]; then
        target_dir="checkpoint-${epoch_idx}";
        break;
      fi
    done
  fi

  echo "${target_dir}"
}
|
113 |
+
|
114 |
+
|
115 |
+
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
|
116 |
+
$verbose && echo "stage -1: download data"
|
117 |
+
cd "${data_dir}" || exit 1;
|
118 |
+
|
119 |
+
firefly_train_1_1m_size=$(/bin/ls -l firefly-train-1.1M.jsonl | awk '{print $5}')
|
120 |
+
if [ ! -e firefly-train-1.1M.jsonl ] || [ "${firefly_train_1_1m_size}" != "1171119212" ]; then
|
121 |
+
# rm firefly-train-1.1M.jsonl
|
122 |
+
wget -c https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M/resolve/main/firefly-train-1.1M.jsonl
|
123 |
+
fi
|
124 |
+
|
125 |
+
fi
|
126 |
+
|
127 |
+
|
128 |
+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
129 |
+
$verbose && echo "stage 0: download pretrained model"
|
130 |
+
cd "${work_dir}" || exit 1;
|
131 |
+
cd "${pretrained_models_dir}" || exit 1;
|
132 |
+
|
133 |
+
if [ ! -d "${pretrained_model_name}" ]; then
|
134 |
+
git clone "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/"
|
135 |
+
|
136 |
+
cd "${pretrained_models_dir}/${pretrained_model_name}" || exit 1;
|
137 |
+
rm -rf .git
|
138 |
+
rm -rf flax_model.msgpack
|
139 |
+
rm -rf model.safetensors
|
140 |
+
rm -rf pytorch_model.bin
|
141 |
+
rm -rf tokenizer.json
|
142 |
+
|
143 |
+
fi
|
144 |
+
|
145 |
+
cd "${pretrained_models_dir}/${pretrained_model_name}" || exit 1;
|
146 |
+
if [ ! -e pytorch_model.bin ]; then
|
147 |
+
wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/pytorch_model.bin"
|
148 |
+
fi
|
149 |
+
|
150 |
+
if [ ! -e tokenizer.json ]; then
|
151 |
+
wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/tokenizer.json"
|
152 |
+
fi
|
153 |
+
|
154 |
+
fi
|
155 |
+
|
156 |
+
|
157 |
+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
158 |
+
$verbose && echo "stage 1: train model"
|
159 |
+
cd "${work_dir}" || exit 1;
|
160 |
+
target_dir=$(search_best_ckpt "${patience}");
|
161 |
+
|
162 |
+
resume_from_checkpoint=
|
163 |
+
if [ -n "${target_dir}" ]; then
|
164 |
+
resume_from_checkpoint="${serialization_dir}/${target_dir}"
|
165 |
+
echo "resume_from_checkpoint: ${resume_from_checkpoint}"
|
166 |
+
fi
|
167 |
+
|
168 |
+
python3 1.train_model.py \
|
169 |
+
--train_file "${data_dir}/firefly-train-1.1M.jsonl" \
|
170 |
+
--pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}" \
|
171 |
+
--output_dir "${serialization_dir}" \
|
172 |
+
--cache_dir "${cache_dir}" \
|
173 |
+
--fp16 \
|
174 |
+
${resume_from_checkpoint:+--resume_from_checkpoint $resume_from_checkpoint}
|
175 |
+
|
176 |
+
fi
|
177 |
+
|
178 |
+
|
179 |
+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
180 |
+
$verbose && echo "stage 2: collect files"
|
181 |
+
target_dir=$(search_best_ckpt "${patience}");
|
182 |
+
|
183 |
+
cd "${work_dir}" || exit 1;
|
184 |
+
|
185 |
+
cp "${serialization_dir}/${target_dir}/pytorch_model.bin" "${final_model_dir}/pytorch_model.bin"
|
186 |
+
|
187 |
+
cp "${pretrained_models_dir}/${pretrained_model_name}/config.json" "${final_model_dir}/config.json"
|
188 |
+
cp "${pretrained_models_dir}/${pretrained_model_name}/special_tokens_map.json" "${final_model_dir}/special_tokens_map.json"
|
189 |
+
cp "${pretrained_models_dir}/${pretrained_model_name}/tokenizer_config.json" "${final_model_dir}/tokenizer_config.json"
|
190 |
+
cp "${pretrained_models_dir}/${pretrained_model_name}/tokenizer.json" "${final_model_dir}/tokenizer.json"
|
191 |
+
|
192 |
+
fi
|
examples/exercises/firefly_bloom_1b4/stop.sh
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash

# Force-stop a training run started by run.sh:
# first kill any python3 launched from the Transformers virtualenv,
# then kill the run.sh driver script itself.
kill -9 `ps -aef | grep 'Transformers/bin/python3' | grep -v grep | awk '{print $2}' | sed 's/\n/ /'`
kill -9 `ps -aef | grep 'run.sh' | grep -v grep | awk '{print $2}' | sed 's/\n/ /'`
|
main.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import torch
|
7 |
+
from transformers import BloomTokenizerFast, BloomForCausalLM
|
8 |
+
|
9 |
+
from project_settings import project_path
|
10 |
+
|
11 |
+
|
12 |
+
def get_args():
    """Command-line options for the Gradio demo: model directory and device."""
    arg_parser = argparse.ArgumentParser()
    sft_model_dir = (project_path / "trained_models/bloom-1b4-sft").as_posix()
    arg_parser.add_argument(
        '--trained_model_path',
        default=sft_model_dir,
        type=str,
    )
    arg_parser.add_argument('--device', default='auto', type=str)
    return arg_parser.parse_args()
|
23 |
+
|
24 |
+
|
25 |
+
def main():
    """Launch a Gradio demo that serves the SFT Bloom model for Q&A."""
    args = get_args()

    # Resolve 'auto' to cuda when available, otherwise cpu.
    if args.device == 'auto':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.trained_model_path)
    model = BloomForCausalLM.from_pretrained(args.trained_model_path)
    # NOTE(review): the model is never moved to `device` (no model.to(device))
    # although input_ids are moved below — on a CUDA device this looks like a
    # device mismatch; confirm.

    description = """
FireflyBloom1b4

基于 [YeungNLP/bloom-1b4-zh](https://huggingface.co/YeungNLP/bloom-1b4-zh) 预训练模型,
基于 [YeungNLP/firefly-train-1.1M](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) 数据集,
训练的等同于 [YeungNLP/firefly-bloom-1b4](https://huggingface.co/YeungNLP/firefly-bloom-1b4) 的问答模型.

训练代码是自己编写的, 在 examples 里, 总共训练了 3 个 epoch. 感觉效果还可以.

"""

    def fn(text: str,
           max_new_tokens: int = 200,
           top_p: float = 0.85,
           temperature: float = 0.35,
           repetition_penalty: float = 1.2
           ):
        # Gradio callback: wrap the prompt in the firefly "<s>{..}</s></s>"
        # format, generate, and strip the echoed prompt from the decoding.
        print(text)
        text = '<s>{}</s></s>'.format(text)
        input_ids = tokenizer(text, return_tensors="pt").input_ids
        input_ids = input_ids.to(device)
        outputs = model.generate(input_ids,
                                 max_new_tokens=max_new_tokens,
                                 do_sample=True,
                                 top_p=top_p,
                                 temperature=temperature,
                                 repetition_penalty=repetition_penalty,
                                 eos_token_id=tokenizer.eos_token_id
                                 )
        rets = tokenizer.batch_decode(outputs)
        output = rets[0].strip().replace(text, "").replace('</s>', "")
        print(output)
        return output

    # Each example row supplies (text, max_new_tokens, top_p, temperature,
    # repetition_penalty) matching the inputs list above.
    demo = gr.Interface(
        fn=fn,
        inputs=[
            gr.Text(label="text"),
            gr.Number(value=200, label="max_new_tokens"),
            gr.Slider(minimum=0, maximum=1, value=0.85, label="top_p"),
            gr.Slider(minimum=0, maximum=1, value=0.35, label="temperature"),
            gr.Number(value=1.2, label="repetition_penalty"),
        ],
        outputs=[gr.Text(label="output")],
        examples=[
            [
                "将下面句子翻译成现代文:\n石中央又生一树,高百余尺,条干偃阴为五色,翠叶如盘,花径尺余,色深碧,蕊深红,异香成烟,著物霏霏。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "实体识别: 1949年10月1日,人们在北京天安门广场参加开国大典。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "把这句话翻译成英文: 1949年10月1日,人们在北京天安门广场参加开国大典。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "晚上睡不着该怎么办. 请给点详细的介绍.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "将下面的句子翻译成文言文:结婚率下降, 离婚率暴增, 生育率下降, 人民焦虑迷茫, 到底是谁的错.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "对联:厌烟沿檐烟燕眼.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "写一首咏雪的古诗, 标题为 \"沁园春, 雪\".",
                200, 0.85, 0.35, 1.2
            ],
        ],
        examples_per_page=50,
        title="Firefly Bloom 1b4",
        description=description,
    )
    demo.launch()

    return


if __name__ == '__main__':
    main()
|
project_settings.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
from pathlib import Path

from toolbox.os.environment import EnvironmentManager


# Absolute path of the project root (the directory containing this file),
# exposed as a pathlib.Path so callers can join with `project_path / "..."`.
project_path = os.path.abspath(os.path.dirname(__file__))
project_path = Path(project_path)


# Shared environment manager: loads `dotenv/<env>.env`, where <env> is taken
# from the `environment` OS variable and defaults to 'dev'.
environment = EnvironmentManager(
    path=os.path.join(project_path, 'dotenv'),
    env=os.environ.get('environment', 'dev'),
)


if __name__ == '__main__':
    pass
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.20.1
|
2 |
+
pydantic==1.10.12
|
3 |
+
thinc==7.4.6
|
4 |
+
spacy==2.3.9
|
5 |
+
transformers==4.30.2
|
6 |
+
numpy==1.21.4
|
7 |
+
pandas==1.2.5
|
8 |
+
tqdm==4.62.3
|
9 |
+
torch==1.13.0
|
10 |
+
datasets
|
11 |
+
python-dotenv==1.0.0
|
toolbox/json/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/json/misc.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from typing import Callable
|
4 |
+
|
5 |
+
|
6 |
+
def traverse(js, callback: Callable, *args, **kwargs):
    """Recursively walk a JSON-like structure and rebuild it.

    Lists, tuples and dicts are traversed depth-first (dict keys as well as
    values). Every int or str leaf is replaced by ``callback(leaf, *args,
    **kwargs)``; leaves of any other type are returned unchanged.
    """
    if isinstance(js, (list, tuple)):
        mapped = [traverse(item, callback, *args, **kwargs) for item in js]
        return tuple(mapped) if isinstance(js, tuple) else mapped
    if isinstance(js, dict):
        return {
            traverse(key, callback, *args, **kwargs): traverse(value, callback, *args, **kwargs)
            for key, value in js.items()
        }
    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)
    return js
|
32 |
+
|
33 |
+
|
34 |
+
def demo1():
    """Example: strip the leading '$' marker from tagged strings in a nested
    config dict by traversing it with a callback."""
    d = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "NxcloudAI2021!",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["10.20.251.8"],
            "http_auth": ["elastic", "ElasticAI2021!"],
            "port": 9200
        }
    }

    def callback(s):
        # Only strings tagged with a leading '$' are rewritten; every other
        # leaf passes through unchanged.
        if isinstance(s, str) and s.startswith('$'):
            return s[1:]
        return s

    result = traverse(d, callback=callback)
    print(result)
    return


if __name__ == '__main__':
    demo1()
|
toolbox/os/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/os/environment.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from dotenv.main import DotEnv
|
8 |
+
|
9 |
+
from toolbox.json.misc import traverse
|
10 |
+
|
11 |
+
|
12 |
+
class EnvironmentManager(object):
    """Loads a `<env>.env` dotenv file into os.environ and offers typed lookups.

    The dotenv file is resolved as ``<path>/<env>.env`` and loaded once at
    construction time; `get` then reads from os.environ with optional type
    conversion, remembering every value it hands out.
    """

    def __init__(self, path, env, override=False):
        self.filename = os.path.join(path, '{}.env'.format(env))

        load_dotenv(
            dotenv_path=self.filename,
            override=override
        )

        # Cache of every value returned by get(), keyed by variable name.
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """Parse a dotenv file (default: the one from construction) into a
        dict without modifying os.environ."""
        target = filename or self.filename
        dotenv = DotEnv(
            dotenv_path=target,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return dotenv.dict()

    def get(self, key, default=None, dtype=str):
        """Look up *key* in os.environ; convert via *dtype* when present,
        otherwise return *default*. The result is also cached on self."""
        raw = os.environ.get(key)
        result = default if raw is None else dtype(raw)
        self._environ[key] = result
        return result
|
48 |
+
|
49 |
+
|
50 |
+
# Maps the dtype tag used in `$<dtype>:<key>` config strings to the callable
# that converts the raw environment-variable string into the final value.
_DEFAULT_DTYPE_MAP = {
    'int': int,
    'float': float,
    'str': str,
    'json.loads': json.loads
}
|
56 |
+
|
57 |
+
|
58 |
+
class JsonConfig(object):
    """
    Resolve values of the form ``$<dtype>:<key>`` inside a JSON document:
    look up ``key`` in the environment and convert the raw value with the
    converter registered under ``dtype`` (e.g. ``$float:threshold`` reads
    ``threshold`` from the environment and converts it to ``float``).
    """
    def __init__(self, dtype_map: dict = None, environment: 'EnvironmentManager' = None):
        """
        :param dtype_map: maps dtype tags to converter callables; defaults
            to `_DEFAULT_DTYPE_MAP`.
        :param environment: any object exposing ``.get(key)``; defaults to
            ``os.environ``.
        """
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        self.environment = environment or os.environ

    def sanitize_by_filename(self, filename: str):
        """Load a JSON file and resolve all ``$dtype:key`` placeholders."""
        with open(filename, 'r', encoding='utf-8') as f:
            js = json.load(f)

        return self.sanitize_by_json(js)

    def sanitize_by_json(self, js):
        """Walk an already-parsed JSON structure and resolve placeholders."""
        js = traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )
        return js

    def sanitize(self, string, environment):
        """
        Resolve a single value.  Strings starting with ``$`` are parsed as
        ``$<dtype>:<key>`` environment references; any other value is
        returned unchanged.

        :raises AssertionError: if the referenced key is not present in
            ``environment`` (AssertionError kept for backward compatibility
            with existing callers).
        :raises KeyError: if the dtype tag is not in ``self.dtype_map``.
        """
        if isinstance(string, str) and string.startswith('$'):
            # maxsplit=1 so that keys are allowed to contain ':' themselves;
            # the original split(':') raised ValueError on such keys.
            dtype, key = string[1:].split(':', 1)
            dtype = self.dtype_map[dtype]

            value = environment.get(key)
            if value is None:
                raise AssertionError('environment not exist. key: {}'.format(key))

            value = dtype(value)
            result = value
        else:
            result = string
        return result
|
96 |
+
|
97 |
+
|
98 |
+
def demo1():
    """Demo: load the dev dotenv file and read a JSON-typed variable."""
    import json

    from project_settings import project_path

    dotenv_dir = os.path.join(project_path, 'server/callbot_server/dotenv')
    manager = EnvironmentManager(
        path=dotenv_dir,
        env='dev',
    )

    scenes = manager.get(key='init_scenes', dtype=json.loads)
    print(scenes)
    print(manager._environ)
    return
|
111 |
+
|
112 |
+
|
113 |
+
# Script entry point: run the environment-manager demo when executed directly.
if __name__ == '__main__':
    demo1()
|
toolbox/os/other.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import inspect
|
3 |
+
|
4 |
+
|
5 |
+
def pwd():
    """Return the directory of the file from which this function is called."""
    # Frame 0 is this function; frame 1 is the caller.
    caller_frame = inspect.stack()[1][0]
    caller_module = inspect.getmodule(caller_frame)
    file_path = os.path.abspath(caller_module.__file__)
    return os.path.dirname(file_path)
|
trained_models/bloom-1b4-sft/config.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "YeungNLP/bloom-1b4-zh",
|
3 |
+
"apply_residual_connection_post_layernorm": false,
|
4 |
+
"architectures": [
|
5 |
+
"BloomForCausalLM"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"attention_softmax_in_fp32": true,
|
9 |
+
"bias_dropout_fusion": true,
|
10 |
+
"bos_token_id": 1,
|
11 |
+
"eos_token_id": 2,
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"initializer_range": 0.02,
|
14 |
+
"layer_norm_epsilon": 1e-05,
|
15 |
+
"masked_softmax_fusion": true,
|
16 |
+
"model_type": "bloom",
|
17 |
+
"n_embed": 2048,
|
18 |
+
"n_inner": null,
|
19 |
+
"n_layer": 24,
|
20 |
+
"num_attention_heads": 16,
|
21 |
+
"offset_alibi": 100,
|
22 |
+
"pad_token_id": 3,
|
23 |
+
"pretraining_tp": 2,
|
24 |
+
"seq_length": 4096,
|
25 |
+
"skip_bias_add": true,
|
26 |
+
"skip_bias_add_qkv": false,
|
27 |
+
"slow_but_exact": false,
|
28 |
+
"torch_dtype": "float16",
|
29 |
+
"transformers_version": "4.20.0",
|
30 |
+
"unk_token_id": 0,
|
31 |
+
"use_cache": true,
|
32 |
+
"vocab_size": 46145
|
33 |
+
}
|
trained_models/bloom-1b4-sft/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b3d273ca775ee59871273c92c84803369423f045987057c3152b85f92244a6d
|
3 |
+
size 5212546569
|
trained_models/bloom-1b4-sft/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"pad_token": "<pad>",
|
5 |
+
"unk_token": "<unk>"
|
6 |
+
}
|
trained_models/bloom-1b4-sft/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
trained_models/bloom-1b4-sft/tokenizer_config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"bos_token": "<s>",
|
4 |
+
"eos_token": "</s>",
|
5 |
+
"name_or_path": "/Users/jianxin.yang/Desktop/pretrain_models/bloom-6b4-zh",
|
6 |
+
"pad_token": "<pad>",
|
7 |
+
"padding_side": "left",
|
8 |
+
"special_tokens_map_file": null,
|
9 |
+
"tokenizer_class": "BloomTokenizer",
|
10 |
+
"unk_token": "<unk>"
|
11 |
+
}
|