qgyd2021 commited on
Commit
f6ff4fa
1 Parent(s): 546fb9a

[20230821121100]

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ .git/
3
+ .idea/
4
+
5
+ **/flagged/
6
+ **/__pycache__/
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 🐠
4
  colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 3.40.1
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
 
4
  colorFrom: red
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.20.1
8
+ app_file: main.py
9
  pinned: false
10
  ---
11
 
examples/exercises/firefly_bloom_1b4/1.train_model.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ import platform
6
+
7
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
8
+
9
+ from datasets import Dataset, DatasetDict, load_dataset
10
+ from transformers.data.data_collator import DataCollatorForLanguageModeling
11
+ from transformers import BloomTokenizerFast, BloomForCausalLM
12
+ from transformers.trainer import Trainer
13
+ from transformers.training_args import TrainingArguments
14
+
15
+
16
def get_args():
    """Parse the command-line hyper-parameters and paths for Bloom SFT training.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_file",
        # default='firefly-train-1.1M.jsonl',
        default="D:/programmer/nlp_datasets/firefly-train-1.1M.jsonl",
        type=str
    )
    parser.add_argument(
        "--pretrained_model_name_or_path",
        # default='YeungNLP/bloom-1b4-zh',
        default="D:/programmer/nlp_pretrained_model/bloom-1b7",
        type=str,
    )
    parser.add_argument("--cache_dir", default="cache_dir", type=str)

    parser.add_argument("--output_dir", default="serialization_dir", type=str)
    parser.add_argument("--overwrite_output_dir", action="store_true")
    parser.add_argument("--evaluation_strategy", default="no", choices=["no", "steps", "epoch"], type=str)
    parser.add_argument("--per_device_train_batch_size", default=4, type=int)
    parser.add_argument("--gradient_accumulation_steps", default=4, type=int)
    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--weight_decay", default=0, type=float)
    parser.add_argument("--max_grad_norm", default=1.0, type=float)
    parser.add_argument("--num_train_epochs", default=3.0, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--lr_scheduler_type", default="cosine", type=str)
    parser.add_argument("--warmup_ratio", default=0.0, type=float)
    parser.add_argument("--warmup_steps", default=3000, type=int)
    parser.add_argument("--logging_steps", default=300, type=int)
    parser.add_argument("--save_strategy", default="steps", type=str)
    parser.add_argument("--save_steps", default=500, type=int)
    parser.add_argument("--save_total_limit", default=3, type=int)
    parser.add_argument("--no_cuda", action="store_true")
    # BUG FIX: the seed was declared type=str; a random seed must be an int
    # (TrainingArguments validates/uses it as an integer).
    parser.add_argument("--seed", default=3407, type=int, help="https://arxiv.org/abs/2109.08203")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--half_precision_backend", default="auto", type=str)
    parser.add_argument("--dataloader_num_workers", default=5, type=int)
    parser.add_argument("--disable_tqdm", action="store_false")
    parser.add_argument("--remove_unused_columns", action="store_false")
    # parser.add_argument("--deepspeed", default="ds_z3_config.json", type=str)
    parser.add_argument("--deepspeed", default=None, type=str)
    parser.add_argument("--optim", default="adamw_hf", type=str)
    parser.add_argument("--report_to", default="tensorboard", type=str)
    parser.add_argument("--resume_from_checkpoint", default="file_dir/serialization_dir/checkpoint-103000", type=str)
    parser.add_argument("--gradient_checkpointing", action="store_true")
    # parser.add_argument("--gradient_checkpointing", action="store_false")

    parser.add_argument("--truncate_longer_samples", action="store_true")
    parser.add_argument("--max_seq_length", default=512, type=int)

    args = parser.parse_args()
    return args
69
+
70
+
71
def main():
    """Run supervised fine-tuning of a Bloom causal-LM on a firefly-style
    jsonl dataset (records with "input" and "target" fields)."""
    args = get_args()

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.cache_dir, exist_ok=True)

    # dataset
    dataset_dict = DatasetDict()
    train_data_files = [args.train_file]
    dataset_dict["train"] = load_dataset(
        path="json", data_files=[str(file) for file in train_data_files]
    )["train"]
    print(dataset_dict)

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.pretrained_model_name_or_path)
    model = BloomForCausalLM.from_pretrained(args.pretrained_model_name_or_path)

    def encode_with_truncation(examples):
        """Build the `<s>{input}</s></s>{target}</s>` prompt and tokenize it,
        truncating to max_seq_length."""
        input_ = examples.pop("input")
        target_ = examples.pop("target")
        text = "<s>{input}</s></s>{target}</s>".format(input=input_, target=target_)
        result = tokenizer(
            text,
            truncation=True,
            # padding='max_length',
            max_length=args.max_seq_length,
            return_special_tokens_mask=True
        )
        return result

    # num_proc with multiprocessing is unreliable on Windows; run single-process there.
    train_dataset = dataset_dict["train"].map(
        encode_with_truncation,
        batched=False,
        keep_in_memory=False,
        num_proc=None if platform.system() == "Windows" else os.cpu_count(),
        cache_file_name=os.path.join(args.cache_dir, "train.cache")
    )
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    print("Train Dataset Examples Batch Number: {}".format(len(train_dataset)))

    # training: causal-LM objective, so mlm=False.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=args.overwrite_output_dir,
        evaluation_strategy=args.evaluation_strategy,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        # BUG FIX: the options below were parsed by get_args() but never
        # forwarded, so their CLI values were silently ignored.
        weight_decay=args.weight_decay,
        max_grad_norm=args.max_grad_norm,
        num_train_epochs=args.num_train_epochs,
        max_steps=args.max_steps,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_ratio=args.warmup_ratio,
        warmup_steps=args.warmup_steps,
        logging_steps=args.logging_steps,
        save_strategy=args.save_strategy,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        no_cuda=args.no_cuda,
        # int() keeps this working even if the seed arrives as a string.
        seed=int(args.seed),
        fp16=args.fp16,
        half_precision_backend=args.half_precision_backend,
        dataloader_num_workers=args.dataloader_num_workers,
        optim=args.optim,
        # deepspeed=args.deepspeed,
        report_to=args.report_to,
        resume_from_checkpoint=args.resume_from_checkpoint,
        gradient_checkpointing=args.gradient_checkpointing,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    return
147
+
148
+
149
+ if __name__ == '__main__':
150
+ main()
examples/exercises/firefly_bloom_1b4/2.test_sft_model.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+ pwd = os.path.abspath(os.path.dirname(__file__))
8
+ sys.path.append(os.path.join(pwd, '../../../'))
9
+
10
+ import torch
11
+ from transformers import BloomTokenizerFast, BloomForCausalLM
12
+
13
+ from project_settings import project_path
14
+
15
+
16
def get_args():
    """Parse CLI options for interactively testing the SFT model.

    Usage:
        python3 2.test_sft_model.py --trained_model_path /data/tianxing/PycharmProjects/Transformers/trained_models/bloom-396m-sft
        python3 2.test_sft_model.py --trained_model_path /data/tianxing/PycharmProjects/Transformers/trained_models/bloom-1b4-sft

    Reference:
        https://huggingface.co/YeungNLP/firefly-bloom-1b4

    Example prompts:
        将下面句子翻译成现代文:\n石中央又生一树,高百余尺,条干偃阴为五色,翠叶如盘,花径尺余,色深碧,蕊深红,异香成烟,著物霏霏。

        实体识别: 1949年10月1日,人们在北京天安门广场参加开国大典。

        把这句话翻译成英文: 1949年10月1日,人们在北京天安门广场参加开国大典。

        晚上睡不着该怎么办. 请给点详细的介绍.

        将下面的句子翻译成文言文:结婚率下降, 离婚率暴增, 生育率下降, 人民焦虑迷茫, 到底是谁的错.

        对联:厌烟沿檐烟燕眼. (污雾舞坞寤梧芜).

        写一首咏雪的古诗, 标题为 "沁园春, 雪".

    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--trained_model_path',
        # default='YeungNLP/bloom-1b4-zh',
        default=(project_path / "trained_models/bloom-1b4-sft").as_posix(),
        type=str,
    )
    parser.add_argument('--device', default='auto', type=str)
    return parser.parse_args()
51
+
52
+
53
def main():
    """Interactive REPL: read a prompt from stdin, generate a reply with the
    SFT model, print it, and repeat forever (interrupt with Ctrl-C)."""
    args = get_args()

    if args.device == 'auto':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.trained_model_path)
    model = BloomForCausalLM.from_pretrained(args.trained_model_path)

    model.eval()
    model = model.to(device)
    text = input('User:')
    while True:
        # Wrap the prompt the same way the training samples were built.
        text = '<s>{}</s></s>'.format(text)
        input_ids = tokenizer(text, return_tensors="pt").input_ids
        input_ids = input_ids.to(device)
        # BUG FIX: generation previously ran with autograd enabled, needlessly
        # tracking gradients and wasting memory during pure inference.
        with torch.no_grad():
            outputs = model.generate(input_ids, max_new_tokens=200, do_sample=True, top_p=0.85, temperature=0.35,
                                     repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id)
        rets = tokenizer.batch_decode(outputs)
        # Strip the echoed prompt and the special tokens from the decoded text.
        output = rets[0].strip().replace(text, "").replace('</s>', "")
        print("LLM:{}".format(output))
        text = input('User:')
78
+
79
+
80
+ if __name__ == '__main__':
81
+ main()
examples/exercises/firefly_bloom_1b4/ds_z3_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gradient_accumulation_steps": "auto",
3
+ "gradient_clipping": "auto",
4
+ "steps_per_print": 200,
5
+ "train_batch_size": "auto",
6
+ "train_micro_batch_size_per_gpu": "auto",
7
+ "wall_clock_breakdown": false,
8
+
9
+ "optimizer": {
10
+ "type": "Adam",
11
+ "params": {
12
+ "lr": "auto",
13
+ "betas": "auto",
14
+ "eps": "auto",
15
+ "weight_decay": "auto"
16
+ }
17
+ },
18
+ "fp16": {
19
+ "enabled": "auto",
20
+ "loss_scale": 0,
21
+ "loss_scale_window": 1000,
22
+ "initial_scale_power": 16,
23
+ "hysteresis": 2,
24
+ "min_loss_scale": 1
25
+ },
26
+ "zero_optimization": {
27
+ "stage": 3,
28
+ "overlap_comm": true,
29
+ "contiguous_gradients": true,
30
+ "sub_group_size": 1e9,
31
+ "reduce_bucket_size": "auto",
32
+ "stage3_prefetch_bucket_size": "auto",
33
+ "stage3_param_persistence_threshold": "auto",
34
+ "stage3_max_live_parameters": 1e9,
35
+ "stage3_max_reuse_distance": 1e9,
36
+ "stage3_gather_16bit_weights_on_model_save": true
37
+ },
38
+ "scheduler": {
39
+ "type": "WarmupLR",
40
+ "params": {
41
+ "warmup_min_lr": "auto",
42
+ "warmup_max_lr": "auto",
43
+ "warmup_num_steps": "auto"
44
+ }
45
+ }
46
+ }
examples/exercises/firefly_bloom_1b4/run.sh ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash

# Usage examples:
# sh run.sh --stage -1 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft
# sh run.sh --stage -1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
# sh run.sh --stage 1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
# sh run.sh --stage 2 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft

# sh run.sh --stage 1 --stop_stage 1 --system_version windows --pretrained_model_name bloom-1b4-zh

# params (all of these can be overridden from the command line, e.g. --stage 1)
system_version="windows";
verbose=true;
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
pretrained_model_supplier=YeungNLP

#pretrained_model_name=bloom-396m-zh
#pretrained_model_name=bloom-820m-zh
pretrained_model_name=bloom-1b4-zh

# Name of the directory the finished model is collected into (stage 2).
final_model_name=final_model_name


# How many of the most recent checkpoints to skip when picking one to resume from.
patience=0
27
+ # parse options
28
# parse options: every `--foo bar` pair sets shell variable `foo=bar`;
# the variable must already be declared above, and boolean variables only
# accept the literal values "true" / "false".
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options that do not correspond to a declared variable.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      # BUG FIX: this used to be `old_value="(eval echo \$$name)"`, which
      # assigns the literal string "(eval echo $name)" instead of running a
      # command substitution, so the boolean check below never triggered.
      old_value="$(eval echo \"\$$name\")";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done
55
+
56
+
57
$verbose && echo "system_version: ${system_version}"

# Working directories are all created under the current directory.
work_dir="$(pwd)"
file_dir="${work_dir}/file_dir"
cache_dir="${file_dir}/cache_dir"
serialization_dir="${file_dir}/serialization_dir"

# NOTE(review): absolute, machine-specific path — confirm before running on another host.
data_dir="/data/tianxing/PycharmProjects/datasets/firefly_train_1_1m"
pretrained_models_dir="${work_dir}/../../../pretrained_models/huggingface/${pretrained_model_supplier}"
final_model_dir="${work_dir}/../../../trained_models/${final_model_name}";

# Create everything up front so later stages can assume the directories exist.
mkdir -p "${file_dir}"
mkdir -p "${cache_dir}"
mkdir -p "${serialization_dir}"
mkdir -p "${data_dir}"
mkdir -p "${pretrained_models_dir}"
mkdir -p "${final_model_dir}"

# Make the repository root importable from the python scripts.
export PYTHONPATH="${work_dir}/../../.."

# Select the python interpreter for the current host.
# NOTE(review): non-interactive shells generally do not expand aliases
# (bash requires `shopt -s expand_aliases`) — verify these take effect
# under the shell this script is actually run with.
if [ $system_version == "windows" ]; then
  alias python3='C:/Users/tianx/PycharmProjects/virtualenv/Transformers/Scripts/python.exe'
elif [ $system_version == "centos" ]; then
  # conda activate Transformers
  alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
elif [ $system_version == "ubuntu" ]; then
  # conda activate Transformers
  alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
fi
86
+
87
+
88
# Print the name of the newest checkpoint directory in ${serialization_dir},
# skipping the `patience` most recent step counts; prints nothing when no
# checkpoint is old enough.
function search_best_ckpt() {
  patience="$1";

  cd "${serialization_dir}" || exit 1
  last_epoch=$(ls . | \
               grep "checkpoint-*" | \
               awk -F'[-]' '{print$2}' | \
               sort -n | \
               awk 'END {print}')

  target_dir=
  if [ -n "${last_epoch}" ]; then
    target_epoch=$((last_epoch - patience))

    # BUG FIX: the original line ended in a stray python-style ':' outside the
    # command substitution, which got appended to the last generated word and
    # broke the numeric -le comparison for that candidate.
    for epoch_idx in $(ls . | grep "checkpoint-*" | awk -F'[-]' '{print$2}' | sort -nr)
    do
      if [ "${epoch_idx}" -le "${target_epoch}" ]; then
        target_dir="checkpoint-${epoch_idx}";
        break;
      fi
    done
  fi

  echo "${target_dir}"
}
113
+
114
+
115
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  $verbose && echo "stage -1: download data"
  cd "${data_dir}" || exit 1;

  # Re-download when the file is missing or has the wrong size (a truncated
  # earlier download); wget -c resumes a partial file.
  # BUG FIX: redirect ls stderr — it used to print an error when the file
  # did not exist yet.
  firefly_train_1_1m_size=$(/bin/ls -l firefly-train-1.1M.jsonl 2>/dev/null | awk '{print $5}')
  if [ ! -e firefly-train-1.1M.jsonl ] || [ "${firefly_train_1_1m_size}" != "1171119212" ]; then
    # rm firefly-train-1.1M.jsonl
    wget -c https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M/resolve/main/firefly-train-1.1M.jsonl
  fi

fi


if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  $verbose && echo "stage 0: download pretrained model"
  cd "${work_dir}" || exit 1;
  cd "${pretrained_models_dir}" || exit 1;

  if [ ! -d "${pretrained_model_name}" ]; then
    git clone "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/"

    # Drop git metadata and the large weight files; the weights and tokenizer
    # are re-fetched explicitly below (wget -c can resume them).
    cd "${pretrained_models_dir}/${pretrained_model_name}" || exit 1;
    rm -rf .git
    rm -rf flax_model.msgpack
    rm -rf model.safetensors
    rm -rf pytorch_model.bin
    rm -rf tokenizer.json

  fi

  cd "${pretrained_models_dir}/${pretrained_model_name}" || exit 1;
  if [ ! -e pytorch_model.bin ]; then
    wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/pytorch_model.bin"
  fi

  if [ ! -e tokenizer.json ]; then
    wget -c "https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}/resolve/main/tokenizer.json"
  fi

fi


if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: train model"
  cd "${work_dir}" || exit 1;
  target_dir=$(search_best_ckpt "${patience}");

  # Resume from the best existing checkpoint, if any.
  resume_from_checkpoint=
  if [ -n "${target_dir}" ]; then
    resume_from_checkpoint="${serialization_dir}/${target_dir}"
    echo "resume_from_checkpoint: ${resume_from_checkpoint}"
  fi

  python3 1.train_model.py \
  --train_file "${data_dir}/firefly-train-1.1M.jsonl" \
  --pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}" \
  --output_dir "${serialization_dir}" \
  --cache_dir "${cache_dir}" \
  --fp16 \
  ${resume_from_checkpoint:+--resume_from_checkpoint $resume_from_checkpoint}

fi


if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  $verbose && echo "stage 2: collect files"
  target_dir=$(search_best_ckpt "${patience}");

  # BUG FIX (robustness): without this guard an empty target_dir made the cp
  # below silently copy from "${serialization_dir}//pytorch_model.bin".
  if [ -z "${target_dir}" ]; then
    echo "$0: no checkpoint found in ${serialization_dir}" 1>&2
    exit 1;
  fi

  cd "${work_dir}" || exit 1;

  cp "${serialization_dir}/${target_dir}/pytorch_model.bin" "${final_model_dir}/pytorch_model.bin"

  cp "${pretrained_models_dir}/${pretrained_model_name}/config.json" "${final_model_dir}/config.json"
  cp "${pretrained_models_dir}/${pretrained_model_name}/special_tokens_map.json" "${final_model_dir}/special_tokens_map.json"
  cp "${pretrained_models_dir}/${pretrained_model_name}/tokenizer_config.json" "${final_model_dir}/tokenizer_config.json"
  cp "${pretrained_models_dir}/${pretrained_model_name}/tokenizer.json" "${final_model_dir}/tokenizer.json"

fi
examples/exercises/firefly_bloom_1b4/stop.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env bash

# Force-kill the training python processes and the run.sh driver.
# NOTE(review): pattern-based killing is indiscriminate — it kills every
# process whose command line matches, not just ours.
# BUG FIX: dropped the no-op `sed 's/\n/ /'` (sed is line-wise, the pattern
# space never contains \n) and used `xargs -r` so `kill` is not invoked with
# an empty argument list when nothing matches.
ps -aef | grep 'Transformers/bin/python3' | grep -v grep | awk '{print $2}' | xargs -r kill -9
ps -aef | grep 'run.sh' | grep -v grep | awk '{print $2}' | xargs -r kill -9
main.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import gradio as gr
6
+ import torch
7
+ from transformers import BloomTokenizerFast, BloomForCausalLM
8
+
9
+ from project_settings import project_path
10
+
11
+
12
def get_args():
    """Parse CLI options for the Gradio demo: model directory and device."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--trained_model_path',
        default=(project_path / "trained_models/bloom-1b4-sft").as_posix(),
        type=str,
    )
    parser.add_argument('--device', default='auto', type=str)
    return parser.parse_args()
23
+
24
+
25
def main():
    """Launch a Gradio demo that serves the fine-tuned Bloom SFT model."""
    args = get_args()

    if args.device == 'auto':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = args.device

    # pretrained model
    tokenizer = BloomTokenizerFast.from_pretrained(args.trained_model_path)
    model = BloomForCausalLM.from_pretrained(args.trained_model_path)
    # BUG FIX: the model was never moved to the selected device, so generation
    # crashed with a device mismatch whenever CUDA was available; it was also
    # never switched out of training mode.
    model = model.to(device)
    model.eval()

    description = """
    FireflyBloom1b4

    基于 [YeungNLP/bloom-1b4-zh](https://huggingface.co/YeungNLP/bloom-1b4-zh) 预训练模型,
    基于 [YeungNLP/firefly-train-1.1M](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) 数据集,
    训练的等同于 [YeungNLP/firefly-bloom-1b4](https://huggingface.co/YeungNLP/firefly-bloom-1b4) 的问答模型.

    训练代码是自己编写的, 在 examples 里, 总共训练了 3 个 epoch. 感觉效果还可以.

    """

    def fn(text: str,
           max_new_tokens: int = 200,
           top_p: float = 0.85,
           temperature: float = 0.35,
           repetition_penalty: float = 1.2
           ):
        """Generate one reply for `text` with the sampling settings from the UI."""
        print(text)
        # Wrap the prompt the same way the training samples were built.
        text = '<s>{}</s></s>'.format(text)
        input_ids = tokenizer(text, return_tensors="pt").input_ids
        input_ids = input_ids.to(device)
        # BUG FIX: inference only — disable autograd bookkeeping.
        with torch.no_grad():
            outputs = model.generate(input_ids,
                                     max_new_tokens=max_new_tokens,
                                     do_sample=True,
                                     top_p=top_p,
                                     temperature=temperature,
                                     repetition_penalty=repetition_penalty,
                                     eos_token_id=tokenizer.eos_token_id
                                     )
        rets = tokenizer.batch_decode(outputs)
        # Strip the echoed prompt and the special tokens from the decoded text.
        output = rets[0].strip().replace(text, "").replace('</s>', "")
        print(output)
        return output

    demo = gr.Interface(
        fn=fn,
        inputs=[
            gr.Text(label="text"),
            gr.Number(value=200, label="max_new_tokens"),
            gr.Slider(minimum=0, maximum=1, value=0.85, label="top_p"),
            gr.Slider(minimum=0, maximum=1, value=0.35, label="temperature"),
            gr.Number(value=1.2, label="repetition_penalty"),
        ],
        outputs=[gr.Text(label="output")],
        examples=[
            [
                "将下面句子翻译成现代文:\n石中央又生一树,高百余尺,条干偃阴为五色,翠叶如盘,花径尺余,色深碧,蕊深红,异香成烟,著物霏霏。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "实体识别: 1949年10月1日,人们在北京天安门广场参加开国大典。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "把这句话翻译成英文: 1949年10月1日,人们在北京天安门广场参加开国大典。",
                200, 0.85, 0.35, 1.2
            ],
            [
                "晚上睡不着该怎么办. 请给点详细的介绍.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "将下面的句子翻译成文言文:结婚率下降, 离婚率暴增, 生育率下降, 人民焦虑迷茫, 到底是谁的错.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "对联:厌烟沿檐烟燕眼.",
                200, 0.85, 0.35, 1.2
            ],
            [
                "写一首咏雪的古诗, 标题为 \"沁园春, 雪\".",
                200, 0.85, 0.35, 1.2
            ],
        ],
        examples_per_page=50,
        title="Firefly Bloom 1b4",
        description=description,
    )
    demo.launch()

    return
118
+
119
+
120
+ if __name__ == '__main__':
121
+ main()
project_settings.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Project-wide settings: repository root path and the dotenv-backed
environment variable manager."""
import os
from pathlib import Path

from toolbox.os.environment import EnvironmentManager


# Absolute path of the directory containing this file (the repository root).
project_path = Path(os.path.abspath(os.path.dirname(__file__)))


# Loads `dotenv/<env>.env`; the environment name comes from the `environment`
# OS variable and defaults to 'dev'.
environment = EnvironmentManager(
    path=os.path.join(project_path, 'dotenv'),
    env=os.environ.get('environment', 'dev'),
)


if __name__ == '__main__':
    pass
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.20.1
2
+ pydantic==1.10.12
3
+ thinc==7.4.6
4
+ spacy==2.3.9
5
+ transformers==4.30.2
6
+ numpy==1.21.4
7
+ pandas==1.2.5
8
+ tqdm==4.62.3
9
+ torch==1.13.0
10
+ datasets
11
+ python-dotenv==1.0.0
toolbox/json/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/json/misc.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Callable
4
+
5
+
6
def traverse(js, callback: Callable, *args, **kwargs):
    """Recursively walk a JSON-like structure and apply `callback` to every
    int and str leaf (dict keys included).

    Lists, tuples and dicts are rebuilt with transformed contents; leaves of
    any other type are returned untouched.
    """
    if isinstance(js, list):
        return [traverse(item, callback, *args, **kwargs) for item in js]
    if isinstance(js, tuple):
        return tuple(traverse(item, callback, *args, **kwargs) for item in js)
    if isinstance(js, dict):
        return {
            traverse(key, callback, *args, **kwargs): traverse(value, callback, *args, **kwargs)
            for key, value in js.items()
        }
    if isinstance(js, (int, str)):
        return callback(js, *args, **kwargs)
    return js
32
+
33
+
34
def demo1():
    """Show traverse() stripping the leading '$' placeholder marker from
    string values in a nested config dict."""
    config = {
        "env": "ppe",
        "mysql_connect": {
            "host": "$mysql_connect_host",
            "port": 3306,
            "user": "callbot",
            "password": "NxcloudAI2021!",
            "database": "callbot_ppe",
            "charset": "utf8"
        },
        "es_connect": {
            "hosts": ["10.20.251.8"],
            "http_auth": ["elastic", "ElasticAI2021!"],
            "port": 9200
        }
    }

    def strip_placeholder_marker(value):
        # Strings starting with '$' are placeholders; drop the marker.
        if isinstance(value, str) and value.startswith('$'):
            return value[1:]
        return value

    print(traverse(config, callback=strip_placeholder_marker))
    return
60
+
61
+
62
+ if __name__ == '__main__':
63
+ demo1()
toolbox/os/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/os/environment.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import json
4
+ import os
5
+
6
+ from dotenv import load_dotenv
7
+ from dotenv.main import DotEnv
8
+
9
+ from toolbox.json.misc import traverse
10
+
11
+
12
class EnvironmentManager(object):
    """Loads `<path>/<env>.env` into os.environ via python-dotenv and provides
    typed access to environment variables."""

    def __init__(self, path, env, override=False):
        # e.g. path='dotenv', env='dev' -> 'dotenv/dev.env'
        self.filename = os.path.join(path, '{}.env'.format(env))

        load_dotenv(
            dotenv_path=self.filename,
            override=override
        )

        # Records every key looked up through get(), for later inspection.
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """Parse a dotenv file (default: the one given at init) into a dict
        without modifying os.environ."""
        dotenv = DotEnv(
            dotenv_path=filename or self.filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return dotenv.dict()

    def get(self, key, default=None, dtype=str):
        """Read `key` from os.environ coerced with `dtype`; fall back to
        `default` (uncoerced) when the variable is absent."""
        raw = os.environ.get(key)
        result = default if raw is None else dtype(raw)
        self._environ[key] = result
        return result
48
+
49
+
50
# Maps the dtype tag used in `$<dtype>:<key>` placeholders (see JsonConfig)
# to the callable that casts the looked-up string value.
_DEFAULT_DTYPE_MAP = {
    'int': int,
    'float': float,
    'str': str,
    'json.loads': json.loads
}
56
+
57
+
58
class JsonConfig(object):
    """Resolve `$<dtype>:<key>` placeholders in a JSON document: look up
    `key` in the environment and cast the value with the mapped dtype
    (e.g. `$float:threshold` -> float(environment['threshold']))."""

    def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None):
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        self.environment = environment or os.environ

    def sanitize_by_filename(self, filename: str):
        """Load a JSON file and resolve every placeholder in it."""
        with open(filename, 'r', encoding='utf-8') as f:
            document = json.load(f)

        return self.sanitize_by_json(document)

    def sanitize_by_json(self, js):
        """Resolve placeholders throughout an already-parsed JSON structure."""
        return traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )

    def sanitize(self, string, environment):
        """Resolve one value: placeholders are looked up and cast, anything
        else is passed through unchanged."""
        if not (isinstance(string, str) and string.startswith('$')):
            return string

        dtype_name, key = string[1:].split(':')
        dtype = self.dtype_map[dtype_name]

        value = environment.get(key)
        if value is None:
            raise AssertionError('environment not exist. key: {}'.format(key))

        return dtype(value)
96
+
97
+
98
def demo1():
    """Load the dev dotenv and print a JSON-typed environment lookup."""
    import json

    from project_settings import project_path

    env_manager = EnvironmentManager(
        path=os.path.join(project_path, 'server/callbot_server/dotenv'),
        env='dev',
    )
    scenes = env_manager.get(key='init_scenes', dtype=json.loads)
    print(scenes)
    print(env_manager._environ)
    return
111
+
112
+
113
+ if __name__ == '__main__':
114
+ demo1()
toolbox/os/other.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
import os
import inspect


def pwd():
    """Return the directory of the file whose code called this function."""
    caller_frame = inspect.stack()[1]
    caller_module = inspect.getmodule(caller_frame[0])
    return os.path.dirname(os.path.abspath(caller_module.__file__))
trained_models/bloom-1b4-sft/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "YeungNLP/bloom-1b4-zh",
3
+ "apply_residual_connection_post_layernorm": false,
4
+ "architectures": [
5
+ "BloomForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "attention_softmax_in_fp32": true,
9
+ "bias_dropout_fusion": true,
10
+ "bos_token_id": 1,
11
+ "eos_token_id": 2,
12
+ "hidden_dropout": 0.0,
13
+ "initializer_range": 0.02,
14
+ "layer_norm_epsilon": 1e-05,
15
+ "masked_softmax_fusion": true,
16
+ "model_type": "bloom",
17
+ "n_embed": 2048,
18
+ "n_inner": null,
19
+ "n_layer": 24,
20
+ "num_attention_heads": 16,
21
+ "offset_alibi": 100,
22
+ "pad_token_id": 3,
23
+ "pretraining_tp": 2,
24
+ "seq_length": 4096,
25
+ "skip_bias_add": true,
26
+ "skip_bias_add_qkv": false,
27
+ "slow_but_exact": false,
28
+ "torch_dtype": "float16",
29
+ "transformers_version": "4.20.0",
30
+ "unk_token_id": 0,
31
+ "use_cache": true,
32
+ "vocab_size": 46145
33
+ }
trained_models/bloom-1b4-sft/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b3d273ca775ee59871273c92c84803369423f045987057c3152b85f92244a6d
3
+ size 5212546569
trained_models/bloom-1b4-sft/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
trained_models/bloom-1b4-sft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
trained_models/bloom-1b4-sft/tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "name_or_path": "/Users/jianxin.yang/Desktop/pretrain_models/bloom-6b4-zh",
6
+ "pad_token": "<pad>",
7
+ "padding_side": "left",
8
+ "special_tokens_map_file": null,
9
+ "tokenizer_class": "BloomTokenizer",
10
+ "unk_token": "<unk>"
11
+ }