from dataclasses import dataclass
from os.path import dirname, abspath
# on Windows, replace '\' in the path with '/'
PROJECT_ROOT: str = abspath(dirname(__file__)).replace('\\', '/')
# ===================================================================================
# inference config
@dataclass
class InferConfig:
    max_seq_len: int = 320              # maximum length of a generated answer
    mixed_precision: str = "bf16"       # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'

    # full-parameter DPO model; the tokenizer files and model weights are in the same folder
    model_dir: str = PROJECT_ROOT + '/model_save/'

    # model file after merging the LoRA DPO adapter
    # model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.dpo.lora_merged.bin'

    # the following settings are for the API demo:
    api_key: str = ""
    host: str = '127.0.0.1'
    port: int = 8812
    reload: bool = True
    workers: int = 1
    log_level: str = 'info'
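# A minimal sketch (not part of the original file): the API fields above line up with
# uvicorn's serve options; 'api_demo:app' is a hypothetical module path for the FastAPI app.
# Note that uvicorn ignores `workers` when `reload=True`.
#
#   import uvicorn
#
#   uvicorn.run(
#       'api_demo:app',
#       host=InferConfig.host,
#       port=InferConfig.port,
#       reload=InferConfig.reload,
#       workers=InferConfig.workers,
#       log_level=InferConfig.log_level,
#   )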
# ===================================================================================
# DPO training config
@dataclass
class DpoConfig:
    max_seq_len: int = 512 + 8          # 8 for the eos token
    sft_model_file: str = PROJECT_ROOT + '/model_save/'
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually sits in the same folder as the model weights

    dpo_train_file: str = PROJECT_ROOT + '/data/my_dpo_data.json'
    dpo_eval_file: str = PROJECT_ROOT + '/data/my_dpo_eval.json'
    adapter_file: str = PROJECT_ROOT + '/data/dpo/adapter_model.safetensors'
    log_dir: str = PROJECT_ROOT + '/logs/'

    per_device_train_batch_size: int = 4
    num_train_epochs: int = 4
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 20
    save_steps: int = 2000
    output_dir: str = PROJECT_ROOT + '/model_save/dpo'
    warmup_steps: int = 1000
    fp16: bool = True
    seed: int = 23333
    beta: float = 0.1                   # temperature of the DPO loss
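# A rough sketch (not part of the original file): most DpoConfig fields mirror
# transformers.TrainingArguments, and beta is the DPO loss temperature consumed by
# trl's DPOTrainer; a training script might wire them up roughly like this:
#
#   from transformers import TrainingArguments
#
#   args = TrainingArguments(
#       output_dir=DpoConfig.output_dir,
#       per_device_train_batch_size=DpoConfig.per_device_train_batch_size,
#       num_train_epochs=DpoConfig.num_train_epochs,
#       gradient_accumulation_steps=DpoConfig.gradient_accumulation_steps,
#       learning_rate=DpoConfig.learning_rate,
#       logging_first_step=DpoConfig.logging_first_step,
#       logging_steps=DpoConfig.logging_steps,
#       save_steps=DpoConfig.save_steps,
#       warmup_steps=DpoConfig.warmup_steps,
#       fp16=DpoConfig.fp16,
#       seed=DpoConfig.seed,
#   )
#   # beta=DpoConfig.beta would then be passed to trl's DPOTrainer.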
# ===================================================================================
# SFT config
@dataclass
class SFTconfig:
    max_seq_len: int = 384 + 8          # 8 for the eos token
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/'
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually sits in the same folder as the model weights

    sft_train_file: str = PROJECT_ROOT + '/data/sft_train.json'

    batch_size: int = 12
    num_train_epochs: int = 4
    save_steps: int = 5000
    gradient_accumulation_steps: int = 4
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 100
    output_dir: str = PROJECT_ROOT + '/model_save/sft'
    warmup_steps: int = 100
    fp16: bool = True
    seed: int = 23333
# ===================================================================================
# pre-training config
@dataclass
class TrainConfig:
    epochs: int = 8
    batch_size_per_gpu: int = 16

    learn_rate: float = 0.0001                      # peak learning rate is div_factor * learn_rate
    div_factor: int = 50

    mixed_precision: str = "bf16"                   # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'

    # Note: a gradient update effectively sees batch_size * gradient_accumulation_steps samples;
    # in other words, accumulating over n steps behaves like an n-times larger batch size
    # (with the defaults here: 16 * 8 = 128)
    gradient_accumulation_steps: int = 8            # number of gradient accumulation steps

    # warmup steps; warmup samples = warmup_steps * batch_size * gradient_accumulation_steps
    # (with the defaults here: 1024 * 16 * 8 = 131,072)
    warmup_steps: int = 1024

    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually sits in the same folder as the model weights
    model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.{}.bin'
    model_config_file: str = PROJECT_ROOT + '/model_save/model_config.json'
    train_file: str = PROJECT_ROOT + '/data/my_train_dataset.parquet'
    validation_file: str = PROJECT_ROOT + '/data/my_valid_dataset.parquet'
    test_file: str = PROJECT_ROOT + '/data/my_test_dataset.parquet'

    # checkpoint to fine-tune from; only used when the train function is called with is_finetune=True
    # when fine-tuning, remember to freeze some layers or lower the learning rate
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.bin'

    # training state is saved here so that training can resume after an interruption
    train_state_dir: str = PROJECT_ROOT + '/model_save/train_latest_state'
    output_dir: str = PROJECT_ROOT + '/model_save/pretrain'

    logging_steps: int = 50
    save_steps: int = 10000

    # dataset_cache_dir: str = PROJECT_ROOT + '/data/.cache'
    # trainer_log_file: str = PROJECT_ROOT + '/logs/trainer.log'

    keep_latest_n_ckp: int = 8                      # maximum number of best-scoring checkpoints to keep during training
    seed: int = 23333
    dataloader_buffer_size: int = 50000
    max_seq_len: int = 256                          # maximum sequence length (default: 256)
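# A minimal sketch (an assumption, not taken from this file): the learn_rate / div_factor
# pair matches the parameters of torch's OneCycleLR schedule, so the training loop could
# build the scheduler roughly like this (optimizer and total_steps are placeholders):
#
#   from torch.optim.lr_scheduler import OneCycleLR
#
#   scheduler = OneCycleLR(
#       optimizer,
#       max_lr=TrainConfig.learn_rate * TrainConfig.div_factor,  # peak LR
#       div_factor=TrainConfig.div_factor,                       # initial LR = max_lr / div_factor = learn_rate
#       total_steps=total_steps,
#   )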
# ===================================================================================
# model config
@dataclass
class T5ModelConfig:
    d_ff: int = 3072                # feed-forward (fully connected) layer dimension
    d_model: int = 768              # embedding / hidden dimension
    num_heads: int = 12             # number of attention heads; d_model // num_heads == d_kv
    d_kv: int = 64                  # d_model // num_heads (768 // 12 == 64)
    num_decoder_layers: int = 10    # number of Transformer decoder layers
    num_layers: int = 10            # number of Transformer encoder layers
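# A minimal sketch (not part of the original file): these fields line up with the
# constructor arguments of transformers.T5Config, so a model config could be built
# roughly like this (the helper name and vocab_size argument are hypothetical):
#
#   from transformers import T5Config
#
#   def build_t5_config(cfg: T5ModelConfig, vocab_size: int) -> T5Config:
#       return T5Config(
#           vocab_size=vocab_size,
#           d_model=cfg.d_model,
#           d_kv=cfg.d_kv,
#           d_ff=cfg.d_ff,
#           num_layers=cfg.num_layers,
#           num_decoder_layers=cfg.num_decoder_layers,
#           num_heads=cfg.num_heads,
#       )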