from dataclasses import dataclass
from os.path import dirname, abspath

# on Windows, replace '\' in the path with '/'
PROJECT_ROOT: str = abspath(dirname(__file__)).replace('\\', '/')


# ===================================================================================
# Inference configuration
@dataclass
class InferConfig:
    max_seq_len: int = 320                          # maximum length of the generated answer
    mixed_precision: str = "bf16"                   # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'

    # full DPO model: tokenizer files and model weights live in the same folder
    model_dir: str = PROJECT_ROOT + '/model_save/'

    # model file after merging the LoRA DPO adapter
    # model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.dpo.lora_merged.bin'

    # this config is for the API demo:
    api_key: str = ""
    host: str = '127.0.0.1'
    port: int = 8812
    reload: bool = True
    workers: int = 1
    log_level: str = 'info'


# ===================================================================================
# DPO training configuration
@dataclass
class DpoConfig:
    max_seq_len: int = 512 + 8                  # 8 for eos token
    sft_model_file: str = PROJECT_ROOT + '/model_save/'
    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually lives in the same folder as the model weights

    dpo_train_file: str = PROJECT_ROOT + '/data/my_dpo_data.json'
    dpo_eval_file: str = PROJECT_ROOT + '/data/my_dpo_eval.json'

    adapter_file: str = PROJECT_ROOT + '/data/dpo/adapter_model.safetensors'
    log_dir: str = PROJECT_ROOT + '/logs/'

    per_device_train_batch_size: int = 4
    num_train_epochs: int = 4
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 20
    save_steps: int = 2000
    output_dir: str = PROJECT_ROOT + '/model_save/dpo'
    warmup_steps: int = 1000
    fp16: bool = True
    seed: int = 23333
    beta: float = 0.1


# ===================================================================================
# SFT configuration
@dataclass
class SFTconfig:
    max_seq_len: int = 384 + 8                  # 8 for eos token
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/'

    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually lives in the same folder as the model weights
    sft_train_file: str = PROJECT_ROOT + '/data/sft_train.json'

    batch_size: int = 12
    num_train_epochs: int = 4
    save_steps: int = 5000
    gradient_accumulation_steps: int = 4
    learning_rate: float = 1e-5
    logging_first_step: bool = True
    logging_steps: int = 100
    output_dir: str = PROJECT_ROOT + '/model_save/sft'
    warmup_steps: int = 100
    fp16: bool = True
    seed: int = 23333


# ===================================================================================
# Pre-training configuration
@dataclass
class TrainConfig:
    epochs: int = 8
    batch_size_per_gpu: int = 16

    learn_rate: float = 0.0001                  # maximum learning rate is div_factor * learn_rate
    div_factor: int = 50

    mixed_precision: str = "bf16"               # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'

    # Note: the effective batch size is batch_size * gradient_accumulation_steps,
    # i.e. accumulating gradients over n > 1 steps is equivalent to an n-times larger batch size.
    gradient_accumulation_steps: int = 8        # number of gradient accumulation steps

    warmup_steps: int = 1024                    # warm-up steps; warm-up samples = warmup_steps * batch_size * gradient_accumulation_steps

    tokenizer_dir: str = PROJECT_ROOT + '/model_save/'  # tokenizer usually lives in the same folder as the model weights
    model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.{}.bin'
    model_config_file: str = PROJECT_ROOT + '/model_save/model_config.json'
    train_file: str = PROJECT_ROOT + '/data/my_train_dataset.parquet'
    validation_file: str = PROJECT_ROOT + '/data/my_valid_dataset.parquet'
    test_file: str = PROJECT_ROOT + '/data/my_test_dataset.parquet'

    # checkpoint to start fine-tuning from; only takes effect when the train function is called with is_finetune=True
    # when fine-tuning, remember to freeze some layers or lower the learning rate
    finetune_from_ckp_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.bin'

    # training state is saved here so that training can resume after an interruption
    train_state_dir: str = PROJECT_ROOT + '/model_save/train_latest_state'

    output_dir: str = PROJECT_ROOT + '/model_save/pretrain'

    logging_steps: int = 50
    save_steps: int = 10000

    # dataset_cache_dir: str = PROJECT_ROOT + '/data/.cache'
    # trainer_log_file: str = PROJECT_ROOT + '/logs/trainer.log'

    keep_latest_n_ckp: int = 8                  # maximum number of best-scoring checkpoints to keep during training

    seed: int = 23333

    dataloader_buffer_size: int = 50000
    max_seq_len: int = 256                      # maximum sequence length, default: 256


# ======================================================================================
# Model configuration
@dataclass
class T5ModelConfig:

    d_ff: int = 3072                            # feed-forward layer dimension

    d_model: int = 768                          # embedding (hidden) dimension
    num_heads: int = 12                         # number of attention heads; d_model // num_heads == d_kv
    d_kv: int = 64                              # d_model // num_heads

    num_decoder_layers: int = 10                # number of Transformer decoder layers
    num_layers: int = 10                        # number of Transformer encoder layers
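

# ======================================================================================
# Minimal usage sketch (an illustrative addition, not part of the original training code):
# the classes above are plain dataclasses, so a script can take the defaults and override
# individual fields per run; the specific overrides below are arbitrary examples. Running
# this module directly prints the resolved TrainConfig as a quick sanity check.
if __name__ == '__main__':
    from dataclasses import asdict, replace

    train_config = TrainConfig()
    # hypothetical smaller debug run: shrink the per-GPU batch size and the sequence length
    debug_config = replace(train_config, batch_size_per_gpu=4, max_seq_len=128)

    for field_name, value in asdict(debug_config).items():
        print(f'{field_name}: {value}')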