from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, DatasetDict

# Load the train and validation datasets
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Build a DatasetDict; small subsets are selected here for a quick test run
raw_datasets = DatasetDict(
    {
        # Training set
        # "train": ds_train,  # .shuffle().select(range(50000)),
        "train": ds_train.shuffle().select(range(10)),
        # Validation set
        # "valid": ds_valid,  # .shuffle().select(range(500))
        "valid": ds_valid.shuffle().select(range(1)),
    }
)

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    # The "content" field of the first two training samples
    raw_datasets["train"][:2]["content"],
    # Truncate any text longer than max_length (here 128 tokens)
    truncation=True,
    max_length=context_length,
    # Return the tokens beyond max_length as additional chunks instead of dropping them
    return_overflowing_tokens=True,
    # Return the token length of each chunk
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")


def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    # Keep only chunks that are exactly context_length tokens long
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
print(tokenized_datasets)

# Create a GPT-2 model configuration sized to our tokenizer and context length
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Initialize the model from the config (random weights, no pretrained checkpoint)
model = GPT2LMHeadModel(config)

# Parameter count
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Set the tokenizer's pad token to its end-of-sequence token.
# This lets the data collator pad sequences of different lengths with the EOS
# token when batching.
tokenizer.pad_token = tokenizer.eos_token

# Data collator for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)
print(trainer)

trainer.train()
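
# --- Optional follow-up: a minimal sketch, not part of the original script ---
# Assumptions: the training run above finished and `trainer`, `model`, and
# `tokenizer` are still in scope; the prompt string and generation settings
# below are illustrative only.
import math

# Perplexity is exp(cross-entropy loss) on the validation set.
eval_results = trainer.evaluate()
print(f"Validation perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Sample a short completion from the (briefly) trained model.
model.eval()
prompt = "# create some data\nx = np.random.randn(100)\n"  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(generated[0]))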