from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel, AutoConfig
from datasets import load_dataset, DatasetDict

# Load the CodeParrot training and validation datasets
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Collect both splits in a DatasetDict
raw_datasets = DatasetDict(
    {
        # Training split (full size commented out; a tiny subset is used for debugging)
        # "train": ds_train,  # .shuffle().select(range(50000)),
        "train": ds_train.shuffle().select(range(10)),
        # Validation split
        # "valid": ds_valid,  # .shuffle().select(range(500))
        "valid": ds_valid.shuffle().select(range(1)),
    }
)
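
# Optional sanity check (not part of the original script): peek at the splits and at
# the first training example to confirm the "content" column holds raw source code.
print(raw_datasets)
print(raw_datasets["train"][0]["content"][:200])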

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    # Tokenize the "content" field of the first two training examples
    raw_datasets["train"][:2]["content"],
    # Truncate any text longer than max_length
    truncation=True,
    # Maximum chunk length (128 tokens)
    max_length=context_length,
    # Return the overflowing tokens as extra chunks instead of discarding them
    return_overflowing_tokens=True,
    # Also return the length of each produced chunk
    return_length=True,
)
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    # Keep only chunks that are exactly context_length tokens long;
    # shorter final chunks are dropped
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
print(tokenized_datasets)

# Build a GPT-2 configuration sized for the custom tokenizer and context length
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Initialize a fresh (untrained) model from the configuration
model = GPT2LMHeadModel(config)

# Count the parameters
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Use the end-of-sequence token as the padding token so the data collator
# can batch sequences of different lengths.
tokenizer.pad_token = tokenizer.eos_token

# Data collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    # Effective batch size = 32 * 8 = 256 sequences per optimizer step
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)
print(trainer)
trainer.train()
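
# After training, the model can be tried out with a text-generation pipeline.
# This is a minimal sketch (not in the original script); the prompt and the
# generation settings below are assumptions.
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
txt = "# create some data\nimport numpy as np\n"
print(pipe(txt, num_return_sequences=1, max_new_tokens=32)[0]["generated_text"])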