Spaces:
Runtime error
Runtime error
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel, AutoConfig | |
from datasets import load_dataset, DatasetDict | |
# Load the CodeParrot training and validation splits.
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Bundle both splits into one DatasetDict. Only a tiny shuffled sample of each
# split is kept (10 training rows, 1 validation row); shuffle() is called
# without a seed. For a full run, use ds_train / ds_valid directly, or a larger
# sample such as .shuffle().select(range(50000)) / .select(range(500)).
raw_datasets = DatasetDict(
    train=ds_train.shuffle().select(range(10)),
    valid=ds_valid.shuffle().select(range(1)),
)

# Number of tokens per chunk fed to the model.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Demo: tokenize the "content" field of the first two training samples.
sample_texts = raw_datasets["train"][:2]["content"]
outputs = tokenizer(
    sample_texts,
    truncation=True,  # cut text longer than max_length ...
    max_length=context_length,  # ... at 128 tokens
    return_overflowing_tokens=True,  # also return the tokens beyond max_length
    return_length=True,  # report the length of each returned sequence
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
def tokenize(element):
    """Tokenize a batch of documents into fixed-length chunks of input IDs.

    Args:
        element: A mapping with a ``"content"`` entry holding the texts to
            tokenize (this function is passed to ``raw_datasets.map`` with
            ``batched=True``).

    Returns:
        A dict with a single key, ``"input_ids"``, listing every chunk whose
        reported length equals ``context_length``. Shorter chunks are dropped.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only chunks that are exactly context_length tokens long, so every
    # retained sequence has the same size.
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}
# Tokenize every split in batches, dropping the original columns so that only
# the columns returned by tokenize() remain.
source_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
print(tokenized_datasets)

# Build a GPT-2 configuration whose vocabulary size and special-token ids come
# from our tokenizer.
# NOTE(review): recent transformers releases appear to size GPT-2 position
# embeddings from `n_positions`; confirm that `n_ctx` still has an effect here.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Instantiate the model from the config alone (no from_pretrained call here).
model = GPT2LMHeadModel(config)

# Count the parameters and report the total in millions.
model_size = sum(param.numel() for param in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Reuse the end-of-sequence token as the padding token so the collator has a
# pad token available when batching sequences.
tokenizer.pad_token = tokenizer.eos_token

# Data collator for language modeling with masked-LM disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Sanity check: collate the first five training examples and print the shape
# of every entry in the resulting batch.
train_split = tokenized_datasets["train"]
sample_rows = [train_split[i] for i in range(5)]
out = data_collator(sample_rows)
for key in out.keys():
    print(f"{key} shape: {out[key].shape}")
from transformers import Trainer, TrainingArguments | |
# Training hyperparameters, grouped by purpose.
args = TrainingArguments(
    output_dir="codeparrot-ds",
    # Batching: 32 samples per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimization schedule.
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    # Evaluate, log and checkpoint every 5,000 steps.
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    save_steps=5_000,
    # Mixed precision and Hub upload are both switched off.
    fp16=False,
    push_to_hub=False,
)

# Wire the model, tokenizer, collator and both tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)
print(trainer)

# Start training.
trainer.train()