# Fine-tuning GPT-2 on question/answer pairs with the Hugging Face Trainer.
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so the padding="max_length"
# calls during tokenization below do not raise a ValueError.
tokenizer.pad_token = tokenizer.eos_token
# Prepare the data: one dict per example with an input/output text pair
train_data = [
    {
        "input_text": "Wie konfiguriere ich den Sprachassistenten?",
        "output_text": "Um den Sprachassistenten zu konfigurieren, gehen Sie zu den Einstellungen...",
    },
    # Add further training examples here
]

# Build a Dataset object from the records (via a pandas DataFrame)
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples for fine-tuning.

    With ``batched=True``, ``Dataset.map`` passes *examples* as a dict
    mapping column name -> list of values (NOT a list of dicts), so the
    columns are read directly.  The original list comprehension iterated
    over the dict's keys and crashed with a TypeError.
    """
    inputs = examples["input_text"]
    outputs = examples["output_text"]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=128)
    # text_target= replaces the deprecated as_target_tokenizer() context manager
    labels = tokenizer(text_target=outputs, padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Drop the raw text columns so the Trainer only receives tensor-ready fields
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "output_text"],
)
# Configure the training hyper-parameters
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints and final model are written here
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two most recent checkpoints
)

# Initialize the Trainer with the model and tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start training
trainer.train()