from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 has no padding token by default; reuse the EOS token so that
# padding="max_length" below does not raise an error.
tokenizer.pad_token = tokenizer.eos_token

# Prepare data
train_data = [
    {
        "input_text": "How do I configure the voice assistant?",
        "output_text": "To configure the voice assistant, go to Settings...",
    },
    # Add further training examples here
]

# Create a Dataset object
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# Tokenize the data. GPT-2 is a decoder-only (causal) model, so prompt and
# answer are concatenated into a single sequence rather than tokenized as
# separate source/target texts; the labels are the input IDs with padding
# positions set to -100, which the loss function ignores.
def tokenize_function(examples):
    # With batched=True, `examples` is a dict of lists, not a list of dicts.
    texts = [
        inp + tokenizer.eos_token + out + tokenizer.eos_token
        for inp, out in zip(examples["input_text"], examples["output_text"])
    ]
    model_inputs = tokenizer(
        texts, padding="max_length", truncation=True, max_length=128
    )
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(
            model_inputs["input_ids"], model_inputs["attention_mask"]
        )
    ]
    return model_inputs

tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "output_text"],
)

# Set training parameters
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start training
trainer.train()
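
# A quick smoke test after training (a minimal sketch, not part of the
# original script; the prompt text and generation settings such as
# max_new_tokens are illustrative assumptions).
prompt = "How do I configure the voice assistant?" + tokenizer.eos_token
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
output_ids = model.generate(
    input_ids,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))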