from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no padding token by default, so reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Prepare the training data
train_data = [
    {"input_text": "Wie konfiguriere ich den Sprachassistenten?", "output_text": "Um den Sprachassistenten zu konfigurieren, gehen Sie zu den Einstellungen..."},
    # Add further training examples here
]

# Create a Dataset object
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# Tokenize the data
def tokenize_function(examples):
    # With batched=True, `examples` is a dict of column lists. For a causal LM
    # such as GPT-2, question and answer are concatenated into a single
    # sequence, and the labels are the input ids themselves.
    texts = [
        q + tokenizer.eos_token + a + tokenizer.eos_token
        for q, a in zip(examples["input_text"], examples["output_text"])
    ]
    model_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
    # Mask padded positions so they are ignored by the loss
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, masks)]
        for ids, masks in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

# Set the training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start training
trainer.train()
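
# After training, the model can be saved and queried. The sketch below is only
# an illustrative assumption: the output directory and the generation settings
# are not part of the original script.
trainer.save_model("./finetuned-gpt2-assistant")
tokenizer.save_pretrained("./finetuned-gpt2-assistant")

# Quick smoke test: generate an answer for one of the training questions.
prompt = "Wie konfiguriere ich den Sprachassistenten?" + tokenizer.eos_token
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))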