fReEsPiRiT94 committed
Commit
68ca6da
1 Parent(s): 69a3efb

Create train.py

Files changed (1)
  1. train.py +47 -0
train.py ADDED
@@ -0,0 +1,47 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
+ from datasets import Dataset
+ import pandas as pd
+
+ # Load model and tokenizer
+ model = GPT2LMHeadModel.from_pretrained("gpt2")
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ # GPT-2 has no padding token by default; reuse the end-of-sequence token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Prepare data
+ train_data = [
+     {"input_text": "Wie konfiguriere ich den Sprachassistenten?", "output_text": "Um den Sprachassistenten zu konfigurieren, gehen Sie zu den Einstellungen..."},
+     # Add further training examples here
+ ]
+
+ # Create a Dataset object
+ train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
+
+ # Tokenize the data. With batched=True, `examples` is a dict of column lists.
+ # GPT-2 is a decoder-only model, so prompt and answer are joined into a single
+ # sequence; the labels are the input ids with padding positions masked out.
+ def tokenize_function(examples):
+     texts = [
+         question + tokenizer.eos_token + answer + tokenizer.eos_token
+         for question, answer in zip(examples["input_text"], examples["output_text"])
+     ]
+     model_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
+     model_inputs["labels"] = [
+         [token if mask == 1 else -100 for token, mask in zip(ids, attention)]
+         for ids, attention in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
+     ]
+     return model_inputs
+
+ tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
+
+ # Set the training parameters
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     save_steps=10_000,
+     save_total_limit=2,
+ )
+
+ # Initialize the trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_train_dataset,
+ )
+
+ # Start training
+ trainer.train()
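
Once the run finishes, the fine-tuned weights can be saved and tried out. A minimal sketch of how that might look, not part of this commit; the output directory "./sprachassistent-gpt2" is only a placeholder name:

# Hypothetical follow-up to train.py: persist the fine-tuned model and
# generate an answer with it. The directory name is an example only.
trainer.save_model("./sprachassistent-gpt2")
tokenizer.save_pretrained("./sprachassistent-gpt2")

from transformers import pipeline

generator = pipeline("text-generation", model="./sprachassistent-gpt2")
result = generator("Wie konfiguriere ich den Sprachassistenten?", max_new_tokens=50)
print(result[0]["generated_text"])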