aniketnikam06 committed on
Commit
ee3e9cf
1 Parent(s): bcd752a

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +28 -0
main.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune GPT-2 as a causal language model on the 'text' column of i2ebuddy/website_data."""
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token, so padding="max_length" would raise
# "Asking to pad but the tokenizer does not have a padding token".
# Reusing EOS as the pad token is the usual workaround.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")


def tokenize_batch(examples):
    """Tokenize a batch of rows, padding/truncating each 'text' entry to 512 tokens."""
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


dataset = load_dataset('i2ebuddy/website_data', split='train')
dataset = dataset.map(tokenize_batch, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# The tokenized rows carry no "labels", so the model would return no loss and
# Trainer would fail on the first step. With mlm=False the collator copies
# input_ids into labels (padding positions are set to -100 and ignored by the
# loss), which is what causal-LM fine-tuning needs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE: per-epoch evaluation was requested originally, but no eval_dataset is
# supplied, which makes Trainer raise at construction time. Evaluation is left
# at its default (disabled); to re-enable it, hold out a split and pass it as
# eval_dataset together with the evaluation-strategy argument.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none",  # do not report to any service for logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()