dnnsdunca committed on
Commit 7b93fce
1 Parent(s): 67978fd

Create Model_Training.py

Files changed (1)
  1. Model_Training.py +44 -0
Model_Training.py ADDED
@@ -0,0 +1,44 @@
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+ from datasets import load_dataset
+
+ # Load dataset - CodeParrot is a good example dataset
+ dataset = load_dataset('codeparrot/code-to-text')
+
+ # Load pre-trained model and tokenizer
+ model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
+
+ # GPT-2 has no pad token by default; reuse the end-of-text token so max-length padding works
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['code'])
+
+ # Collator that builds labels from the input ids for causal language modelling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="epoch",
+     learning_rate=5e-5,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     num_train_epochs=3,
+     weight_decay=0.01,
+     push_to_hub=True,
+     hub_model_id='dnnsdunca/UANN',
+     hub_token='YOUR_HUGGINGFACE_TOKEN'
+ )
+
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets['train'],
+     eval_dataset=tokenized_datasets['validation'],
+     data_collator=data_collator,
+ )
+
+ # Train model
+ trainer.train()
+
+ # Save the model
+ model.save_pretrained('./codegen_model')
+ tokenizer.save_pretrained('./codegen_model')
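
After training, the checkpoint written to './codegen_model' can be loaded back for generation. A minimal sketch, assuming the paths from the script above; the prompt string and sampling settings are illustrative, not part of the committed file:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer saved by Model_Training.py
model = GPT2LMHeadModel.from_pretrained('./codegen_model')
tokenizer = GPT2Tokenizer.from_pretrained('./codegen_model')

# Encode a prompt and sample a completion (prompt and generation settings are illustrative)
inputs = tokenizer('def fibonacci(n):', return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))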