Akin Uman committed on
Commit
b20aeb2
·
1 Parent(s): d23112b

Add trainer

Browse files
Files changed (1) hide show
  1. train.py +36 -0
train.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Fine-tune a causal language model on a Turkish legal dataset and publish
# the resulting model and tokenizer to the Hugging Face Hub.

model_name = "mistralai/Mistral-7B-v0.1"
# Single source of truth for the Hub destination so the model and the
# tokenizer cannot drift apart into two different repositories.
hub_repo_id = "akinuman/turkish-legal-mistral"
# Explicit cap on sequence length; without it truncation and padding fall
# back to the tokenizer's model_max_length.
max_seq_length = 512

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batches have to be padded, and padding needs a pad token. Fall back to EOS
# when the checkpoint's tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load Turkish legal dataset
dataset = load_dataset("Renicames/turkish-law-chatbot")


# Preprocessing
def preprocess_function(examples):
    """Tokenize a batch of examples, truncating to ``max_seq_length`` tokens.

    Padding is intentionally not applied here: the data collator pads each
    batch to its longest member, which avoids storing fully padded rows.

    NOTE(review): assumes the dataset exposes a 'text' column -- confirm
    against the dataset card before running.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_length,
    )


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Per-epoch evaluation needs a held-out split. Use one if the dataset ships
# it; otherwise turn evaluation off instead of failing inside the Trainer.
eval_split = next(
    (name for name in ("validation", "test") if name in tokenized_dataset),
    None,
)
eval_dataset = tokenized_dataset[eval_split] if eval_split is not None else None

# Training configuration
# NOTE(review): newer transformers releases spell this argument
# `eval_strategy`; the original keyword is kept to match the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch" if eval_dataset is not None else "no",
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    num_train_epochs=3,
)

# With mlm=False the collator pads each batch and copies input_ids into
# `labels` (padding positions are ignored by the loss), which is what a
# causal LM needs in order to return a loss to the Trainer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

# Push to Hugging Face Hub
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)