IoannisTr committed on
Commit
bf1e802
1 Parent(s): 6f687ac

Upload FinBERT_training.py

Browse files
Files changed (1) hide show
  1. FinBERT_training.py +82 -0
FinBERT_training.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

# These switches are assigned before the third-party imports that follow, so
# they are already present in the environment when those libraries load.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # tokenizers: no thread parallelism
os.environ['WANDB_DISABLED'] = "true"  # keep Weights & Biases reporting off
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)
from datasets import Dataset
#######################################
########## FinBERT training ###########
#######################################


class args:
    """Script-level settings, read as class attributes (e.g. ``args.model``)."""

    # Checkpoint identifier passed to the ``from_pretrained`` loaders
    # (tokenizer and sequence-classification model).
    model = 'ProsusAI/finbert'
# Fixed seed so the train / validation / test partition is identical on every
# run; without it each execution trains and evaluates on different rows.
SPLIT_SEED = 42

# Column names are supplied explicitly: first column -> labels, second -> messages.
df = pd.read_csv(
    'all-data.csv',
    names=['labels', 'messages'],
    encoding='ISO-8859-1',
)

# Keep just these two columns, text first.
df = df[['messages', 'labels']]

# Replace the label values with the integer class ids produced by LabelEncoder.
# `le` is kept so the id -> class-name mapping stays available afterwards.
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])

X, y = df['messages'].values, df['labels'].values

# Hold out 10% as a test split, then take 20% of the remainder for validation.
# NOTE(review): xtest / ytest are not consumed anywhere else in this script.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain, test_size=0.2, random_state=SPLIT_SEED
)

# Wrap the arrays as `datasets.Dataset` objects with 'text' and 'labels' columns.
train_dataset_raw = Dataset.from_dict({'text': xtrain, 'labels': ytrain})
valid_dataset_raw = Dataset.from_dict({'text': xvalid, 'labels': yvalid})
# Tokenizer belonging to the checkpoint named in `args.model`.
tokenizer = AutoTokenizer.from_pretrained(args.model)


def tokenize_fn(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping whose ``'text'`` entry holds the raw input text.

    Returns:
        The result of calling ``tokenizer`` on that text. No padding option is
        passed here.
    """
    return tokenizer(examples['text'], truncation=True)
# batched=True makes `map` hand tokenize_fn a batch of rows per call.
train_dataset = train_dataset_raw.map(tokenize_fn, batched=True)
valid_dataset = valid_dataset_raw.map(tokenize_fn, batched=True)

# tokenize_fn does not pad, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer)
# The integer ids used for training come from `le` (LabelEncoder), so record
# that exact mapping in the model config. Without this, the checkpoint's own
# id2label table is carried over unchanged; it need not agree with the
# encoder's ordering, and the saved model could then name the wrong class.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    args.model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
# Per-device training batch size; evaluation uses twice this value.
train_batch_size = 16

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
# NOTE(review): `do_predict=True` is set, but nothing in this script calls
# `trainer.predict`.
train_args = TrainingArguments(
    './Finbert Trained/',  # output directory
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=2 * train_batch_size,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    do_eval=True,
    do_train=True,
    do_predict=True,
    evaluation_strategy='epoch',  # evaluate at the end of each epoch
    save_strategy="no",  # no checkpoints are written during training
)
# Bundle the model, hyper-parameters, tokenized splits and collator.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()
# Save the fine-tuned model (config and weights).
model.save_pretrained('fine_tuned_FinBERT')
# Save the tokenizer. It is written to a `tokenizer/` sub-directory of the
# model folder, so it has to be loaded from that sub-path.
tokenizer.save_pretrained("fine_tuned_FinBERT/tokenizer/")