Akaash committed on
Commit
405f3d6
1 Parent(s): 6367d1d

add Python code for fine-tuning

Files changed (1)
  1. transformer.py +151 -0
transformer.py ADDED
@@ -0,0 +1,151 @@
+ from transformers import TFDistilBertModel
+ from transformers import DistilBertConfig
+ from transformers import AutoTokenizer
+ from transformers import TFAutoModelForTokenClassification
+ from transformers import create_optimizer
+ from transformers import DataCollatorForTokenClassification
+ import tensorflow as tf
+
+
+ from transformers.keras_callbacks import PushToHubCallback
+ from tensorflow.keras.callbacks import TensorBoard
+
+
+ from datasets import load_dataset, load_metric
+
+ dataset = load_dataset('conll2003')
+
+ task = 'ner'
+ model_checkpoint = 'distilbert-base-uncased'
+
+ my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation='relu', attention_dropout=0.4)
+ tf_model = TFDistilBertModel(my_config)
+
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ label_all_tokens = True
+
+ def tokenize_and_align_labels(examples):
+     tokenized_inputs = tokenizer(
+         examples["tokens"], truncation=True, is_split_into_words=True
+     )
+
+     labels = []
+     for i, label in enumerate(examples[f"{task}_tags"]):
+         word_ids = tokenized_inputs.word_ids(batch_index=i)
+         previous_word_idx = None
+         label_ids = []
+         for word_idx in word_ids:
+             # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+             # ignored in the loss function.
+             if word_idx is None:
+                 label_ids.append(-100)
+             # We set the label for the first token of each word.
+             elif word_idx != previous_word_idx:
+                 label_ids.append(label[word_idx])
+             # For the other tokens in a word, we set the label to either the current label or -100, depending on
+             # the label_all_tokens flag.
+             else:
+                 label_ids.append(label[word_idx] if label_all_tokens else -100)
+             previous_word_idx = word_idx
+
+         labels.append(label_ids)
+
+     tokenized_inputs["labels"] = labels
+     return tokenized_inputs
+
+
+ tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
+
+ print(dataset['train'][0])
+ print(tokenized_datasets['train']['labels'][0])
+
+ label_list = dataset['train'].features[f'{task}_tags'].feature.names
+ id2label = {i: label for i, label in enumerate(label_list)}
+ label2id = {label: i for i, label in enumerate(label_list)}
+
+ model = TFAutoModelForTokenClassification.from_pretrained(
+     model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
+ )
+
+ num_train_epochs = 3
+ batch_size = 16
+ num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
+ optimizer, lr_schedule = create_optimizer(
+     init_lr=2e-5,
+     num_train_steps=num_train_steps,
+     num_warmup_steps=0
+ )
+
+ model.compile(optimizer=optimizer)
+
+ data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')
+
+ train_set = model.prepare_tf_dataset(
+     tokenized_datasets['train'],
+     shuffle=True,
+     batch_size=batch_size,
+     collate_fn=data_collator
+ )
+
+ validation_set = model.prepare_tf_dataset(
+     tokenized_datasets['validation'],
+     shuffle=False,
+     batch_size=batch_size,
+     collate_fn=data_collator
+ )
+
+ model_name = model_checkpoint.split('/')[-1]
+ push_to_hub_model_id = f"{model_name}-finetuned-{task}"
+
+ tensorboard_callback = TensorBoard(log_dir='./model/logs')
+
+ push_to_hub_callback = PushToHubCallback(
+     output_dir="./tc_model_save",
+     tokenizer=tokenizer,
+     hub_model_id=push_to_hub_model_id
+ )
+ import numpy as np
+ from transformers.keras_callbacks import KerasMetricCallback
+ example = dataset["train"][4]
+ metric = load_metric("seqeval")
+ labels = [label_list[i] for i in example[f"{task}_tags"]]
+ metric.compute(predictions=[labels], references=[labels])
+
+
+ def compute_metrics(p):
+     predictions, labels = p
+     predictions = np.argmax(predictions, axis=2)
+
+     # Remove ignored index (special tokens)
+     true_predictions = [
+         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
+         for prediction, label in zip(predictions, labels)
+     ]
+     true_labels = [
+         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
+         for prediction, label in zip(predictions, labels)
+     ]
+
+     results = metric.compute(predictions=true_predictions, references=true_labels)
+     return {
+         "precision": results["overall_precision"],
+         "recall": results["overall_recall"],
+         "f1": results["overall_f1"],
+         "accuracy": results["overall_accuracy"],
+     }
+
+
+ metric_callback = KerasMetricCallback(
+     metric_fn=compute_metrics, eval_dataset=validation_set
+ )
+
+ callbacks = (metric_callback, tensorboard_callback, push_to_hub_callback)
+
+ model.fit(
+     train_set,
+     validation_data=validation_set,
+     epochs=num_train_epochs,
+     callbacks=callbacks
+ )
+
+ my_config.push_to_hub('distilbert-base-uncased-finetuned-ner')
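For reference, a minimal inference sketch (not part of this commit) showing how the weights uploaded by PushToHubCallback could be loaded back after training. The repository id below is an assumption: PushToHubCallback publishes under the Hub namespace of whoever runs the script, so "<your-username>" is a placeholder, not a repo created here.

from transformers import pipeline

# Hypothetical repo id -- replace "<your-username>" with the Hub namespace the script pushed to.
ner = pipeline(
    "token-classification",
    model="<your-username>/distilbert-base-uncased-finetuned-ner",
    aggregation_strategy="simple",  # merge word-piece predictions back into whole entities
)
print(ner("Hugging Face is based in New York City."))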