Akaash
committed on
Commit
•
405f3d6
1
Parent(s):
6367d1d
add python code for fine tuning
Browse files- transformer.py +151 -0
transformer.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import TFDistilBertModel
|
2 |
+
from transformers import DistilBertConfig
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
from transformers import TFAutoModelForTokenClassification
|
5 |
+
from transformers import create_optimizer
|
6 |
+
from transformers import DataCollatorForTokenClassification
|
7 |
+
import tensorflow as tf
|
8 |
+
|
9 |
+
|
10 |
+
from transformers.keras_callbacks import PushToHubCallback
|
11 |
+
from tensorflow.keras.callbacks import TensorBoard
|
12 |
+
|
13 |
+
|
14 |
+
from datasets import load_dataset, load_metric
|
15 |
+
|
16 |
+
# --- Experiment constants ---------------------------------------------------
task = "ner"
model_checkpoint = "distilbert-base-uncased"
# True: every sub-word piece of a word receives that word's label.
# False: only the first piece is labelled, the remaining pieces get -100.
label_all_tokens = True

# Load the "conll2003" dataset through `datasets.load_dataset`.
dataset = load_dataset("conll2003")

# Tokenizer belonging to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# DistilBERT configuration overriding the activation ("relu") and the
# attention dropout (0.4), and a TFDistilBertModel constructed directly from
# that configuration (not through `from_pretrained`).
# NOTE(review): `tf_model` is not referenced anywhere else in this file —
# confirm whether it is still needed.
my_config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    activation="relu",
    attention_dropout=0.4,
)
tf_model = TFDistilBertModel(my_config)
|
26 |
+
|
27 |
+
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to tokens.

    Args:
        examples: Batch mapping providing ``"tokens"`` (one list of words per
            example) and ``f"{task}_tags"`` (one tag id per word).

    Returns:
        The tokenizer output, extended with a ``"labels"`` entry that holds
        one list of label ids per example. Positions whose word id is
        ``None`` (special tokens) are labelled -100 so that the loss function
        ignores them.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned_batch = []
    for batch_index, word_tags in enumerate(examples[f"{task}_tags"]):
        word_ids = encoded.word_ids(batch_index=batch_index)
        aligned = []
        # Pair each word id with the one before it (None before the first
        # token) to detect where a new word starts.
        for prev_idx, cur_idx in zip([None, *word_ids], word_ids):
            if cur_idx is None:
                # Special token: mark it as ignored.
                aligned.append(-100)
            elif cur_idx != prev_idx or label_all_tokens:
                # First piece of a word, or a continuation piece while
                # label_all_tokens is enabled: use the word's own tag.
                aligned.append(word_tags[cur_idx])
            else:
                # Continuation piece with label_all_tokens disabled.
                aligned.append(-100)
        aligned_batch.append(aligned)

    encoded["labels"] = aligned_batch
    return encoded
|
55 |
+
|
56 |
+
|
57 |
+
# Run the tokenize-and-align step over every split, one batch at a time.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Sanity check: print the first raw training example and its aligned labels.
# The row is indexed before the column so that only one example is read;
# indexing the column first would load every training label list just to
# print the first one.
print(dataset["train"][0])
print(tokenized_datasets["train"][0]["labels"])
|
61 |
+
|
62 |
+
# Tag names of the task, in id order, plus both lookup directions.
label_list = dataset["train"].features[f"{task}_tags"].feature.names
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}

# Token-classification model loaded from the checkpoint, sized to the tag set
# and carrying the id <-> label mappings.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
|
69 |
+
|
70 |
+
# --- Training hyper-parameters ----------------------------------------------
num_train_epochs = 3
batch_size = 16
# Whole batches per epoch (floor division) times the number of epochs.
num_train_steps = (
    len(tokenized_datasets["train"]) // batch_size
) * num_train_epochs

# Optimizer and learning-rate schedule: initial rate 2e-5, no warm-up steps.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

# Only an optimizer is passed to compile().
# NOTE(review): presumably this relies on the model's built-in loss — confirm.
model.compile(optimizer=optimizer)

# Collator that returns NumPy arrays; it is handed to both datasets below.
data_collator = DataCollatorForTokenClassification(
    tokenizer, return_tensors="np"
)

# Training data is shuffled, validation data is not.
train_set = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
|
96 |
+
|
97 |
+
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback

# Hub repository id: last path component of the checkpoint name followed by
# "-finetuned-<task>".
model_name = model_checkpoint.rsplit("/", 1)[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

# TensorBoard logs are written under ./model/logs.
tensorboard_callback = TensorBoard(log_dir="./model/logs")

# Hub callback configured with a local output directory, the tokenizer and
# the repository id computed above.
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)

# Load the seqeval metric and exercise it once by scoring one training
# example's gold tags against themselves; the result is discarded.
metric = load_metric("seqeval")
example = dataset["train"][4]
labels = [label_list[tag_id] for tag_id in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])
|
113 |
+
|
114 |
+
|
115 |
+
def compute_metrics(p):
    """Score predictions against gold labels with the module-level metric.

    Args:
        p: A pair ``(predictions, labels)``. The predicted id is taken as the
            argmax over axis 2 of ``predictions``
            (assumes ``(batch, sequence, num_labels)`` scores — TODO confirm).

    Returns:
        A dict with ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, read from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Drop positions carrying the ignored index (special tokens).
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
|
136 |
+
|
137 |
+
|
138 |
+
# Callback that runs compute_metrics on the validation set.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=validation_set,
)

callbacks = (metric_callback, tensorboard_callback, push_to_hub_callback)

# Fine-tune the token-classification model.
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

# Publish the configuration of the model that was actually trained.
# Pushing `my_config` here instead would upload a configuration that `model`
# never used (different activation and attention dropout, no label mappings)
# to the same repository id that the hub callback is configured with.
# NOTE(review): if training with `my_config` was the intent, it must be passed
# to the model when it is loaded — confirm with the author.
model.config.push_to_hub(push_to_hub_model_id)
|