import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    TFDistilBertModel,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

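# Load the CoNLL-2003 benchmark (pre-split tokens with NER / POS / chunk tags).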
dataset = load_dataset('conll2003')

task = 'ner'
model_checkpoint = 'distilbert-base-uncased'

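# Build a custom DistilBERT configuration and a bare (headless) DistilBERT model
# from it; the token-classification model that actually gets fine-tuned is loaded
# separately from the checkpoint below.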
my_config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased", activation='relu', attention_dropout=0.4
)
tf_model = TFDistilBertModel(my_config)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
label_all_tokens = True

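# Tokenize the pre-split words and align the word-level NER tags with the
# resulting sub-word tokens; special tokens get the ignored label -100.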
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

print(dataset['train'][0])
print(tokenized_datasets['train']['labels'][0])

label_list = dataset['train'].features[f'{task}_tags'].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

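# DistilBERT with a token-classification head sized to the CoNLL-2003 NER label set.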
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

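# AdamW optimizer with a linear learning-rate decay schedule over all training
# steps and no warmup.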
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0
)

# No explicit loss: Hugging Face TF models fall back to their internal loss
# computation when compiled without one.
model.compile(optimizer=optimizer)

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')

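# Convert the tokenized splits into tf.data.Dataset objects, padding each batch
# with the data collator.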
train_set = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

validation_set = model.prepare_tf_dataset(
    tokenized_datasets['validation'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator
)

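# Callbacks: TensorBoard logging and pushing checkpoints to the Hugging Face Hub
# during training.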
model_name = model_checkpoint.split('/')[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

tensorboard_callback = TensorBoard(log_dir='./model/logs')

push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id
)
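
# Sanity check: scoring seqeval with the gold labels as both predictions and
# references should yield perfect scores.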
example = dataset["train"][4]
metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"{task}_tags"]]
print(metric.compute(predictions=[labels], references=[labels]))


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]

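# Fine-tune; the metric callback computes seqeval precision/recall/F1 on the
# validation set after each epoch.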
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks
)

# Upload the fine-tuned model's config (including the label mappings) to the same
# Hub repo the callback pushed to.
model.config.push_to_hub(push_to_hub_model_id)