import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    TFDistilBertModel,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

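# Load the CoNLL-2003 benchmark (pre-split tokens with NER / POS / chunk tags).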
dataset = load_dataset('conll2003')

task = 'ner'
model_checkpoint = 'distilbert-base-uncased'

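# Build a custom DistilBERT configuration and a bare (headless) DistilBERT model
# from it; the token-classification model that actually gets fine-tuned is loaded
# separately from the checkpoint below.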
my_config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased", activation='relu', attention_dropout=0.4
)
tf_model = TFDistilBertModel(my_config)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
label_all_tokens = True

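# Tokenize the pre-split words and align the word-level NER tags with the
# resulting sub-word tokens; special tokens get the ignored label -100.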
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

print(dataset['train'][0])
print(tokenized_datasets['train']['labels'][0])

label_list = dataset['train'].features[f'{task}_tags'].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

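# DistilBERT with a token-classification head sized to the CoNLL-2003 NER label set.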
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

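# AdamW optimizer with a linear learning-rate decay schedule over all training
# steps and no warmup.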
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0
)

# No explicit loss: Hugging Face TF models fall back to their internal loss
# computation when compiled without one.
model.compile(optimizer=optimizer)

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')

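# Convert the tokenized splits into tf.data.Dataset objects, padding each batch
# with the data collator.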
train_set = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

validation_set = model.prepare_tf_dataset(
    tokenized_datasets['validation'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator
)

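# Callbacks: TensorBoard logging and pushing checkpoints to the Hugging Face Hub
# during training.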
model_name = model_checkpoint.split('/')[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

tensorboard_callback = TensorBoard(log_dir='./model/logs')

push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id
)
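
# Sanity check: scoring seqeval with the gold labels as both predictions and
# references should yield perfect scores.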
example = dataset["train"][4]
metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"{task}_tags"]]
print(metric.compute(predictions=[labels], references=[labels]))


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]

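# Fine-tune; the metric callback computes seqeval precision/recall/F1 on the
# validation set after each epoch.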
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks
)

# Upload the fine-tuned model's config (including the label mappings) to the same
# Hub repo the callback pushed to.
model.config.push_to_hub(push_to_hub_model_id)