|
from transformers import TFDistilBertModel |
|
from transformers import DistilBertConfig |
|
from transformers import AutoTokenizer |
|
from transformers import TFAutoModelForTokenClassification |
|
from transformers import create_optimizer |
|
from transformers import DataCollatorForTokenClassification |
|
import tensorflow as tf |
|
|
|
|
|
from transformers.keras_callbacks import PushToHubCallback |
|
from tensorflow.keras.callbacks import TensorBoard |
|
|
|
|
|
from datasets import load_dataset, load_metric |
|
|
|
# Load the 'conll2003' dataset.
dataset = load_dataset('conll2003')

# `task` picks the "<task>_tags" label column read further down in this script.
task = 'ner'
model_checkpoint = 'distilbert-base-uncased'

# Custom DistilBERT configuration. It is derived from `model_checkpoint`
# rather than from a second copy of the checkpoint name, so the two cannot
# drift apart if the checkpoint is changed.
my_config = DistilBertConfig.from_pretrained(
    model_checkpoint, activation='relu', attention_dropout=0.4
)

# NOTE(review): `tf_model` is constructed directly from `my_config` and is not
# referenced again in the visible part of this script -- confirm it is needed.
tf_model = TFDistilBertModel(my_config)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# True: every sub-token of a word receives the word's label.
# False: only the first sub-token does; the others receive -100.
label_all_tokens = True
|
|
|
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Reads ``examples["tokens"]`` and ``examples[f"{task}_tags"]`` and relies on
    the module-level ``tokenizer``, ``task`` and ``label_all_tokens``.

    Returns the tokenizer output with an extra ``"labels"`` entry holding one
    list of label ids per example:

    * tokens without a source word (``word_ids`` gives ``None``) get -100;
    * the first token of each word gets that word's label;
    * further tokens of the same word get the word's label when
      ``label_all_tokens`` is truthy, otherwise -100.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned = []
    for row, word_labels in enumerate(examples[f"{task}_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No originating word for this token.
                row_labels.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation of the previous word, and only first
                # sub-tokens are to be labelled.
                row_labels.append(-100)
            else:
                row_labels.append(word_labels[word])
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded
|
|
|
|
|
# Tokenize the dataset in batches and attach the aligned label ids.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Show one raw training example next to its aligned label ids.
print(dataset['train'][0])
# Index the row first and the column second: this reads a single example
# instead of pulling the whole 'labels' column just to take its first item.
print(tokenized_datasets['train'][0]['labels'])

# Label names of the tag column, plus lookups in both directions.
label_list = dataset['train'].features[f'{task}_tags'].feature.names
id2label = dict(enumerate(label_list))
label2id = {label: i for i, label in id2label.items()}
|
|
|
# Token-classification model loaded from the checkpoint, with one output
# class per entry of `label_list`.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

num_train_epochs = 3
batch_size = 16

# Whole batches per epoch (integer division) times the number of epochs.
num_train_steps = num_train_epochs * (
    len(tokenized_datasets['train']) // batch_size
)

optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

# No loss is passed here; presumably the model falls back to its own
# built-in loss -- confirm against the transformers documentation.
model.compile(optimizer=optimizer)
|
|
|
# Collator used to assemble batches; asked to return NumPy arrays.
data_collator = DataCollatorForTokenClassification(
    tokenizer, return_tensors='np'
)

# Training data: shuffled.
train_set = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation data: same batch size and collator, original order kept.
validation_set = model.prepare_tf_dataset(
    tokenized_datasets['validation'],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)
|
|
|
# The hub model id is built from the last '/'-separated component of the
# checkpoint name and the task name.
model_name = model_checkpoint.rsplit('/', 1)[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

# TensorBoard logs go to ./model/logs.
tensorboard_callback = TensorBoard(log_dir='./model/logs')

# Hub upload callback; it works out of ./tc_model_save and is given the
# tokenizer alongside the target model id.
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)
|
import numpy as np |
|
from transformers.keras_callbacks import KerasMetricCallback |
|
# Load the seqeval metric and exercise it once by scoring a training example
# against itself.
metric = load_metric("seqeval")
example = dataset["train"][4]

# Map the example's tag ids to their label names.
labels = [label_list[tag_id] for tag_id in example[f"{task}_tags"]]

# The result of this call is neither stored nor printed.
metric.compute(predictions=[labels], references=[labels])
|
|
|
|
|
def compute_metrics(p):
    """Compute overall seqeval scores for a batch of predictions.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are reduced to
    class ids with an argmax over axis 2. Positions whose label is -100 are
    dropped, and the remaining ids are mapped to names through the
    module-level ``label_list`` before being passed to ``metric.compute``.

    Returns a dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
    ``"accuracy"``, taken from the metric's ``overall_*`` results.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only the positions that carry a real label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append(
            [label_list[predicted_id] for predicted_id, _ in kept]
        )
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(
        predictions=true_predictions, references=true_labels
    )

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }
|
|
|
|
|
# Callback that evaluates `compute_metrics` on the validation set.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

# The metric callback is listed first, ahead of the logging and upload
# callbacks.
callbacks = (metric_callback, tensorboard_callback, push_to_hub_callback)

model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

# Publish the configuration of the model that was actually trained.
# This used to push `my_config`, which is never passed to `model` (the model
# is loaded from `model_checkpoint` without it) and carries a different
# activation and attention dropout. Uploading it to the repository that
# `push_to_hub_callback` targets would leave that repository's configuration
# inconsistent with the fine-tuned weights and label mapping.
model.config.push_to_hub(push_to_hub_model_id)