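"""Fine-tune DistilBERT for named-entity recognition (NER) on the CoNLL-2003
dataset using TensorFlow/Keras and the Hugging Face Transformers library."""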
import numpy as np
import tensorflow as tf
# Note: newer `datasets` releases drop load_metric; use the separate `evaluate` package there.
from datasets import load_dataset, load_metric
from tensorflow.keras.callbacks import TensorBoard
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    TFDistilBertModel,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
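# CoNLL-2003 provides whitespace-split `tokens` with per-word `ner_tags`
# (plus `pos_tags` and `chunk_tags`, unused here).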
dataset = load_dataset('conll2003')
task = 'ner'
model_checkpoint = 'distilbert-base-uncased'
# Example of customizing a DistilBertConfig (ReLU activation, higher attention
# dropout). The `tf_model` built from it is not the model that is fine-tuned
# below; that one is loaded from the pretrained checkpoint further down.
my_config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased", activation='relu', attention_dropout=0.4
)
tf_model = TFDistilBertModel(my_config)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
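# If True, every subword piece of a word gets the word's label; if False, only
# the first piece does and the rest receive -100 (ignored by the loss).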
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align each word-level NER label with the
    subword tokens the tokenizer produces."""
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id of None. We set their label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the
            # current label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
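# Apply the alignment function to every split in one batched pass, then
# inspect the first training example and its aligned labels.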
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
print(dataset['train'][0])
print(tokenized_datasets['train']['labels'][0])
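# The dataset's ClassLabel feature supplies the tag names; build the
# id <-> label maps that go into the model config.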
label_list = dataset['train'].features[f'{task}_tags'].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}
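# Load the pretrained checkpoint with a randomly initialized
# token-classification head sized to the label set.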
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
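# create_optimizer returns an AdamW optimizer whose learning rate decays
# linearly from init_lr to zero over num_train_steps.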
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)
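# No loss argument: Transformers TF models fall back to their internal loss
# when labels are present in the input dict.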
model.compile(optimizer=optimizer)
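# Dynamically pad inputs per batch; labels are padded with -100 so the padded
# positions are ignored by the loss.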
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')
train_set = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = model.prepare_tf_dataset(
    tokenized_datasets['validation'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
model_name = model_checkpoint.split('/')[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"
tensorboard_callback = TensorBoard(log_dir='./model/logs')
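# Uploads checkpoints and the tokenizer to the Hugging Face Hub during
# training; requires being logged in first (e.g. via `huggingface-cli login`).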
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)
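# Sanity-check the seqeval metric: scoring the gold labels against themselves
# should return perfect precision/recall/F1.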
example = dataset["train"][4]
metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])
def compute_metrics(p):
    """Convert logits to tag predictions, drop ignored (-100) positions, and
    return overall seqeval precision/recall/F1/accuracy."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
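# Runs compute_metrics on the validation set at the end of each epoch.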
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)
callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)
# Push the fine-tuned model's config (not the unused `my_config`, which lacks
# the NER label mappings) so the Hub repo stays consistent.
model.config.push_to_hub(push_to_hub_model_id)