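"""Fine-tune DistilBERT for named-entity recognition (NER) on CoNLL-2003 with the
TensorFlow/Keras API of Hugging Face Transformers, then push the result to the Hub."""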
import numpy as np
import tensorflow as tf
from datasets import load_dataset, load_metric
from tensorflow.keras.callbacks import TensorBoard
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
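# Load the CoNLL-2003 benchmark; its "ner_tags" column holds one tag id per word.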
dataset = load_dataset("conll2003")
task = "ner"
model_checkpoint = "distilbert-base-uncased"

# Custom config: ReLU activation and a higher attention dropout than the default.
# These settings are applied to the fine-tuned model below.
my_config = DistilBertConfig.from_pretrained(
    model_checkpoint, activation="relu", attention_dropout=0.4
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
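# If True, subword pieces after the first inherit their word's label; if False,
# they are masked out with -100 in the loss.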
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to subword tokens."""
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current
            # label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Inspect one example before and after tokenization and label alignment.
print(dataset["train"][0])
print(tokenized_datasets["train"]["labels"][0])
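# Build the label name <-> id mappings from the dataset's NER tag feature.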
label_list = dataset["train"].features[f"{task}_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}
# Load the pretrained checkpoint with a fresh token-classification head, applying
# the custom activation and attention dropout from my_config.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    activation=my_config.activation,
    attention_dropout=my_config.attention_dropout,
)
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_train_epochs

# AdamW with a linearly decaying learning rate and no warmup.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)
# No loss is passed: Transformers TF models compute the token-classification
# loss internally from the "labels" column.
model.compile(optimizer=optimizer)
# Dynamically pad each batch to its longest sequence; NumPy tensors are what
# prepare_tf_dataset expects from the collator.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

train_set = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"

tensorboard_callback = TensorBoard(log_dir="./model/logs")
# Periodically saves and uploads the model, tokenizer, and config during training.
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)
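# Sanity-check seqeval by scoring a gold example against itself
# (all metrics should come out as 1.0).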
example = dataset["train"][4]
metric = load_metric("seqeval")
labels = [label_list[i] for i in example[f"{task}_tags"]]
print(metric.compute(predictions=[labels], references=[labels]))
def compute_metrics(p):
    """Convert logits to tag names, drop masked positions, and score with seqeval."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens).
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
# Runs compute_metrics over the validation set at the end of each epoch.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)
callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)
# The PushToHubCallback uploads the fine-tuned model, tokenizer, and config at the
# end of training, so no separate config push is needed here.
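# A quick way to try the fine-tuned model is the token-classification pipeline.
# A minimal sketch, assuming training above completed; the example sentence and
# the aggregation_strategy value are illustrative choices, not part of the
# training recipe itself.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subword pieces into whole-entity spans
)
print(ner("Hugging Face is based in New York City."))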