import numpy as np
import tensorflow as tf
from datasets import load_dataset, load_metric
from tensorflow.keras.callbacks import TensorBoard
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    DistilBertConfig,
    TFAutoModelForTokenClassification,
    TFDistilBertModel,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

# Load the CoNLL-2003 dataset and define the task and base checkpoint.
task = 'ner'
model_checkpoint = 'distilbert-base-uncased'
dataset = load_dataset('conll2003')

# Build a customized DistilBERT config (ReLU activation, higher attention dropout)
# and instantiate a randomly initialized backbone from it. Note that this backbone
# is not the model fine-tuned below; only the config is pushed to the Hub at the end.
my_config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased", activation='relu', attention_dropout=0.4
)
tf_model = TFDistilBertModel(my_config)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
label_all_tokens = True


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align NER tags with the resulting subwords."""
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current
            # label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
print(dataset['train'][0])
print(tokenized_datasets['train']['labels'][0])

# Map between label ids and human-readable tag names.
label_list = dataset['train'].features[f'{task}_tags'].feature.names
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# Linear-decay optimizer; transformers models pick a suitable internal loss
# when compile() is called without one.
num_train_epochs = 3
batch_size = 16
num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5, num_train_steps=num_train_steps, num_warmup_steps=0
)
model.compile(optimizer=optimizer)

# Dynamically pad each batch; NumPy tensors feed prepare_tf_dataset.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')
train_set = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = model.prepare_tf_dataset(
    tokenized_datasets['validation'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Callbacks: TensorBoard logging and periodic pushes to the Hugging Face Hub.
model_name = model_checkpoint.split('/')[-1]
push_to_hub_model_id = f"{model_name}-finetuned-{task}"
tensorboard_callback = TensorBoard(log_dir='./model/logs')
push_to_hub_callback = PushToHubCallback(
    output_dir="./tc_model_save",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)

# seqeval computes entity-level precision/recall/F1 (load_metric is the older
# datasets API; newer code uses evaluate.load). Sanity-check it by scoring one
# example's gold tags against themselves, which should give perfect scores.
metric = load_metric("seqeval")
example = dataset["train"][4]
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])
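# Optional sanity check before training (an illustrative sketch, not part of the
# original script): print each subword token of the first training example next
# to its aligned label, making the -100 masking on special tokens and subword
# continuations visible. Uses only names already defined above.
sample = tokenized_datasets["train"][0]
for token, label_id in zip(
    tokenizer.convert_ids_to_tokens(sample["input_ids"]), sample["labels"]
):
    print(token, label_id if label_id == -100 else label_list[label_id])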
def compute_metrics(p):
    """Convert logits and padded label ids into seqeval's overall scores."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove the ignored index (special tokens) before scoring.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Evaluate seqeval metrics on the validation set at the end of each epoch.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=validation_set)

callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

# Push the customized config to the Hub as well.
my_config.push_to_hub('distilbert-base-uncased-finetuned-ner')
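# Illustrative follow-up (a sketch, not in the original script): load the
# fine-tuned weights that PushToHubCallback saved locally and run inference with
# a token-classification pipeline. The path "./tc_model_save" matches the
# output_dir above; framework="tf" is assumed because training used TensorFlow,
# and the sample sentence is made up.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="./tc_model_save",
    tokenizer=tokenizer,
    framework="tf",
    aggregation_strategy="simple",
)
print(ner("Hugging Face Inc. is a company based in New York City."))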