akaashp15
/

distilbert-base-uncased-finetuned-ner

+from transformers import TFDistilBertModel
+from transformers import DistilBertConfig
+from transformers import AutoTokenizer
+from transformers import TFAutoModelForTokenClassification
+from transformers import create_optimizer
+from transformers import DataCollatorForTokenClassification
+import tensorflow as tf
+from transformers.keras_callbacks import PushToHubCallback
+from tensorflow.keras.callbacks import TensorBoard
+from datasets import load_dataset, load_metric
+dataset = load_dataset('conll2003')
+task = 'ner'
+model_checkpoint = 'distilbert-base-uncased'
+my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation = 'relu', attention_dropout = 0.4)
+tf_model = TFDistilBertModel(my_config)
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+label_all_tokens = True
+def tokenize_and_align_labels(examples):
+    tokenized_inputs = tokenizer(
+        examples["tokens"], truncation=True, is_split_into_words=True
+    )
+    labels = []
+    for i, label in enumerate(examples[f"{task}_tags"]):
+        word_ids = tokenized_inputs.word_ids(batch_index=i)
+        previous_word_idx = None
+        label_ids = []
+        for word_idx in word_ids:
+            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+            # ignored in the loss function.
+            if word_idx is None:
+                label_ids.append(-100)
+            # We set the label for the first token of each word.
+            elif word_idx != previous_word_idx:
+                label_ids.append(label[word_idx])
+            # For the other tokens in a word, we set the label to either the current label or -100, depending on
+            # the label_all_tokens flag.
+            else:
+                label_ids.append(label[word_idx] if label_all_tokens else -100)
+            previous_word_idx = word_idx
+        labels.append(label_ids)
+    tokenized_inputs["labels"] = labels
+    return tokenized_inputs
+tokenized_datasets = dataset.map(tokenize_and_align_labels, batched = True)
+print(dataset['train'][0])
+print(tokenized_datasets['train']['labels'][0])
+label_list = dataset['train'].features[f'{task}_tags'].feature.names
+id2label = {i: label for i, label in enumerate(label_list)}
+label2id = {label: i for i, label in enumerate(label_list)}
+model = TFAutoModelForTokenClassification.from_pretrained(
+    model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id
+)
+num_train_epochs = 3
+batch_size = 16
+num_train_steps = (len(tokenized_datasets['train']) // batch_size) * num_train_epochs
+optimizer, lr_schedule = create_optimizer(
+    init_lr=2e-5,
+    num_train_steps=num_train_steps,
+    num_warmup_steps= 0
+)
+model.compile(optimizer = optimizer)
+data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='np')
+train_set = model.prepare_tf_dataset(
+    tokenized_datasets['train'],
+    shuffle = True,
+    batch_size = batch_size,
+    collate_fn = data_collator
+)
+validation_set = model.prepare_tf_dataset(
+    tokenized_datasets['validation'],
+    shuffle = False,
+    batch_size = batch_size,
+    collate_fn = data_collator
+)
+model_name = model_checkpoint.split('/')[-1]
+push_to_hub_model_id = f"{model_name}-finetuned-{task}"
+tensorboard_callback = TensorBoard(log_dir = './model/logs')
+push_to_hub_callback = PushToHubCallback(
+    output_dir= "./tc_model_save",
+    tokenizer=tokenizer,
+    hub_model_id=push_to_hub_model_id
+)
+import numpy as np
+from transformers.keras_callbacks import KerasMetricCallback
+example = dataset["train"][4]
+metric = load_metric("seqeval")
+labels = [label_list[i] for i in example[f"{task}_tags"]]
+metric.compute(predictions=[labels], references=[labels])
+def compute_metrics(p):
+    predictions, labels = p
+    predictions = np.argmax(predictions, axis=2)
+    # Remove ignored index (special tokens)
+    true_predictions = [
+        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
+        for prediction, label in zip(predictions, labels)
+    ]
+    true_labels = [
+        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
+        for prediction, label in zip(predictions, labels)
+    ]
+    results = metric.compute(predictions=true_predictions, references=true_labels)
+    return {
+        "precision": results["overall_precision"],
+        "recall": results["overall_recall"],
+        "f1": results["overall_f1"],
+        "accuracy": results["overall_accuracy"],
+    }
+metric_callback = KerasMetricCallback(
+    metric_fn=compute_metrics, eval_dataset=validation_set
+)
+callbacks = (metric_callback, tensorboard_callback, push_to_hub_callback)
+model.fit(
+    train_set,
+    validation_data = validation_set,
+    epochs = num_train_epochs,
+    callbacks = callbacks
+)
+my_config.push_to_hub('distilbert-base-uncased-finetuned-ner')