In [1]:
!pip install danlp transformers datasets numpy flax seqeval

Collecting danlp
[?25l  Downloading https://files.pythonhosted.org/packages/62/6b/3a245c069f0a5376e565d67c2f9fb04a39e4d7e94c93c2d27e57c7bf9012/danlp-0.0.12-py3-none-any.whl (71kB)
[K     |████▋                           | 10kB 16.0MB/s eta 0:00:01[K     |█████████▏                      | 20kB 22.1MB/s eta 0:00:01[K     |█████████████▉                  | 30kB 25.0MB/s eta 0:00:01[K     |██████████████████▍             | 40kB 27.0MB/s eta 0:00:01[K     |███████████████████████         | 51kB 29.0MB/s eta 0:00:01[K     |███████████████████████████▋    | 61kB 29.9MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 9.4MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 33.3MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/8

In [2]:
from danlp.datasets import DDT
from transformers import (AutoConfig, AutoTokenizer, AutoModelForTokenClassification, 
                          DataCollatorForTokenClassification, TrainingArguments, Trainer)
from datasets import Dataset, load_metric
from functools import partial
import numpy as np

# Evaluation of Language Models for Danish

This notebook is an investigation into how much, if anything, is gained from including more languages into the training set of a language model at pretraining. We will finetune and evaluate three models:

1. `flax-community/roberta-base-danish` is a Danish RoBERTa-base model trained on the Danish part of the [mC4](https://github.com/allenai/allennlp/discussions/5265) dataset;
2. `flax-community/roberta-large-scandi` is a Scandinavian RoBERTa-base model, trained on the Danish, Norwegian and Swedish part of the [mC4](https://github.com/allenai/allennlp/discussions/5265) dataset;
3. `xlm-roberta-base` is a multilingual RoBERTa-base model trained on over 100 languages, on a filtered subset of the Common Crawl dataset.

## Named Entity Recognition

### Preparing the datasets

We start by loading the DaNE dataset for the NER task.

In [3]:
# Load the DaNE data
train, val, test = DDT().load_as_simple_ner(predefined_splits=True)

# Split docs and labels
train_docs, train_labels = train
val_docs, val_labels = val
test_docs, test_labels = test

print(f'Loaded {len(train_docs)} training samples, '
      f'{len(val_docs)} validation samples and '
      f'{len(test_docs)} test samples.')

Downloading file /tmp/tmptw7g3c2s
Loaded 4383 training samples, 564 validation samples and 565 test samples.


We next set up the labels in the dataset, converting them to a numeric representation.

In [4]:
# Get the set of all unique labels in the dataset
unique_labels = list({lbl for lbl_list in train_labels for lbl in lbl_list})

# Set up a numeric representation of the labels
label2id = {unique_labels[id]: id for id in range(len(unique_labels))}
id2label = {id: unique_labels[id] for id in range(len(unique_labels))}

print(f'There are {len(unique_labels)} unique labels in the dataset:')
print(unique_labels)

There are 9 unique labels in the dataset:
['B-PER', 'I-PER', 'O', 'I-LOC', 'B-ORG', 'B-MISC', 'I-MISC', 'B-LOC', 'I-ORG']


### Setting up the models

Next, we load the tokenisers and the models that we want to compare.

In [5]:
def prepare_model(name: str) -> dict:    
    config = AutoConfig.from_pretrained(name, 
                                        num_labels=len(unique_labels),
                                        label2id=label2id,
                                        id2label=id2label,
                                        finetuning_task='ner')
    
    tokenizer = AutoTokenizer.from_pretrained(name, 
                                              use_fast=True,
                                              add_prefix_space=True)
    
    try:
        model = AutoModelForTokenClassification.from_pretrained(name,
                                                                config=config)
    except OSError:
        model = AutoModelForTokenClassification.from_pretrained(name,
                                                                config=config,
                                                                from_flax=True)
    
    return dict(name=name, model=model, tokenizer=tokenizer)

### Setting up tokenisation of the datasets

We start by converting the datasets into the HuggingFace format.

In [6]:
train_dataset = Dataset.from_dict(dict(docs=train_docs, orig_labels=train_labels))
val_dataset = Dataset.from_dict(dict(docs=val_docs, orig_labels=val_labels))
test_dataset = Dataset.from_dict(dict(docs=test_docs, orig_labels=test_labels))

Next, we define a function which tokenises the dataset as well as aligning it with the labels in the dataset.

In [7]:
def tokenize_and_align_labels(examples: dict, tokenizer) -> dict:
    '''Tokenize all texts and align the labels with them'''
    tokenized_inputs = tokenizer(
        examples['docs'],
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples['orig_labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)#label2id[label[word_idx]])
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def tokenize_dataset(dataset: Dataset, tokenizer) -> Dataset:
    return dataset.map(partial(tokenize_and_align_labels, tokenizer=tokenizer),
                       batched=True,
                       num_proc=4,
                       desc="Running tokenizer on dataset")

Just to see that it worked, let's have a look at a tokenized dataset.

In [8]:
tokenizer = AutoTokenizer.from_pretrained('flax-community/roberta-base-danish', 
                                          use_fast=True,
                                          add_prefix_space=True)
tokenized_train = tokenize_dataset(train_dataset, tokenizer)
print(f'Sample document:')
print(list(zip(tokenized_train[0]["docs"], tokenized_train[0]["orig_labels"])))
print()
print(f'Tokenized document:')
print(list(zip([tokenizer.decode(tok).strip() for tok in tokenized_train[0]["input_ids"]], 
               [id2label[id] for id in tokenized_train[0]["labels"] if id != -100])))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=618.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1388356.0, style=ProgressStyle(descript…


  

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #0', max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #1', max=2.0, style=Progress…

  

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #2', max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #3', max=2.0, style=Progress…





Sample document:
[('På', 'O'), ('fredag', 'O'), ('har', 'O'), ('SID', 'B-ORG'), ('inviteret', 'O'), ('til', 'O'), ('reception', 'O'), ('i', 'O'), ('SID-huset', 'B-LOC'), ('i', 'O'), ('anledning', 'O'), ('af', 'O'), ('at', 'O'), ('formanden', 'O'), ('Kjeld', 'B-PER'), ('Christensen', 'I-PER'), ('går', 'O'), ('ind', 'O'), ('i', 'O'), ('de', 'O'), ('glade', 'O'), ('tressere', 'O'), ('.', 'O')]

Tokenized document:
[('På', 'O'), ('fredag', 'O'), ('har', 'O'), ('SID', 'B-ORG'), ('inviteret', 'O'), ('til', 'O'), ('reception', 'O'), ('i', 'O'), ('SID', 'B-LOC'), ('-', 'O'), ('huset', 'O'), ('i', 'O'), ('anledning', 'O'), ('af', 'O'), ('at', 'B-PER'), ('formanden', 'I-PER'), ('Kjeld', 'O'), ('Christensen', 'O'), ('går', 'O'), ('ind', 'O'), ('i', 'O'), ('de', 'O'), ('glade', 'O')]


### Finetuning the models

We now set up the actual finetuning of the models. We will be employing the `Trainer` class from the `transformers` library, and the following `compute_metrics` helper function is used during training to compute the metrics that we are interested in.

In [9]:
# Initialise metric
metric = load_metric("seqeval")

def compute_metrics(p):
    '''Helper function for computing metrics'''
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(precision=results["overall_precision"],
                recall=results["overall_recall"],
                f1=results["overall_f1"],
                accuracy=results["overall_accuracy"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2482.0, style=ProgressStyle(description…




The following script then tokenises the dataset using the specified tokeniser, and starts finetuning on the DaNE dataset.

In [10]:
def finetune(model_name: str, 
             epochs: int = 10, 
             lr: float = 5e-5, 
             batch_size: int = 32,
             save: bool = True):
    '''Finetune a transformer model for NER on the DaNE dataset'''

    # Fetch the model and tokenizer
    model_dict = prepare_model(model_name)
    
    # Tokenize the datasets
    tokenized_train = tokenize_dataset(train_dataset, model_dict['tokenizer'])
    tokenized_val = tokenize_dataset(val_dataset, model_dict['tokenizer'])
    tokenized_test = tokenize_dataset(test_dataset, model_dict['tokenizer'])
    
    # Initialise the data collator
    data_collator = DataCollatorForTokenClassification(model_dict['tokenizer'])
    
    # Initialise training arguments
    training_args = TrainingArguments(output_dir=f'../models/{model_dict["name"]}-ner-dane',
                                      evaluation_strategy='epoch',
                                      logging_strategy='epoch',
                                      save_strategy='epoch' if save else 'no',
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      gradient_accumulation_steps=1,
                                      learning_rate=lr,
                                      num_train_epochs=epochs,
                                      warmup_steps=50,
                                      report_to='all',
                                      load_best_model_at_end=True)
    
    # Initialise Trainer
    trainer = Trainer(model=model_dict['model'],
                      args=training_args,
                      train_dataset=tokenized_train,
                      eval_dataset=tokenized_val,
                      tokenizer=model_dict['tokenizer'],
                      data_collator=data_collator,
                      compute_metrics=compute_metrics)
    
    # Finetune the model
    train_result = trainer.train()
    
    # Log training metrics and save the state
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    
    # Log validation metrics
    metrics = trainer.evaluate()
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    
    # Log test metrics
    predictions, labels, metrics = trainer.predict(test_dataset, metric_key_prefix="predict")
    predictions = np.argmax(predictions, axis=-1)
    trainer.log_metrics("test", metrics)
    trainer.save_metrics("test", metrics)

In [15]:
model_names = dict(danish='flax-community/roberta-base-danish',
                   scandi='flax-community/roberta-large-scandi',#'Maltehb/roberta-base-scandinavian',
                   multi='xlm-roberta-base',
                   multilarge='xlm-roberta-large',
                   botxo='Maltehb/danish-bert-botxo')

In [13]:
finetune(model_names['multilarge'], epochs=25, lr=5e-5, batch_size=32, save=False)

404 Client Error: Not Found for url: https://huggingface.co/flax-community/roberta-large-scandi/resolve/main/pytorch_model.bin


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498796983.0, style=ProgressStyle(descri…




  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
Some weights of the Flax model were not used when initializing the PyTorch model RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from a Flax model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from a Flax model that you expect to be exactly identical (e.g. initializing a BertForSequenceClassification model from a FlaxBertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the Flax model and are newly initialized: ['classifier.weight', 'classifier.bias', 'roberta.embeddings.position_ids']
You should probably TRAIN this model on a

 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #0', max=2.0, style=Progress…

  

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #1', max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #2', max=2.0, style=Progress…

 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #3', max=2.0, style=Progress…





 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #0', max=1.0, style=Progress…


  

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #1', max=1.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #2', max=1.0, style=Progress…

 


HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #3', max=1.0, style=Progress…



 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #0', max=1.0, style=Progress…

 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #1', max=1.0, style=Progress…


 
 

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #2', max=1.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset #3', max=1.0, style=Progress…





The following columns in the training set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: docs, orig_labels.
***** Running training *****
  Num examples = 4383
  Num Epochs = 25
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3425


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4765,0.181083,0.408898,0.402083,0.405462,0.956252
2,0.1384,0.076793,0.68972,0.76875,0.727094,0.979965
3,0.0759,0.061824,0.748062,0.804167,0.7751,0.983062
4,0.0502,0.058904,0.773694,0.833333,0.802407,0.985192
5,0.0345,0.05544,0.814961,0.8625,0.838057,0.986837
6,0.0252,0.056832,0.803502,0.860417,0.830986,0.986643
7,0.0183,0.058509,0.804,0.8375,0.820408,0.986643


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: docs, orig_labels.
***** Running Evaluation *****
  Num examples = 564
  Batch size = 32
Saving model checkpoint to ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-137
Configuration saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-137/config.json
Model weights saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-137/pytorch_model.bin
tokenizer config file saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-137/tokenizer_config.json
Special tokens file saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-137/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: docs, orig_labels.
***** Running Evaluation *****
  Num ex

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4765,0.181083,0.408898,0.402083,0.405462,0.956252
2,0.1384,0.076793,0.68972,0.76875,0.727094,0.979965
3,0.0759,0.061824,0.748062,0.804167,0.7751,0.983062
4,0.0502,0.058904,0.773694,0.833333,0.802407,0.985192
5,0.0345,0.05544,0.814961,0.8625,0.838057,0.986837
6,0.0252,0.056832,0.803502,0.860417,0.830986,0.986643
7,0.0183,0.058509,0.804,0.8375,0.820408,0.986643
8,0.0133,0.063613,0.832347,0.879167,0.855117,0.988289
9,0.0112,0.065774,0.818182,0.88125,0.848546,0.987224
10,0.0085,0.062434,0.853119,0.883333,0.867963,0.989063


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: docs, orig_labels.
***** Running Evaluation *****
  Num examples = 564
  Batch size = 32
Saving model checkpoint to ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-1096
Configuration saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-1096/config.json
Model weights saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-1096/pytorch_model.bin
tokenizer config file saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-1096/tokenizer_config.json
Special tokens file saved in ../models/flax-community/roberta-large-scandi-ner-dane/checkpoint-1096/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: docs, orig_labels.
***** Running Evaluation *****
  N

RuntimeError: ignored