🇧🇬 BERT - Bulgarian Named Entity Recognition

This model is rmihaylov/bert-base-bg fine-tuned on a Bulgarian subset of wikiann.

Usage

Import the libraries:

from typing import List, Dict

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

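First, define the following helper functions. They are needed because the model uses a subword tokenizer, so the sub-tokens have to be merged back into whole words before the entities are assembled: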

def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }) -> List[Dict[str, str]]:
    """Run named-entity recognition on a single piece of text.

    Args:
        text: The raw input string.
        model: A token-classification model whose output exposes ``.logits``.
        tokenizer: The tokenizer matching ``model``.
        labels_tags: Mapping from predicted class id to its BIO tag. The
            default dict is only ever read, never mutated.

    Returns:
        A list of ``{"word": ..., "entity_group": ...}`` dicts, one per
        detected entity, as built by ``merge_words_and_predictions``.
    """
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    # Add the batch dimension the model expects.
    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)

    # Pure inference: no need to record the autograd graph.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    label_ids = logits.argmax(-1).squeeze(0).tolist()

    # Ids missing from the mapping are treated as "outside any entity".
    # Passing the raw int through would break the string checks performed
    # when the tags are merged into entities.
    prediction = [labels_tags.get(idx, "O") for idx in label_ids]

    # NOTE(review): `prediction` presumably holds one tag per sub-token while
    # `words` holds one entry per merged word; the two are paired by position
    # downstream, so tags may drift when a word is split into several
    # sub-tokens - confirm against the model's labelling scheme.
    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    """Merge sub-word tokens back into whole words.

    A token containing the "▁" marker starts a new word; tokens without it
    are glued onto the word currently being built. The marker itself is
    stripped from the emitted words. Processing stops at the first "[SEP]"
    token, which is emitted as its own entry; anything after it is ignored.

    Args:
        tokens: Token strings as returned by the tokenizer.

    Returns:
        The list of reconstructed words. If no "[SEP]" is present, the word
        still being built when the input ends is emitted as well instead of
        being dropped.
    """
    out_tokens = []
    curr_token = ""

    for token in tokens:
        if token == "[SEP]":
            # End of the sequence: emit the pending word, then the separator.
            out_tokens.append(curr_token.replace("▁", ""))
            out_tokens.append("[SEP]")
            return out_tokens

        # A marker on a non-empty buffer means the previous word is complete.
        if "▁" in token and curr_token:
            out_tokens.append(curr_token.replace("▁", ""))
            curr_token = ""

        curr_token += token

    # No "[SEP]" was seen: do not lose the word that was still being built.
    if curr_token:
        out_tokens.append(curr_token.replace("▁", ""))

    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    """Group consecutive tagged words into named entities.

    The first element of both lists is skipped (it corresponds to the
    leading special token). A "B-" tag starts a new entity, an "I-" tag
    extends the current one, and "O" closes it. The "[SEP]" word ends the
    sequence regardless of the tag predicted for it.

    Args:
        words: Words of the sentence, including the leading special token.
        entities: BIO tags paired with ``words`` by position.

    Returns:
        One ``{"word": ..., "entity_group": ...}`` dict per entity, where
        "word" is the space-joined words of the entity and "entity_group"
        is the tag type (the tag without its "B-"/"I-" prefix) of the last
        word in the entity.
    """
    result = []
    span_words = []
    span_group = ""

    for word, entity in zip(words[1:], entities[1:]):
        if word == "[SEP]":
            # The tag predicted for the separator carries no meaning; stop
            # here so it can neither join nor swallow the open entity.
            break

        starts_span = entity.startswith("B-")
        continues_span = entity.startswith("I-")

        # Both a new "B-" tag and an "O" tag close the entity in progress.
        if span_words and (starts_span or entity == "O"):
            result.append({
                "word": " ".join(span_words),
                "entity_group": span_group
            })
            span_words = []

        if starts_span or continues_span:
            span_words.append(word)
            span_group = entity[2:]

    # Emit an entity that runs up to the end of the sequence.
    if span_words:
        result.append({
            "word": " ".join(span_words),
            "entity_group": span_group
        })

    return result

Then, you should initialize the AutoTokenizer and AutoModelForTokenClassification objects:

# Identifier used to load both the tokenizer and the model below.
MODEL_ID = "auhide/bert-bg-ner"

# Load the tokenizer and the token-classification model for MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

Finally, call the predict() function defined above like this:

# Example sentence to run through the model.
text = "Барух Спиноза е роден в Амстердам"
print(f"Input: {text}")
# predict() returns a list of {"word", "entity_group"} dicts, one per entity.
print("NERs:", predict(text, model=model, tokenizer=tokenizer))

Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]

Note: The model recognizes three entity types: PER (person), ORG (organization), and LOC (location).

auhide
/

bert-bg-ner

🇧🇬 BERT - Bulgarian Named Entity Recognition

Usage

Dataset used to train auhide/bert-bg-ner