---
inference: false
license: cc-by-4.0
datasets:
- wikiann
language:
- bg
metrics:
- accuracy
---
# 🇧🇬 BERT - Bulgarian Named Entity Recognition
This is [rmihaylov/bert-base-bg](https://huggingface.co/rmihaylov/bert-base-bg) fine-tuned for named entity recognition on the Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).
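If you want to look at the training data, the Bulgarian configuration of wikiann can be loaded with the `datasets` library (a minimal sketch; the exact split used for fine-tuning is not documented here):
```python
from datasets import load_dataset

# Load the Bulgarian configuration of WikiANN.
wikiann_bg = load_dataset("wikiann", "bg")
print(wikiann_bg["train"][0])  # tokens and NER tags of one example
```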
## Usage
Import the libraries:
```python
from typing import List, Dict
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
```
First, define these helper functions, since the model uses a subword tokenizer and its token-level predictions have to be mapped back to whole words:
```python
def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }) -> List[Dict[str, str]]:
    # Tokenize the text and rebuild whole words from the subword tokens.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    # Run the model and take the most probable tag index for each token.
    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)
    out = model(input_ids, attention_mask=attention_mask).logits
    out = out.argmax(-1).squeeze(0).tolist()

    # Map label indices to tag names and group the words into entities.
    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]
    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    # Merge SentencePiece subwords back into whole words;
    # "▁" marks the beginning of a new word.
    out_tokens = []
    curr_token = ""
    for token in tokens:
        if token == "[SEP]":
            # Flush the last word and stop at the end of the sequence.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            out_tokens.append("[SEP]")
            break
        if "▁" in token and curr_token == "":
            curr_token += token
        elif "▁" in token and curr_token != "":
            # A new word starts - flush the word collected so far.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            curr_token = ""
            curr_token += token
        elif "▁" not in token:
            curr_token += token
    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    # Group consecutive B-/I- tagged words into single entities,
    # skipping the leading [CLS] token.
    result = []
    curr_word = []
    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        if "B-" in entity:
            if curr_word:
                # Close the previous entity before starting a new one.
                curr_word = " ".join(curr_word)
                result.append({
                    "word": curr_word,
                    "entity_group": entities[i][2:]
                })
                curr_word = [word]
            else:
                curr_word.append(word)
        if "I-" in entity:
            curr_word.append(word)
        if "O" == entity:
            if curr_word:
                curr_word = " ".join(curr_word)
                result.append({
                    "word": curr_word,
                    "entity_group": entities[i][2:]
                })
            curr_word = []
    return result
```
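To see what `subwords_to_words` does on its own, here is a small illustrative call (the subword pieces below are made up for the example; the model's tokenizer may split the text differently):
```python
# Hypothetical subword tokens - just for illustration.
tokens = ["[CLS]", "▁Барух", "▁Спин", "оза", "[SEP]"]
print(subwords_to_words(tokens))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```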
Then, you should initialize the `AutoTokenizer` and `AutoModelForTokenClassification` objects:
```python
MODEL_ID = "auhide/bert-bg-ner"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
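Optionally, you can inspect the model's own label mapping and switch it to evaluation mode before running inference (a small sketch; if the config only stores generic names like `LABEL_0`, fall back to the `labels_tags` dictionary used in `predict()`):
```python
# id2label may or may not contain the human-readable tags.
print(model.config.id2label)

model.eval()  # disable dropout for deterministic predictions
```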
Finally, call the `predict()` function defined above like this:
```python
text = "Барух Спиноза е роден в Амстердам"
print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
```sh
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]
```
Note: The model predicts three entity types - `PER` (person), `ORG` (organization) and `LOC` (location).
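If you prefer not to maintain the helper functions, the standard `transformers` pipeline can also be used (a sketch; how entities are grouped depends on the label names stored in the model's config, so the custom `predict()` above may produce cleaner groups):
```python
ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # merge subword tokens into word-level entities
)
print(ner("Барух Спиноза е роден в Амстердам"))
```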