Update README.md
Browse files
README.md
CHANGED
@@ -16,41 +16,18 @@ Same model as [flair/ner-dutch-large](https://huggingface.co/flair/ner-dutch-lar
|
|
16 |
|
17 |
|
18 |
```python
|
19 |
-
import typing
|
20 |
-
from flair.models.sequence_tagger_model import get_spans_from_bio # Optional; see the code block below
|
21 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
22 |
|
23 |
tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
|
24 |
ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
|
25 |
|
26 |
-
|
27 |
-
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
indices_labels = outputs.logits.argmax(dim=-1)[0].tolist()
|
33 |
-
labels = [xlm_token_classif.config.id2label[idx] for idx in indices_labels]
|
34 |
-
probabilities = [outputs.logits.softmax(dim=-1)[0].tolist()[idx][indices_labels[idx]] for idx in range(len(indices_labels))]
|
35 |
-
|
36 |
-
spans = get_spans_from_bio(labels, probabilities)
|
37 |
-
|
38 |
-
input_ids_sent = batch["input_ids"][0]
|
39 |
-
for span in spans:
|
40 |
-
entry = {
|
41 |
-
"token_start_pos": span[0][0],
|
42 |
-
"token_end_pos": span[0][-1],
|
43 |
-
"start_pos": batch.token_to_chars(span[0][0]).start,
|
44 |
-
"end_pos": batch.token_to_chars(span[0][-1]).end,
|
45 |
-
"text": tokenizer.decode(input_ids_sent[span[0][0]:span[0][-1]+1]),
|
46 |
-
"score": span[1],
|
47 |
-
"label": span[2],
|
48 |
-
}
|
49 |
-
print(entry)
|
50 |
|
51 |
# {
|
52 |
-
# "token_start_pos": 1,
|
53 |
-
# "token_end_pos": 2,
|
54 |
# "start_pos": 0,
|
55 |
# "end_pos": 17,
|
56 |
# "text": "George Washington",
|
@@ -58,76 +35,10 @@ for span in spans:
|
|
58 |
# "label": "PER"
|
59 |
# }
|
60 |
# {
|
61 |
-
# "token_start_pos": 5,
|
62 |
-
# "token_end_pos": 5,
|
63 |
# "start_pos": 28,
|
64 |
# "end_pos": 38,
|
65 |
# "text": "Washington",
|
66 |
# "score": 0.9999939203262329,
|
67 |
# "label": "LOC"
|
68 |
# }
|
69 |
-
|
70 |
-
|
71 |
-
```
|
72 |
-
|
73 |
-
|
74 |
-
If you don't want to install flair, the function imported above is defined as follows:
|
75 |
-
```python
|
76 |
-
def get_spans_from_bio(bioes_tags: List[str], bioes_scores=None) -> List[typing.Tuple[List[int], float, str]]:
|
77 |
-
# add a dummy "O" to close final prediction
|
78 |
-
bioes_tags.append("O")
|
79 |
-
# return complex list
|
80 |
-
found_spans = []
|
81 |
-
# internal variables
|
82 |
-
current_tag_weights: Dict[str, float] = {}
|
83 |
-
previous_tag = "O-"
|
84 |
-
current_span: List[int] = []
|
85 |
-
current_span_scores: List[float] = []
|
86 |
-
for idx, bioes_tag in enumerate(bioes_tags):
|
87 |
-
# non-set tags are OUT tags
|
88 |
-
if bioes_tag == "" or bioes_tag == "O" or bioes_tag == "_":
|
89 |
-
bioes_tag = "O-"
|
90 |
-
|
91 |
-
# anything that is not OUT is IN
|
92 |
-
in_span = bioes_tag != "O-"
|
93 |
-
|
94 |
-
# does this prediction start a new span?
|
95 |
-
starts_new_span = False
|
96 |
-
|
97 |
-
if bioes_tag[:2] in {"B-", "S-"} or (
|
98 |
-
in_span and previous_tag[2:] != bioes_tag[2:] and (bioes_tag[:2] == "I-" or previous_tag[2:] == "S-")
|
99 |
-
):
|
100 |
-
# B- and S- always start new spans
|
101 |
-
# if the predicted class changes, I- starts a new span
|
102 |
-
# if the predicted class changes and S- was previous tag, start a new span
|
103 |
-
starts_new_span = True
|
104 |
-
|
105 |
-
# if an existing span is ended (either by reaching O or starting a new span)
|
106 |
-
if (starts_new_span or not in_span) and len(current_span) > 0:
|
107 |
-
# determine score and value
|
108 |
-
span_score = sum(current_span_scores) / len(current_span_scores)
|
109 |
-
span_value = max(current_tag_weights.keys(), key=current_tag_weights.__getitem__)
|
110 |
-
|
111 |
-
# append to result list
|
112 |
-
found_spans.append((current_span, span_score, span_value))
|
113 |
-
|
114 |
-
# reset for-loop variables for new span
|
115 |
-
current_span = []
|
116 |
-
current_span_scores = []
|
117 |
-
current_tag_weights = {}
|
118 |
-
|
119 |
-
if in_span:
|
120 |
-
current_span.append(idx)
|
121 |
-
current_span_scores.append(bioes_scores[idx] if bioes_scores else 1.0)
|
122 |
-
weight = 1.1 if starts_new_span else 1.0
|
123 |
-
current_tag_weights[bioes_tag[2:]] = current_tag_weights.setdefault(bioes_tag[2:], 0.0) + weight
|
124 |
-
|
125 |
-
# remember previous tag
|
126 |
-
previous_tag = bioes_tag
|
127 |
-
|
128 |
-
return found_spans
|
129 |
-
|
130 |
```
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
16 |
|
17 |
|
18 |
```python
|
|
|
|
|
19 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
20 |
|
21 |
tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
|
22 |
ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
|
23 |
|
24 |
+
nlp = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
|
25 |
+
example = "George Washington ging naar Washington"
|
26 |
|
27 |
+
ner_results = nlp(example)
|
28 |
+
print(ner_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# {
|
|
|
|
|
31 |
# "start_pos": 0,
|
32 |
# "end_pos": 17,
|
33 |
# "text": "George Washington",
|
|
|
35 |
# "label": "PER"
|
36 |
# }
|
37 |
# {
|
|
|
|
|
38 |
# "start_pos": 28,
|
39 |
# "end_pos": 38,
|
40 |
# "text": "Washington",
|
41 |
# "score": 0.9999939203262329,
|
42 |
# "label": "LOC"
|
43 |
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
```
|
|
|
|
|
|