EvanD
/

dutch-ner-xlm-conll2003

@@ -16,41 +16,18 @@ Same model as [flair/ner-dutch-large](https://huggingface.co/flair/ner-dutch-lar
 ```python
-import typing
-from flair.models.sequence_tagger_model import get_spans_from_bio # Optional see below code block
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
 ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
-batch = tokenizer.encode_plus("George Washington ging naar Washington", return_tensors="pt")
-batch = batch.to("cuda")
-with torch.inference_mode():
-    outputs = ner_dutch(**batch)
-indices_labels = outputs.logits.argmax(dim=-1)[0].tolist()
-labels = [xlm_token_classif.config.id2label[idx] for idx in indices_labels]
-probabilities = [outputs.logits.softmax(dim=-1)[0].tolist()[idx][indices_labels[idx]] for idx in range(len(indices_labels))]
-spans = get_spans_from_bio(labels, probabilities)
-input_ids_sent = batch["input_ids"][0]
-for span in spans:
-    entry = {
-        "token_start_pos": span[0][0],
-        "token_end_pos": span[0][-1],
-        "start_pos": batch.token_to_chars(span[0][0]).start,
-        "end_pos": batch.token_to_chars(span[0][-1]).end,
-        "text": tokenizer.decode(input_ids_sent[span[0][0]:span[0][-1]+1]),
-        "score": span[1],
-        "label": span[2],
-    }
-    print(entry)
 # {
-#     "token_start_pos": 1,
-#     "token_end_pos": 2,
 #     "start_pos": 0,
 #     "end_pos": 17,
 #     "text": "George Washington",
@@ -58,76 +35,10 @@ for span in spans:
 #     "label": "PER"
 # }
 # {
-#     "token_start_pos": 5,
-#     "token_end_pos": 5,
 #     "start_pos": 28,
 #     "end_pos": 38,
 #     "text": "Washington",
 #     "score": 0.9999939203262329,
 #     "label": "LOC"
 # }
-```
-If you don't want to install flair, above function is the following:
-```python
-def get_spans_from_bio(bioes_tags: List[str], bioes_scores=None) -> List[typing.Tuple[List[int], float, str]]:
-    # add a dummy "O" to close final prediction
-    bioes_tags.append("O")
-    # return complex list
-    found_spans = []
-    # internal variables
-    current_tag_weights: Dict[str, float] = {}
-    previous_tag = "O-"
-    current_span: List[int] = []
-    current_span_scores: List[float] = []
-    for idx, bioes_tag in enumerate(bioes_tags):
-        # non-set tags are OUT tags
-        if bioes_tag == "" or bioes_tag == "O" or bioes_tag == "_":
-            bioes_tag = "O-"
-        # anything that is not OUT is IN
-        in_span = bioes_tag != "O-"
-        # does this prediction start a new span?
-        starts_new_span = False
-        if bioes_tag[:2] in {"B-", "S-"} or (
-            in_span and previous_tag[2:] != bioes_tag[2:] and (bioes_tag[:2] == "I-" or previous_tag[2:] == "S-")
-        ):
-            # B- and S- always start new spans
-            # if the predicted class changes, I- starts a new span
-            # if the predicted class changes and S- was previous tag, start a new span
-            starts_new_span = True
-        # if an existing span is ended (either by reaching O or starting a new span)
-        if (starts_new_span or not in_span) and len(current_span) > 0:
-            # determine score and value
-            span_score = sum(current_span_scores) / len(current_span_scores)
-            span_value = max(current_tag_weights.keys(), key=current_tag_weights.__getitem__)
-            # append to result list
-            found_spans.append((current_span, span_score, span_value))
-            # reset for-loop variables for new span
-            current_span = []
-            current_span_scores = []
-            current_tag_weights = {}
-        if in_span:
-            current_span.append(idx)
-            current_span_scores.append(bioes_scores[idx] if bioes_scores else 1.0)
-            weight = 1.1 if starts_new_span else 1.0
-            current_tag_weights[bioes_tag[2:]] = current_tag_weights.setdefault(bioes_tag[2:], 0.0) + weight
-        # remember previous tag
-        previous_tag = bioes_tag
-    return found_spans
 ```

 ```python
 from transformers import AutoTokenizer, AutoModelForTokenClassification
+# `pipeline` is used below, so it has to be imported explicitly.
+from transformers import pipeline
 tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
 ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
+# aggregation_strategy="simple" merges the tokens of one entity into a single
+# span; it replaces the deprecated grouped_entities=True flag.
+nlp = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
+example = "George Washington ging naar Washington"
+ner_results = nlp(example)
+print(ner_results)
 # {
 #     "start_pos": 0,
 #     "end_pos": 17,
 #     "text": "George Washington",
 #     "label": "PER"
 # }
 # {
 #     "start_pos": 28,
 #     "end_pos": 38,
 #     "text": "Washington",
 #     "score": 0.9999939203262329,
 #     "label": "LOC"
 # }
 ```