EvanD committed on
Commit
2ddc5f9
·
1 Parent(s): 3f41611

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +4 -93
README.md CHANGED
@@ -16,41 +16,18 @@ Same model as [flair/ner-dutch-large](https://huggingface.co/flair/ner-dutch-lar
16
 
17
 
18
  ```python
19
- import typing
20
- from flair.models.sequence_tagger_model import get_spans_from_bio # Optional see below code block
21
  from transformers import AutoTokenizer, AutoModelForTokenClassification
22
 
23
  tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
24
  ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
25
 
26
- batch = tokenizer.encode_plus("George Washington ging naar Washington", return_tensors="pt")
27
- batch = batch.to("cuda")
28
 
29
- with torch.inference_mode():
30
- outputs = ner_dutch(**batch)
31
-
32
- indices_labels = outputs.logits.argmax(dim=-1)[0].tolist()
33
- labels = [xlm_token_classif.config.id2label[idx] for idx in indices_labels]
34
- probabilities = [outputs.logits.softmax(dim=-1)[0].tolist()[idx][indices_labels[idx]] for idx in range(len(indices_labels))]
35
-
36
- spans = get_spans_from_bio(labels, probabilities)
37
-
38
- input_ids_sent = batch["input_ids"][0]
39
- for span in spans:
40
- entry = {
41
- "token_start_pos": span[0][0],
42
- "token_end_pos": span[0][-1],
43
- "start_pos": batch.token_to_chars(span[0][0]).start,
44
- "end_pos": batch.token_to_chars(span[0][-1]).end,
45
- "text": tokenizer.decode(input_ids_sent[span[0][0]:span[0][-1]+1]),
46
- "score": span[1],
47
- "label": span[2],
48
- }
49
- print(entry)
50
 
51
  # {
52
- # "token_start_pos": 1,
53
- # "token_end_pos": 2,
54
  # "start_pos": 0,
55
  # "end_pos": 17,
56
  # "text": "George Washington",
@@ -58,76 +35,10 @@ for span in spans:
58
  # "label": "PER"
59
  # }
60
  # {
61
- # "token_start_pos": 5,
62
- # "token_end_pos": 5,
63
  # "start_pos": 28,
64
  # "end_pos": 38,
65
  # "text": "Washington",
66
  # "score": 0.9999939203262329,
67
  # "label": "LOC"
68
  # }
69
-
70
-
71
- ```
72
-
73
-
74
- If you don't want to install flair, above function is the following:
75
- ```python
76
- def get_spans_from_bio(bioes_tags: List[str], bioes_scores=None) -> List[typing.Tuple[List[int], float, str]]:
77
- # add a dummy "O" to close final prediction
78
- bioes_tags.append("O")
79
- # return complex list
80
- found_spans = []
81
- # internal variables
82
- current_tag_weights: Dict[str, float] = {}
83
- previous_tag = "O-"
84
- current_span: List[int] = []
85
- current_span_scores: List[float] = []
86
- for idx, bioes_tag in enumerate(bioes_tags):
87
- # non-set tags are OUT tags
88
- if bioes_tag == "" or bioes_tag == "O" or bioes_tag == "_":
89
- bioes_tag = "O-"
90
-
91
- # anything that is not OUT is IN
92
- in_span = bioes_tag != "O-"
93
-
94
- # does this prediction start a new span?
95
- starts_new_span = False
96
-
97
- if bioes_tag[:2] in {"B-", "S-"} or (
98
- in_span and previous_tag[2:] != bioes_tag[2:] and (bioes_tag[:2] == "I-" or previous_tag[2:] == "S-")
99
- ):
100
- # B- and S- always start new spans
101
- # if the predicted class changes, I- starts a new span
102
- # if the predicted class changes and S- was previous tag, start a new span
103
- starts_new_span = True
104
-
105
- # if an existing span is ended (either by reaching O or starting a new span)
106
- if (starts_new_span or not in_span) and len(current_span) > 0:
107
- # determine score and value
108
- span_score = sum(current_span_scores) / len(current_span_scores)
109
- span_value = max(current_tag_weights.keys(), key=current_tag_weights.__getitem__)
110
-
111
- # append to result list
112
- found_spans.append((current_span, span_score, span_value))
113
-
114
- # reset for-loop variables for new span
115
- current_span = []
116
- current_span_scores = []
117
- current_tag_weights = {}
118
-
119
- if in_span:
120
- current_span.append(idx)
121
- current_span_scores.append(bioes_scores[idx] if bioes_scores else 1.0)
122
- weight = 1.1 if starts_new_span else 1.0
123
- current_tag_weights[bioes_tag[2:]] = current_tag_weights.setdefault(bioes_tag[2:], 0.0) + weight
124
-
125
- # remember previous tag
126
- previous_tag = bioes_tag
127
-
128
- return found_spans
129
-
130
  ```
131
-
132
-
133
-
 
16
 
17
 
18
  ```python
 
 
19
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
20
 
21
  tokenizer = AutoTokenizer.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
22
  ner_model = AutoModelForTokenClassification.from_pretrained("EvanD/dutch-ner-xlm-conll2003")
23
 
24
  + nlp = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
25
+ example = "George Washington ging naar Washington"
26
 
27
+ ner_results = nlp(example)
28
+ print(ner_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # {
 
 
31
  # "start_pos": 0,
32
  # "end_pos": 17,
33
  # "text": "George Washington",
 
35
  # "label": "PER"
36
  # }
37
  # {
 
 
38
  # "start_pos": 28,
39
  # "end_pos": 38,
40
  # "text": "Washington",
41
  # "score": 0.9999939203262329,
42
  # "label": "LOC"
43
  # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  ```