Elron committed on
Commit
a2d4f85
1 Parent(s): 63ac409

Upload span_lableing_operators.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. span_lableing_operators.py +80 -0
span_lableing_operators.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from .operator import StreamInstanceOperator
4
+
5
+
6
class IobExtractor(StreamInstanceOperator):
    """Extract labeled character spans from tokens tagged with the Inside-Outside-Beginning (IOB) convention.

    For every instance, the operator walks the ``tokens`` and ``labels`` lists in
    lockstep, locates each token inside the instance ``text`` and merges a begin
    label plus the inside labels that directly follow it into a single span.

    Attributes:
        labels (List[str]): Entity type names, e.g., ["Person", "Organization", "Location"].
        begin_labels (List[str]): Tags that open an entity, e.g., ["B-PER", "B-ORG", "B-LOC"].
            Must be index-aligned with ``labels``: the tag at position ``i`` opens an
            entity of type ``labels[i]``.
        inside_labels (List[str]): Tags that continue an entity, e.g., ["I-PER", "I-ORG", "I-LOC"].
        outside_label (str): The tag for tokens outside of any entity, typically "O".
            It is not consulted by ``process``: any tag that is neither a begin nor
            an inside tag closes the current entity.

    Each produced span is a dict with the keys ``start`` and ``end`` (character
    offsets into the text, ``end`` exclusive), ``label`` (the entity type) and
    ``text`` (the covered substring).

    Example of instantiation and usage:
    ```python
    operator = IobExtractor(
        labels=["Person", "Organization", "Location"],
        begin_labels=["B-PER", "B-ORG", "B-LOC"],
        inside_labels=["I-PER", "I-ORG", "I-LOC"],
        outside_label="O",
    )

    instance = {
        "text": "John Doe works at OpenAI",
        "tokens": ["John", "Doe", "works", "at", "OpenAI"],
        "labels": ["B-PER", "I-PER", "O", "O", "B-ORG"],
    }
    processed_instance = operator.process(instance)
    print(processed_instance["spans"])
    # Output: [{'start': 0, 'label': 'Person', 'end': 8, 'text': 'John Doe'},
    #          {'start': 18, 'label': 'Organization', 'end': 24, 'text': 'OpenAI'}]
    ```

    For more details on the IOB tagging convention, see: https://en.wikipedia.org/wiki/Inside-outside-beginning_(tagging)

    """

    labels: List[str]
    begin_labels: List[str]
    inside_labels: List[str]
    outside_label: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Add a ``spans`` list to the instance, built from its IOB-tagged tokens.

        Args:
            instance: Must contain ``labels`` (one tag per token), ``tokens``
                (in the order they appear in the text) and ``text`` (the string
                the tokens were taken from).
            stream_name: Unused by this operator.

        Returns:
            The same instance object, with ``instance["spans"]`` set.

        Raises:
            ValueError: If a token cannot be found in the text at or after the
                end of the previous token.
        """
        labels = instance["labels"]
        tokens = instance["tokens"]
        text = instance["text"]

        spans = []
        # The entity currently being extended, or None when the previous token
        # was not part of an entity. Tracking this prevents an inside tag that
        # follows a gap from stretching an already finished span over the gap.
        open_span = None
        current_pos = 0

        for label, token in zip(labels, tokens):
            # Search forward only, so repeated tokens map to successive occurrences.
            token_pos = text.find(token, current_pos)
            if token_pos == -1:
                raise ValueError(
                    f"Token '{token}' not found in text '{text}' starting from position {current_pos}"
                )

            end_pos = token_pos + len(token)

            if label in self.begin_labels:
                open_span = {
                    "start": token_pos,
                    "label": self.labels[self.begin_labels.index(label)],
                    "end": end_pos,
                }
                spans.append(open_span)
            elif label in self.inside_labels:
                # An inside tag with no open entity is ignored.
                if open_span is not None:
                    open_span["end"] = end_pos
            else:
                # Outside (or unrecognized) tag: the current entity, if any, ends here.
                open_span = None

            current_pos = end_pos

        # Slice the text only once the final boundaries of every span are known.
        for span in spans:
            span["text"] = text[span["start"] : span["end"]]

        instance["spans"] = spans
        return instance