|
from typing import Any, Dict, List, Optional |
|
|
|
from .operator import StreamInstanceOperator |
|
|
|
|
|
class IobExtractor(StreamInstanceOperator):
    """Extract entity spans from a token sequence tagged with IOB labels.

    The operator walks the tokens of an instance together with their
    Inside-Outside-Beginning (IOB) tags, locates every token inside the
    instance's raw text, and turns each run of "begin" + "inside" tags into a
    character-level span.

    Attributes:
        labels (List[str]): Entity type names, e.g.
            ["Person", "Organization", "Location"].
        begin_labels (List[str]): Tags that open an entity, e.g.
            ["B-PER", "B-ORG", "B-LOC"]. Must be index-aligned with ``labels``:
            the entity type of ``begin_labels[i]`` is ``labels[i]``.
        inside_labels (List[str]): Tags that continue an entity, e.g.
            ["I-PER", "I-ORG", "I-LOC"].
        outside_label (str): Tag of tokens outside any entity, typically "O".

    Each produced span is a dict with the keys ``start`` and ``end``
    (character offsets into the text, end exclusive), ``label`` (the entity
    type) and ``text`` (the covered substring).

    Example of instantiation and usage:

    ```python
    operator = IobExtractor(
        labels=["Person", "Organization", "Location"],
        begin_labels=["B-PER", "B-ORG", "B-LOC"],
        inside_labels=["I-PER", "I-ORG", "I-LOC"],
        outside_label="O",
    )

    instance = {
        "text": "John Doe works at OpenAI",
        "tokens": ["John", "Doe", "works", "at", "OpenAI"],
        "labels": ["B-PER", "I-PER", "O", "O", "B-ORG"],
    }
    processed_instance = operator.process(instance)
    print(processed_instance["spans"])
    # Output:
    # [{'start': 0, 'label': 'Person', 'end': 8, 'text': 'John Doe'},
    #  {'start': 18, 'label': 'Organization', 'end': 24, 'text': 'OpenAI'}]
    ```

    For more details on the IOB tagging convention, see:
    https://en.wikipedia.org/wiki/Inside-outside-beginning_(tagging)
    """

    labels: List[str]
    begin_labels: List[str]
    inside_labels: List[str]
    outside_label: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Annotate ``instance`` with the entity spans encoded by its IOB tags.

        Args:
            instance: Must contain ``"labels"`` (one IOB tag per token),
                ``"tokens"`` (the token strings, in text order) and ``"text"``
                (the raw string the tokens occur in). If the two sequences
                differ in length, the extra items of the longer one are
                ignored.
            stream_name: Not used by this operator.

        Returns:
            The same ``instance`` object, with a new ``"spans"`` key holding
            the list of extracted span dicts.

        Raises:
            ValueError: If a token cannot be found in the text at or after the
                end of the previous token.
        """
        tags = instance["labels"]
        tokens = instance["tokens"]
        text = instance["text"]

        spans: List[Dict[str, Any]] = []
        # The span that may still be extended by an inside tag; None once an
        # outside (or unknown) tag has closed it.
        open_span: Optional[Dict[str, Any]] = None
        # Tokens are searched left to right so repeated words map to the
        # correct occurrence.
        search_from = 0

        for tag, token in zip(tags, tokens):
            token_start = text.find(token, search_from)
            if token_start == -1:
                raise ValueError(
                    f"Token '{token}' not found in text '{text}' starting from position {search_from}"
                )
            token_end = token_start + len(token)

            if tag in self.begin_labels:
                open_span = {
                    "start": token_start,
                    "label": self.labels[self.begin_labels.index(tag)],
                    "end": token_end,
                }
                spans.append(open_span)
            elif tag in self.inside_labels:
                # An inside tag with no open entity is malformed input; it is
                # skipped rather than stretched onto an already closed span.
                if open_span is not None:
                    open_span["end"] = token_end
            else:
                open_span = None

            search_from = token_end

        for span in spans:
            span["text"] = text[span["start"] : span["end"]]

        instance["spans"] = spans
        return instance
|
|