|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
from transformers import pipeline |
|
|
|
|
|
class NamedEntityRecognition:
    """
    Named Entity Recognition on text data.

    Attributes:
        tokenizer: An instance of Hugging Face Tokenizer
        model: An instance of Hugging Face Model
        nlp: An instance of Hugging Face Named Entity Recognition pipeline
    """

    # Checkpoint loaded when the caller does not ask for a specific one.
    DEFAULT_MODEL = "xlm-roberta-large-finetuned-conll03-english"

    # Entity strings that are never highlighted; spans matching one of these
    # are emitted as plain text instead of as an entity tuple.
    _EXCLUDED = ('', '.', '. ', ' ')

    def __init__(self, model_name=DEFAULT_MODEL):
        """
        Load the tokenizer and model and build the NER pipeline.

        Parameters:
            model_name (str): Name or path of the token-classification
                checkpoint to load. Defaults to DEFAULT_MODEL.
        """
        # Keep references on the instance: the class docstring advertises
        # `tokenizer` and `model` as attributes.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        # NOTE(review): `grouped_entities` appears to be deprecated in newer
        # transformers releases in favour of `aggregation_strategy` -- confirm
        # against the pinned transformers version before changing it.
        self.nlp = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            grouped_entities=True,
        )

    def get_annotation(self, preds, text):
        """
        Get html annotation for displaying entities over text.

        The text is cut at every entity boundary. Each entity span becomes a
        ``(span_text, entity_group, "")`` tuple and everything else stays a
        plain string. Labels are taken from the prediction that produced the
        span (via its ``start``/``end`` offsets), so repeated words with
        different groups, or a reported ``word`` that differs from the
        original text, are still annotated correctly.

        Parameters:
            preds (list): List of dicts, one per entity, each with the keys
                'start', 'end' and 'entity_group' (and usually 'word').
                Assumed to be ordered by position and non-overlapping.
            text (str): The user input string to generate entity tags for

        Returns:
            final_annotation (list): List of plain strings and
                (text, label, "") tuples to pass to the text annotation html
                creator. Plain segments may be empty strings where an entity
                touches the start/end of the text or another entity.
        """
        final_annotation = []
        cursor = 0  # index of the first character not yet emitted

        for pred in preds:
            start, end = pred['start'], pred['end']

            # Plain text between the previous entity and this one.
            final_annotation.append(text[cursor:start])

            surface = text[start:end]
            if surface in self._EXCLUDED or pred.get('word') in self._EXCLUDED:
                # Whitespace / lone-period "entities" are shown unhighlighted.
                final_annotation.append(surface)
            else:
                final_annotation.append((surface, pred['entity_group'], ""))

            cursor = end

        # Whatever follows the last entity (the whole text if there are none).
        final_annotation.append(text[cursor:])

        return final_annotation

    def classify(self, text):
        """
        Recognize Named Entities in text.

        Parameters:
            text (str): The user input string to generate entity tags for

        Returns:
            preds (list): Raw output of the NER pipeline for ``text``
            ner_annotation (list): Annotation list built from ``preds`` by
                get_annotation
        """
        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation