ITACA_Insurace_NLP_v2 / named_entity_recognition.py
danielperales's picture
Duplicate from dperales/ITACA_Insurance_Core_v4
27880c1
raw
history blame
2.19 kB
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
class NamedEntityRecognition:
    """
    Named Entity Recognition on text data.

    Attributes:
        tokenizer: An instance of Hugging Face Tokenizer
        model: An instance of Hugging Face Model
        nlp: An instance of Hugging Face Named Entity Recognition pipeline
    """

    # Checkpoint used for both the tokenizer and the token-classification model.
    MODEL_NAME = "xlm-roberta-large-finetuned-conll03-english"

    # Span texts that are treated as bad predictions and never highlighted.
    _EXCLUDED_SPANS = frozenset(['', '.', '. ', ' '])

    def __init__(self):
        # Keep the tokenizer and model on the instance so the attributes
        # listed in the class docstring actually exist.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModelForTokenClassification.from_pretrained(self.MODEL_NAME)
        # NOTE(review): newer transformers releases appear to prefer
        # `aggregation_strategy` over `grouped_entities` -- confirm against the
        # installed version before changing this argument.
        self.nlp = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            grouped_entities=True,
        )

    def get_annotation(self, preds, text):
        """
        Get html annotation for displaying entities over text.

        The text is cut at every entity ``start``/``end`` offset. Each entity
        span is labelled with the group of the prediction that produced it
        (not by looking the span text up by word), so repeated words with
        different groups keep their own label and spans whose ``word`` differs
        from the raw text slice are still highlighted.

        Parameters:
            preds (list): List of entity dicts; each one must provide the keys
                'start', 'end', 'word' and 'entity_group'. Entities are assumed
                to be ordered by position and non-overlapping.
            text (str): The user input string to generate entity tags for

        Returns:
            final_annotation (list): Text segments in reading order. Plain text
                is a str, a recognised entity is a ``(span, entity_group, "")``
                tuple. Segments may be empty strings (e.g. when an entity
                starts at offset 0). This list is meant to be passed to the
                text annotation html creator.
        """
        final_annotation = []
        cursor = 0
        for pred in preds:
            start, end = pred['start'], pred['end']
            # Plain text between the previous entity and this one.
            final_annotation.append(text[cursor:start])
            span = text[start:end]
            # Exclude bad preds: emit them as plain text instead of a tag.
            if span in self._EXCLUDED_SPANS or pred['word'] in self._EXCLUDED_SPANS:
                final_annotation.append(span)
            else:
                final_annotation.append((span, pred['entity_group'], ""))
            cursor = end
        # Trailing text after the last entity (the whole text if no entities).
        final_annotation.append(text[cursor:])
        return final_annotation

    def classify(self, text):
        """
        Recognize Named Entities in text.

        Parameters:
            text (str): The user input string to generate entity tags for

        Returns:
            predictions (list): The raw output of the NER pipeline for `text`
            ner_annotation (list): The annotation built by `get_annotation`
                from those predictions
        """
        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation