ITACA_Insurace_NLP_v2 / keyword_extraction.py
danielperales's picture
Duplicate from dperales/ITACA_Insurance_Core_v4
27880c1
import nltk
import pytextrank
import re
from operator import itemgetter
import en_core_web_sm
class KeywordExtractor:
"""
Keyword Extraction on text data
Attributes:
nlp: An instance English pipeline optimized for CPU for spacy
"""
def __init__(self):
self.nlp = en_core_web_sm.load()
self.nlp.add_pipe("textrank")
def get_keywords(self, text, max_keywords):
"""
Extract keywords from text.
Parameters:
text (str): The user input string to extract keywords from
Returns:
kws (list): list of extracted keywords
"""
doc = self.nlp(text)
kws = [i.text for i in doc._.phrases[:max_keywords]]
return kws
def get_keyword_indices(self, kws, text):
"""
Extract keywords from text.
Parameters:
kws (list): list of extracted keywords
text (str): The user input string to extract keywords from
Returns:
keyword_indices (list): list of indices for keyword boundaries in text
"""
keyword_indices = []
for s in kws:
indices = [[m.start(), m.end()] for m in re.finditer(re.escape(s), text)]
keyword_indices.extend(indices)
return keyword_indices
def merge_overlapping_indices(self, keyword_indices):
"""
Merge overlapping keyword indices.
Parameters:
keyword_indices (list): list of indices for keyword boundaries in text
Returns:
keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
"""
# Sort the array on the basis of start values of intervals.
keyword_indices.sort()
stack = []
# insert first interval into stack
stack.append(keyword_indices[0])
for i in keyword_indices[1:]:
# Check for overlapping interval,
# if interval overlap
if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0]-1):
stack[-1][-1] = max(stack[-1][-1], i[-1])
else:
stack.append(i)
return stack
def merge_until_finished(self, keyword_indices):
"""
Loop until no overlapping keyword indices left.
Parameters:
keyword_indices (list): list of indices for keyword boundaries in text
Returns:
keyword_indices (list): list of indices for keyword boundaries in with overlapping combined
"""
len_indices = 0
while True:
# Merge overlapping indices
merged = self.merge_overlapping_indices(keyword_indices)
# Check to see if merging reduced number of annotation indices
# If merging did not reduce list return final indicies
if len_indices == len(merged):
out_indices = sorted(merged, key=itemgetter(0))
return out_indices
else:
len_indices = len(merged)
def get_annotation(self, text, keyword_indices):
"""
Create text annotation for extracted keywords.
Parameters:
keyword_indices (list): list of indices for keyword boundaries in text
Returns:
annotation (list): list of tuples for generating html
"""
# Turn list to numpy array
arr = list(text)
# Loop through indices in list and insert delimeters
for idx in sorted(keyword_indices, reverse=True):
arr.insert(idx[0], "<kw>")
arr.insert(idx[1]+1, "<!kw> <kw>")
# join array
joined_annotation = ''.join(arr)
# split array on delimeter
split = joined_annotation.split('<kw>')
# Create annotation for keywords in text
annotation = [(x.replace('<!kw> ', ''), "KEY", "#26aaef") if "<!kw>" in x else x for x in split]
return annotation
def generate(self, text, max_keywords):
"""
Create text annotation for extracted keywords.
Parameters:
text (str): The user input string to extract keywords from
max_keywords (int): Limit on number of keywords to generate
Returns:
annotation (list): list of tuples for generating html
kws (list): list of extracted keywords
"""
kws = self.get_keywords(text, max_keywords)
indices = list(self.get_keyword_indices(kws, text))
if indices:
indices_merged = self.merge_until_finished(indices)
annotation = self.get_annotation(text, indices_merged)
else:
annotation = None
return annotation, kws