# text_summarisation_demo/src/text_rank_summarizer.py
# hughustla's picture
# Add application files
# 5a60200
import spacy
import pytextrank
from math import sqrt
from operator import itemgetter
# Load the small English spaCy pipeline once at import time; `summarize`
# reuses this shared `nlp` object for every call.
nlp = spacy.load('en_core_web_sm')
# Append the 'textrank' component so parsed docs expose `doc._.phrases`.
# NOTE(review): the 'textrank' factory is presumably registered as a side
# effect of `import pytextrank` above, so that import must stay even though
# the name is never referenced directly -- confirm against pytextrank docs.
nlp.add_pipe('textrank')
def _phrase_vector(doc):
    """Build the normalised phrase-rank vector and per-sentence phrase sets.

    Args:
        doc: A parsed document exposing ``doc.sents`` (spans with ``start``
            and ``end`` offsets) and ``doc._.phrases`` (objects with a
            ``rank`` and an iterable of ``chunks``, each chunk having
            ``start`` and ``end`` offsets).

    Returns:
        A ``(unit_vector, sent_bounds)`` tuple.

        ``unit_vector[i]`` is the rank of phrase ``i`` divided by the sum of
        all phrase ranks. If that sum is zero the ranks are returned
        unnormalised instead of raising ``ZeroDivisionError``.

        ``sent_bounds`` has one ``[start, end, phrase_ids]`` entry per
        sentence, where ``phrase_ids`` is the set of phrase indices that have
        at least one chunk lying entirely inside that sentence.
    """
    sent_bounds = [[sent.start, sent.end, set()] for sent in doc.sents]

    ranks = []
    for phrase_id, phrase in enumerate(doc._.phrases):
        ranks.append(phrase.rank)
        for chunk in phrase.chunks:
            for sent_start, sent_end, phrase_ids in sent_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    phrase_ids.add(phrase_id)
                    # Credit the chunk to the first sentence that contains it.
                    break

    total_rank = sum(ranks)
    if not total_rank:
        # No phrases, or every phrase has rank 0: there is nothing to
        # normalise by, so hand the ranks back as they are.
        return ranks, sent_bounds
    return [rank / total_rank for rank in ranks], sent_bounds
def _sent_rank(unit_vector, sent_bounds):
    """Score every sentence by the phrase weight it fails to cover.

    Args:
        unit_vector: Sequence of phrase weights, indexed by phrase id.
        sent_bounds: Sequence of ``[start, end, phrase_ids]`` entries, one
            per sentence; only the ``phrase_ids`` set is consulted here.

    Returns:
        Dict mapping the sentence's position in ``sent_bounds`` to the square
        root of the summed squared weights of all phrases *not* present in
        that sentence. A smaller value means the sentence covers more of the
        total phrase weight.
    """
    distances = {}
    for position, (_, _, covered) in enumerate(sent_bounds):
        missing_sq = 0.0
        for phrase_id, weight in enumerate(unit_vector):
            if phrase_id in covered:
                continue
            missing_sq += weight ** 2.0
        distances[position] = sqrt(missing_sq)
    return distances
def _rank_to_summary(sent_rank, doc, summary_lines):
    """Join the text of the first ``summary_lines`` ranked sentences.

    Args:
        sent_rank: Iterable of ``(sent_id, rank)`` pairs, already ordered
            best-first. Only ``sent_id`` is used.
        doc: Parsed document whose ``doc.sents`` yields spans with a
            ``text`` attribute; ``sent_id`` is the span's position in it.
        summary_lines: Maximum number of sentences to include. Zero or a
            negative value yields an empty summary.

    Returns:
        The selected sentence texts joined by single spaces. Sentences appear
        in the order given by ``sent_rank``, not in document order.
    """
    sent_text = {
        sent_id: sent.text for sent_id, sent in enumerate(doc.sents)
    }

    summary = []
    for sent_id, _ in sent_rank:
        # Check the limit *before* appending: comparing with `==` after the
        # append never matched for limits <= 0 and returned every sentence.
        if len(summary) >= summary_lines:
            break
        summary.append(sent_text[sent_id])
    return ' '.join(summary)
def summarize(text, summary_lines):
    """Produce an extractive summary of ``text``.

    Args:
        text: Raw input text, parsed with the module-level ``nlp`` pipeline.
        summary_lines: Number of sentences requested; passed through
            unchanged to ``_rank_to_summary``.

    Returns:
        The string built by ``_rank_to_summary`` from the sentences ordered
        by ascending ``_sent_rank`` score (lowest score first).
    """
    parsed = nlp(text)
    weights, bounds = _phrase_vector(parsed)
    scores = _sent_rank(weights, bounds)
    # Ascending sort: the lowest score is treated as the best sentence.
    ordered = sorted(scores.items(), key=itemgetter(1))
    return _rank_to_summary(ordered, parsed, summary_lines)