# InterpreTalk/backend/utils/text_rank.py
import spacy
import pytextrank
from spacy.tokens import Span
# Register a phrase "scrubber" with spaCy's function registry so the
# TextRank config below can reference it by name ("plural_scrubber").
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Build a scrubber that maps each phrase Span to its lemmatised text.

    Lemmatising collapses singular/plural variants of a phrase onto one
    key, so they are ranked together rather than as separate terms.
    """
    def _lemmatise(span: Span) -> str:
        return span.lemma_

    return _lemmatise
# Load the large English spaCy model used for key-term extraction.
nlp = spacy.load("en_core_web_lg")

# Treat conversational filler words as stopwords so they never rank as key terms.
nlp.Defaults.stop_words |= {"okay", "like"}

# Attach the TextRank component, excluding every stopword (tagged as NOUN)
# and lemmatising phrases via the registered "plural_scrubber" function.
_textrank_config = {
    "stopwords": {word: ["NOUN"] for word in nlp.Defaults.stop_words},
    "scrubber": {"@misc": "plural_scrubber"},
}
nlp.add_pipe("textrank", config=_textrank_config)
def extract_terms(text, length):
    """Extract the top-ranked TextRank key phrases from *text*.

    Args:
        text: Text to analyse (e.g. an overall or segment summary).
        length: Length of the text, used to scale how many phrases are
            kept: 1 if length < 200, 2 if 200 <= length < 400, else 3.

    Returns:
        A list of unique key-phrase strings (order not guaranteed,
        since duplicates are removed via a set).
    """
    # Run the spaCy + TextRank pipeline over the input.
    doc = nlp(text)

    # Scale the number of key terms with the text length.
    # BUG FIX: the original `if / elif (>200 and <400) / if (>400)` chain
    # left `phrases` unbound when length was exactly 200 or 400, raising
    # UnboundLocalError; this exhaustive chain covers every value.
    if length < 200:
        top_n = 1
    elif length < 400:
        top_n = 2
    else:
        top_n = 3

    phrases = {phrase.text for phrase in doc._.phrases[:top_n]}
    return list(phrases)