import spacy
import pytextrank  # registers the "textrank" pipeline component
from spacy.tokens import Span


# Register a scrubber that lemmatizes key terms,
# converting plural forms to their singular version
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func


# Load a spaCy model
nlp = spacy.load("en_core_web_lg")

# Extend the default stopword list with filler words to exclude
nlp.Defaults.stop_words |= {"okay", "like"}

# Add TextRank component to pipeline with stopwords and scrubber
nlp.add_pipe("textrank", config={
    "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
    "scrubber": {"@misc": "plural_scrubber"},
})


def extract_terms(text, length):
    # Perform key term extraction on overall summary and segment summaries
    doc = nlp(text)
    if length < 200:
        # Short text: keep only the single top-ranked phrase
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 400:
        # Medium text: unique set of the top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Long text: unique set of the top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}
    return list(phrases)
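

# --- Usage sketch (not part of the original snippet; the sample text and
# the choice of word count as `length` are assumptions for illustration) ---
if __name__ == "__main__":
    sample = (
        "Speech recognition systems transcribe spoken audio into text. "
        "Modern systems rely on large neural networks trained on many "
        "hours of recorded speech."
    )
    # With fewer than 200 words, only the single top-ranked,
    # lemmatized phrase is returned
    print(extract_terms(sample, length=len(sample.split())))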