# InterpreTalk/backend/utils/text_rank.py
import spacy
import pytextrank
from spacy.tokens import Span
# Register a phrase "scrubber" with spaCy's function registry so the
# TextRank config below can reference it by name ("plural_scrubber").
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Build a scrubber that maps each phrase Span to its lemmatised text.

    Lemmatising collapses singular/plural variants of a phrase onto one
    key, so they are ranked together rather than as separate terms.
    """
    def _lemmatise(span: Span) -> str:
        return span.lemma_

    return _lemmatise
# Load the large English spaCy model used for key-term extraction.
nlp = spacy.load("en_core_web_lg")

# Treat conversational filler words as stopwords so they never rank as key terms.
nlp.Defaults.stop_words |= {"okay", "like"}

# Attach the TextRank component, excluding every stopword (tagged as NOUN)
# and lemmatising phrases via the registered "plural_scrubber" function.
_textrank_config = {
    "stopwords": {word: ["NOUN"] for word in nlp.Defaults.stop_words},
    "scrubber": {"@misc": "plural_scrubber"},
}
nlp.add_pipe("textrank", config=_textrank_config)
def extract_terms(text, length):
    """Extract the top-ranked TextRank key phrases from *text*.

    Args:
        text: Text to analyse (e.g. an overall or segment summary).
        length: Length of the text, used to scale how many phrases are
            kept: 1 if length < 200, 2 if 200 <= length < 400, else 3.

    Returns:
        A list of unique key-phrase strings (order not guaranteed,
        since duplicates are removed via a set).
    """
    # Run the spaCy + TextRank pipeline over the input.
    doc = nlp(text)

    # Scale the number of key terms with the text length.
    # BUG FIX: the original `if / elif (>200 and <400) / if (>400)` chain
    # left `phrases` unbound when length was exactly 200 or 400, raising
    # UnboundLocalError; this exhaustive chain covers every value.
    if length < 200:
        top_n = 1
    elif length < 400:
        top_n = 2
    else:
        top_n = 3

    phrases = {phrase.text for phrase in doc._.phrases[:top_n]}
    return list(phrases)