# Author: Nihal D'Souza
# Final app release
# Commit: e41b03f
import pandas as pd
import spacy
import math
from collections import Counter
# Imports resolve differently depending on whether the app is launched from
# the project root (package-style "src." imports) or from inside src/ itself.
try:
    from src.clean import clean_license_text
    from src.parameters import color, vocab
except ImportError:
    from clean import clean_license_text
    from parameters import color, vocab
# Path to the gold-standard evaluation corpus (relative to the repo parent).
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# Path to the choosealicense.com appendix license-property labels.
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Minimum number of tokens a sentence needs to be scored at all.
MIN_SENT_LEN = 3
# Proportion of a license's sentences that the summary should contain.
SUMMARY_LEN = 0.3
# Shared spaCy pipeline used for sentence segmentation and lemmatization.
nlp = spacy.load("en_core_web_sm")
def normalize_sentence_counter(counter):
    """
    Normalize sentence scores in the counter between 0 and 3.

    Scores are min-max scaled into the range [0, 3] and rounded to three
    decimal places.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw scores.

    Returns
    -------
    counter : dict
        The same dictionary, mutated in place, with normalized scores. An
        empty counter is returned unchanged; when all raw scores are equal
        (zero range) every score becomes 0.
    """
    vals = list(counter.values())
    if not vals:
        return counter
    min_val = min(vals)
    max_val = max(vals)
    score_range = max_val - min_val
    for sent in counter:
        if score_range:
            counter[sent] = round(3 * (counter[sent] - min_val) / score_range, 3)
        else:
            # All sentences scored identically: avoid division by zero and
            # keep the historical behavior of zeroing the scores out.
            counter[sent] = 0
    return counter
def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences.

    Paragraphs (separated by blank lines) are segmented independently with
    spaCy. Very short fragments (previous sentence of 30 characters or
    fewer) are merged into the preceding sentence, and each paragraph's last
    sentence is suffixed with a blank line so paragraph boundaries survive
    later joining.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.
    """
    tokenized_sents = list()
    paras = text.split("\n\n")
    for para in paras:
        for sent in nlp(para).sents:
            sent = sent.text.replace("\n", "").strip()
            # Glue fragments onto a short previous sentence: spaCy tends to
            # over-segment headings and enumerated clauses in license text.
            if tokenized_sents and len(tokenized_sents[-1]) <= 30:
                tokenized_sents[-1] += f" {sent.strip()}"
            else:
                tokenized_sents.append(sent.strip())
        # Mark the paragraph break explicitly. Guard instead of a bare
        # except: a paragraph may yield no sentences while the list is
        # still empty (e.g. leading blank lines).
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"
    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents
def lemmatize_tokens(sent):
    """
    Lemmatize the tokens of a sentence.

    Negation words are fused onto the token that follows them (optionally
    across a single space/hyphen separator) so phrases such as "not liable"
    survive as one "not-liable" token. License stopwords and bare negation
    words are dropped, as are lemmas of one or two characters.

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens.
    """
    normalized = [tok.lemma_.lower().strip() for tok in nlp(sent)]
    fused = []
    for idx, lemma in enumerate(normalized):
        # Skip empty lemmas, stopwords, and the negation words themselves
        # (negations only survive fused onto the word they modify).
        if not lemma:
            continue
        if lemma in vocab.license_stopwords or lemma in vocab.negation_words:
            continue
        prev = normalized[idx - 1] if idx > 0 else None
        prev2 = normalized[idx - 2] if idx > 1 else None
        if prev in vocab.negation_words:
            # Immediate negation: "not liable" -> "not-liable".
            fused.append(f"{prev}-{lemma}")
        elif prev2 in vocab.negation_words and prev in " -":
            # Negation separated by a space/hyphen token.
            fused.append(f"{prev2}-{lemma}")
        else:
            fused.append(lemma)
    # Very short lemmas carry no signal for property matching.
    return [lemma for lemma in fused if len(lemma) > 2]
def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Get sentence scores for all the cleaned sentences in a given license_text
    along with other extracted details such as definitions, exceptions, etc.
    and the cleaned license text itself.

    Each sentence is scored by the property keywords from
    ``vocab.properties_dict`` it contains, weighted by
    ``vocab.properties_scores``, then min-max normalized to [0, 3].

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Unused here; accepted for signature compatibility with callers that
        pass the full option set through. The default is True.
    return_summary_only : bool, optional
        Unused here; accepted for signature compatibility. The default is
        True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        Pre-cleaned sentences. When given, cleaning and segmentation are
        skipped and no definitions/exceptions are extracted. The default is
        None.

    Returns
    -------
    sent_scores : Counter
        A dictionary of sentence scores with keys as tuples of sentence and
        sentence id and values as their normalized scores.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text ("" when pre-cleaned
        sentences were supplied).
    exceptions : str
        Exceptions extracted from license text ("" when pre-cleaned
        sentences were supplied).
    summary_len : int
        The number of sentences the summary should contain.
    """
    if not cleaned_license_sentences:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)
    else:
        definitions, exceptions = "", ""
    sent_scores = Counter()
    # Convert the length ratio into an absolute sentence count.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length:{summary_len}")
    for sent_i, sent in enumerate(cleaned_license_sentences):
        # Ignore sentences too short to carry meaningful information.
        if len(sent.split()) < min_sent_len:
            continue
        score = 0
        lemmatized_tokens = lemmatize_tokens(sent)
        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in vocab.properties_dict.items():
            prop_score = 0
            imp_words = list()
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += vocab.properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # NOTE: scores are deliberately NOT divided by sentence length;
        # raw keyword-weight sums are used as-is.
        sent_scores[(sent, sent_i)] = score
        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()
    sent_scores = normalize_sentence_counter(sent_scores)
    if debug:
        print(sent_scores)
    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len
def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Get sentence scores for all the sentences in a given license_text along
    with their sentence ids.

    Thin wrapper around `get_license_summary_scores` that discards the
    sentences/definitions/exceptions and keeps only (id, score) pairs.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.
    """
    scores, _sents, _defs, _excs, _summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )
    # Score keys are (sentence_text, sentence_index) tuples; keep only the
    # index paired with its normalized score.
    return [(sent_idx, score) for (_text, sent_idx), score in scores.items()]
def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Returns summary / highlighted summary, definitions and exceptions for a
    given license_text.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text (HTML with <mark> tags),
        depending on `return_summary_only`.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    """
    # summary_len comes back converted from a ratio to an absolute count.
    sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )
    # Keep only the summary_len highest-scoring sentences. Keys of
    # sent_scores are (sentence_text, sentence_index) tuples.
    sorted_sent_scores = sent_scores.most_common()[:summary_len]
    if summary_in_text_order:
        # Re-sort the selected sentences by their index (x[0][1]) so the
        # summary reads in document order.
        sentences_in_text_order = sorted(sorted_sent_scores, key=lambda x: x[0][1])
        summary = "".join(sent.strip(". ") for (sent, sent_i), score in sentences_in_text_order)
        selected_sent_ids = set(sent_i for (_, sent_i), score in sentences_in_text_order)
    else:
        summary = "".join(sent.strip(". ") for (sent, sent_i), score in sorted_sent_scores)
        selected_sent_ids = set(sent_i for (_, sent_i), score in sorted_sent_scores)
    # Build an HTML view of the full cleaned text with the selected
    # sentences highlighted in green.
    highlighted_license_text = " ".join(
        f"""<mark style="color: {color.BLACK}; background-color:{color.GREEN}">{sent}</mark>"""
        if sent_i in selected_sent_ids
        else sent
        for sent_i, sent in enumerate(cleaned_license_sentences)
    )
    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)
    # Normalize trailing newlines/periods on the definitions blob so it
    # always ends with exactly one period.
    definitions = definitions.strip("\n.") + "."
    if return_summary_only:
        return summary, definitions, exceptions
    else:
        return highlighted_license_text, definitions, exceptions
def get_system_scores(attachment_id=None):
    """
    Get system sentence scores for all the sentences in all licenses in gold
    standard.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores are to
        be calculated. If None, the sentence scores for all the documents will
        be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of a
        document and values as a list of tuples of sentence id and scores for
        that attachment id.
    """
    gold_data = pd.read_excel(GOLD_STANDARD_PATH)
    gold_data = gold_data[["attachment_id", "sentence"]]
    # One list of gold-standard sentences per document.
    sent_lists = gold_data.groupby("attachment_id")["sentence"].apply(list)
    scores_dict = dict()
    if attachment_id:
        scores_dict[attachment_id] = get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=sent_lists[attachment_id]
        )
        return scores_dict
    # No specific document requested: score every document in the corpus.
    # Iterate the Series directly instead of materializing a dict first.
    for doc_id, cleaned_license_sentences in sent_lists.items():
        scores_dict[doc_id] = get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=cleaned_license_sentences
        )
    return scores_dict
def preprocess_properties(cell):
    """
    Convert a license property string to title case, removing hyphens and
    underscores.

    Double hyphens ("--") become " - " separators; single hyphens and
    underscores become spaces; the result is title-cased. Non-string cells
    (e.g. NaN in a sparse properties table) are returned unchanged.

    Parameters
    ----------
    cell : str
        A cell string in the properties dataframe of a license.

    Returns
    -------
    cell : str
        The cleaned, title-cased cell, or the original value unchanged when
        it is not a string.
    """
    if not isinstance(cell, str):
        # Property tables may contain NaN / numeric cells; pass them through.
        return cell
    # Stash "--" in a placeholder so single-hyphen replacement can't eat it.
    cell = cell.replace("--", "$")
    cell = cell.replace("-", " ")
    cell = cell.replace("_", " ")
    cell = cell.replace("$", " - ").title()
    return cell
def get_labels_for_license(license_id, by_license_id=True):
    """
    Fetch the property labels for a single license.

    Parameters
    ----------
    license_id : str
        License id (or license name, depending on `by_license_id`) of the
        license whose properties are to be returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by license id
        or license name. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with "Property" and "Label" columns for the requested
        license.
    """
    # Column 0 holds the license id, column 1 the license name.
    if by_license_id:
        index_col = 0
    else:
        index_col = 1
    labels_data = pd.read_csv(LABELS_PATH, index_col=index_col)
    # The selected row becomes a one-column frame: property names in the
    # index, labels in the values; reset_index turns both into columns.
    frame = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    frame.columns = ["Property", "Label"]
    frame = frame.applymap(preprocess_properties)
    return frame