# Author: Nihal D'Souza
# Final app release
# Commit: e41b03f
import pandas as pd
import spacy
import math
from collections import Counter
# Imports resolve differently depending on whether the app is launched from
# the project root (package-style "src." imports) or from inside src/ itself.
try:
    from src.clean import clean_license_text
    from src.parameters import color, vocab
except ImportError:
    from clean import clean_license_text
    from parameters import color, vocab
# Path to the gold-standard evaluation corpus (relative to the repo parent).
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# Path to the choosealicense.com appendix license-property labels.
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Minimum number of tokens a sentence needs to be scored at all.
MIN_SENT_LEN = 3
# Proportion of a license's sentences that the summary should contain.
SUMMARY_LEN = 0.3
# Shared spaCy pipeline used for sentence segmentation and lemmatization.
nlp = spacy.load("en_core_web_sm")
def normalize_sentence_counter(counter):
    """
    Normalize sentence scores in the counter between 0 and 3.

    Scores are min-max scaled into the range [0, 3] and rounded to three
    decimal places.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw scores.

    Returns
    -------
    counter : dict
        The same dictionary, mutated in place, with normalized scores. An
        empty counter is returned unchanged; when all raw scores are equal
        (zero range) every score becomes 0.
    """
    vals = list(counter.values())
    if not vals:
        return counter
    min_val = min(vals)
    max_val = max(vals)
    score_range = max_val - min_val
    for sent in counter:
        if score_range:
            counter[sent] = round(3 * (counter[sent] - min_val) / score_range, 3)
        else:
            # All sentences scored identically: avoid division by zero and
            # keep the historical behavior of zeroing the scores out.
            counter[sent] = 0
    return counter
def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences.

    Paragraphs (separated by blank lines) are segmented independently with
    spaCy. Very short fragments (previous sentence of 30 characters or
    fewer) are merged into the preceding sentence, and each paragraph's last
    sentence is suffixed with a blank line so paragraph boundaries survive
    later joining.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.
    """
    tokenized_sents = list()
    paras = text.split("\n\n")
    for para in paras:
        for sent in nlp(para).sents:
            sent = sent.text.replace("\n", "").strip()
            # Glue fragments onto a short previous sentence: spaCy tends to
            # over-segment headings and enumerated clauses in license text.
            if tokenized_sents and len(tokenized_sents[-1]) <= 30:
                tokenized_sents[-1] += f" {sent.strip()}"
            else:
                tokenized_sents.append(sent.strip())
        # Mark the paragraph break explicitly. Guard instead of a bare
        # except: a paragraph may yield no sentences while the list is
        # still empty (e.g. leading blank lines).
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"
    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents
def lemmatize_tokens(sent):
    """
    Lemmatize the tokens of a sentence.

    Negation words are fused onto the token that follows them (optionally
    across a single space/hyphen separator) so phrases such as "not liable"
    survive as one "not-liable" token. License stopwords and bare negation
    words are dropped, as are lemmas of one or two characters.

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens.
    """
    normalized = [tok.lemma_.lower().strip() for tok in nlp(sent)]
    fused = []
    for idx, lemma in enumerate(normalized):
        # Skip empty lemmas, stopwords, and the negation words themselves
        # (negations only survive fused onto the word they modify).
        if not lemma:
            continue
        if lemma in vocab.license_stopwords or lemma in vocab.negation_words:
            continue
        prev = normalized[idx - 1] if idx > 0 else None
        prev2 = normalized[idx - 2] if idx > 1 else None
        if prev in vocab.negation_words:
            # Immediate negation: "not liable" -> "not-liable".
            fused.append(f"{prev}-{lemma}")
        elif prev2 in vocab.negation_words and prev in " -":
            # Negation separated by a space/hyphen token.
            fused.append(f"{prev2}-{lemma}")
        else:
            fused.append(lemma)
    # Very short lemmas carry no signal for property matching.
    return [lemma for lemma in fused if len(lemma) > 2]
def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Get sentence scores for all the cleaned sentences in a given license_text
    along with other extracted details such as definitions, exceptions, etc.
    and the cleaned license text itself.

    Each sentence is scored by the property keywords from
    ``vocab.properties_dict`` it contains, weighted by
    ``vocab.properties_scores``, then min-max normalized to [0, 3].

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Unused here; accepted for signature compatibility with callers that
        pass the full option set through. The default is True.
    return_summary_only : bool, optional
        Unused here; accepted for signature compatibility. The default is
        True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        Pre-cleaned sentences. When given, cleaning and segmentation are
        skipped and no definitions/exceptions are extracted. The default is
        None.

    Returns
    -------
    sent_scores : Counter
        A dictionary of sentence scores with keys as tuples of sentence and
        sentence id and values as their normalized scores.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text ("" when pre-cleaned
        sentences were supplied).
    exceptions : str
        Exceptions extracted from license text ("" when pre-cleaned
        sentences were supplied).
    summary_len : int
        The number of sentences the summary should contain.
    """
    if not cleaned_license_sentences:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)
    else:
        definitions, exceptions = "", ""
    sent_scores = Counter()
    # Convert the length ratio into an absolute sentence count.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length:{summary_len}")
    for sent_i, sent in enumerate(cleaned_license_sentences):
        # Ignore sentences too short to carry meaningful information.
        if len(sent.split()) < min_sent_len:
            continue
        score = 0
        lemmatized_tokens = lemmatize_tokens(sent)
        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in vocab.properties_dict.items():
            prop_score = 0
            imp_words = list()
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += vocab.properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # NOTE: scores are deliberately NOT divided by sentence length;
        # raw keyword-weight sums are used as-is.
        sent_scores[(sent, sent_i)] = score
        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()
    sent_scores = normalize_sentence_counter(sent_scores)
    if debug:
        print(sent_scores)
    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len
def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Get sentence scores for all the sentences in a given license_text along
    with their sentence ids.

    Thin wrapper around `get_license_summary_scores` that discards the
    sentences/definitions/exceptions and keeps only (id, score) pairs.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.
    """
    scores, _sents, _defs, _excs, _summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )
    # Score keys are (sentence_text, sentence_index) tuples; keep only the
    # index paired with its normalized score.
    return [(sent_idx, score) for (_text, sent_idx), score in scores.items()]
def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Returns summary / highlighted summary, definitions and exceptions for a
    given license_text.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text (HTML with <mark> tags),
        depending on `return_summary_only`.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    """
    # summary_len comes back converted from a ratio to an absolute count.
    sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )
    # Keep only the summary_len highest-scoring sentences. Keys of
    # sent_scores are (sentence_text, sentence_index) tuples.
    sorted_sent_scores = sent_scores.most_common()[:summary_len]
    if summary_in_text_order:
        # Re-sort the selected sentences by their index (x[0][1]) so the
        # summary reads in document order.
        sentences_in_text_order = sorted(sorted_sent_scores, key=lambda x: x[0][1])
        summary = "".join(sent.strip(". ") for (sent, sent_i), score in sentences_in_text_order)
        selected_sent_ids = set(sent_i for (_, sent_i), score in sentences_in_text_order)
    else:
        summary = "".join(sent.strip(". ") for (sent, sent_i), score in sorted_sent_scores)
        selected_sent_ids = set(sent_i for (_, sent_i), score in sorted_sent_scores)
    # Build an HTML view of the full cleaned text with the selected
    # sentences highlighted in green.
    highlighted_license_text = " ".join(
        f"""<mark style="color: {color.BLACK}; background-color:{color.GREEN}">{sent}</mark>"""
        if sent_i in selected_sent_ids
        else sent
        for sent_i, sent in enumerate(cleaned_license_sentences)
    )
    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)
    # Normalize trailing newlines/periods on the definitions blob so it
    # always ends with exactly one period.
    definitions = definitions.strip("\n.") + "."
    if return_summary_only:
        return summary, definitions, exceptions
    else:
        return highlighted_license_text, definitions, exceptions
def get_system_scores(attachment_id=None):
    """
    Get system sentence scores for all the sentences in all licenses in gold
    standard.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores are to
        be calculated. If None, the sentence scores for all the documents will
        be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of a
        document and values as a list of tuples of sentence id and scores for
        that attachment id.
    """
    gold_data = pd.read_excel(GOLD_STANDARD_PATH)
    gold_data = gold_data[["attachment_id", "sentence"]]
    # One list of gold-standard sentences per document.
    sent_lists = gold_data.groupby("attachment_id")["sentence"].apply(list)
    scores_dict = dict()
    if attachment_id:
        scores_dict[attachment_id] = get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=sent_lists[attachment_id]
        )
        return scores_dict
    # No specific document requested: score every document in the corpus.
    # Iterate the Series directly instead of materializing a dict first.
    for doc_id, cleaned_license_sentences in sent_lists.items():
        scores_dict[doc_id] = get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=cleaned_license_sentences
        )
    return scores_dict
def preprocess_properties(cell):
    """
    Convert a license property string to title case, removing hyphens and
    underscores.

    Double hyphens ("--") become " - " separators; single hyphens and
    underscores become spaces; the result is title-cased. Non-string cells
    (e.g. NaN in a sparse properties table) are returned unchanged.

    Parameters
    ----------
    cell : str
        A cell string in the properties dataframe of a license.

    Returns
    -------
    cell : str
        The cleaned, title-cased cell, or the original value unchanged when
        it is not a string.
    """
    if not isinstance(cell, str):
        # Property tables may contain NaN / numeric cells; pass them through.
        return cell
    # Stash "--" in a placeholder so single-hyphen replacement can't eat it.
    cell = cell.replace("--", "$")
    cell = cell.replace("-", " ")
    cell = cell.replace("_", " ")
    cell = cell.replace("$", " - ").title()
    return cell
def get_labels_for_license(license_id, by_license_id=True):
    """
    Fetch the property labels for a single license.

    Parameters
    ----------
    license_id : str
        License id (or license name, depending on `by_license_id`) of the
        license whose properties are to be returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by license id
        or license name. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with "Property" and "Label" columns for the requested
        license.
    """
    # Column 0 holds the license id, column 1 the license name.
    if by_license_id:
        index_col = 0
    else:
        index_col = 1
    labels_data = pd.read_csv(LABELS_PATH, index_col=index_col)
    # The selected row becomes a one-column frame: property names in the
    # index, labels in the values; reset_index turns both into columns.
    frame = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    frame.columns = ["Property", "Label"]
    frame = frame.applymap(preprocess_properties)
    return frame