"""Keyword-based extractive summarization utilities for software license texts."""
import pandas as pd | |
import spacy | |
import math | |
from collections import Counter | |
try: | |
from src.clean import clean_license_text | |
from src.parameters import color, vocab | |
except: | |
from clean import clean_license_text | |
from parameters import color, vocab | |
# Path to the gold-standard evaluation corpus (labels removed).
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# Path to the choosealicense.com appendix property labels.
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Minimum number of tokens for a sentence to be scored.
MIN_SENT_LEN = 3
# Default summary length as a proportion of the license text length.
SUMMARY_LEN = 0.3
# spaCy English pipeline used for sentence segmentation and lemmatization.
nlp = spacy.load("en_core_web_sm")
def normalize_sentence_counter(counter):
    """
    Normalize sentence scores in the counter to the range [0, 3].

    The counter is modified in place (min-max scaling, rounded to 3
    decimal places) and also returned.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw scores.

    Returns
    -------
    counter : dict
        A dictionary of scores with keys as sentence and values as normalized
        scores.
    """
    vals = list(counter.values())
    if not vals:
        # Nothing to normalize.
        return counter
    min_val = min(vals)
    val_range = max(vals) - min_val
    for sent in counter:
        if val_range:
            counter[sent] = round(3 * (counter[sent] - min_val) / val_range, 3)
        else:
            # All scores are identical: there is no spread to scale, so
            # every sentence gets a score of 0 (previously hidden behind a
            # bare except around the zero division).
            counter[sent] = 0
    return counter
def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences.

    Paragraphs (separated by blank lines) are segmented with spaCy; very
    short fragments are merged into the previous sentence, and a paragraph
    break ("\\n\\n") is appended to the last sentence of each paragraph.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.
    """
    tokenized_sents = list()
    paras = text.split("\n\n")
    for para in paras:
        for sent in nlp(para).sents:
            sent = sent.text.replace("\n", "").strip()
            # Merge into the previous sentence while it is still very short
            # (<= 30 chars) so stray headings / fragments don't stand alone.
            if tokenized_sents and len(tokenized_sents[-1]) <= 30:
                tokenized_sents[-1] += f" {sent.strip()}"
            else:
                tokenized_sents.append(sent.strip())
        # Mark the paragraph boundary on the paragraph's last sentence.
        # (Replaces a bare try/except that only guarded the empty-list case.)
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"
    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents
def lemmatize_tokens(sent):
    """
    Lemmatize the tokens of a sentence, dropping stopwords and fusing
    negation words onto the token they precede (e.g. "not" + "use"
    becomes "not-use").

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens (only tokens longer than 2 chars).
    """
    doc_lemmas = [t.lemma_.lower().strip() for t in nlp(sent)]
    kept = []
    for idx, lemma in enumerate(doc_lemmas):
        # Skip empties, license stopwords, and bare negation words; the
        # negations re-appear fused onto the following content token.
        if (not lemma
                or lemma in vocab.license_stopwords
                or lemma in vocab.negation_words):
            continue
        if idx > 0 and doc_lemmas[idx - 1] in vocab.negation_words:
            kept.append(f"{doc_lemmas[idx - 1]}-{lemma}")
        elif (idx > 1
                and doc_lemmas[idx - 1] in " -"
                and doc_lemmas[idx - 2] in vocab.negation_words):
            # Negation separated by a hyphen/empty token one position back.
            kept.append(f"{doc_lemmas[idx - 2]}-{lemma}")
        else:
            kept.append(lemma)
    return [lemma for lemma in kept if len(lemma) > 2]
def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Get sentence scores for all the cleaned sentences in a given license_text
    along with other extracted details such as definitions, exceptions, etc.
    and the cleaned license text itself.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_scores : dict
        A dictionary of sentence scores with keys as tuples of sentence and
        sentence id and values as their normalized scores.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    summary_len : int
        The number of sentences expected in the summary.
    """
    if not cleaned_license_sentences:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)
    else:
        # Pre-cleaned sentences were supplied, so nothing was extracted.
        definitions, exceptions = "", ""
    sent_scores = Counter()
    # Convert the summary proportion into an absolute sentence count.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length:{summary_len}")
    for sent_i, sent in enumerate(cleaned_license_sentences):
        # Ignore sentences too short to carry meaningful license semantics.
        if len(sent.split()) < min_sent_len:
            continue
        score = 0
        lemmatized_tokens = lemmatize_tokens(sent)
        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in vocab.properties_dict.items():
            prop_score = 0
            imp_words = list()
            for prop_word in prop_words:
                if prop_word in word_count:
                    # Each matched property keyword adds that property's weight.
                    prop_score += vocab.properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # Raw (length-unnormalized) score; dividing by len(lemmatized_tokens)
        # would normalize by sentence length instead.
        sent_scores[(sent, sent_i)] = score
        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()
    sent_scores = normalize_sentence_counter(sent_scores)
    if debug:
        print(sent_scores)
    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len
def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Get sentence scores for all the sentences in a given license_text along
    with their sentence ids.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.
    """
    # Only the score counter is needed here; the remaining outputs of the
    # scoring routine (sentences, definitions, exceptions, length) are unused.
    scores, _sentences, _definitions, _exceptions, _summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )
    # Keys are (sentence text, sentence index); keep only index and score.
    return [(sent_idx, score) for (_text, sent_idx), score in scores.items()]
def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Returns summary / highlighted summary, definitions and exceptions for a
    given license_text.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    """
    (sent_scores, cleaned_license_sentences,
     definitions, exceptions, summary_len) = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )
    # Keep the summary_len highest-scoring sentences.
    top_sents = sent_scores.most_common()[:summary_len]
    if summary_in_text_order:
        # item[0][1] is the sentence's position in the cleaned text.
        top_sents = sorted(top_sents, key=lambda item: item[0][1])
    summary = "".join(text.strip(". ") for (text, _idx), _score in top_sents)
    selected_sent_ids = {idx for (_text, idx), _score in top_sents}
    highlighted_license_text = " ".join(
        f"""<mark style="color: {color.BLACK}; background-color:{color.GREEN}">{sent}</mark>"""
        if sent_i in selected_sent_ids
        else sent
        for sent_i, sent in enumerate(cleaned_license_sentences)
    )
    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)
    definitions = definitions.strip("\n.") + "."
    if return_summary_only:
        return summary, definitions, exceptions
    return highlighted_license_text, definitions, exceptions
def get_system_scores(attachment_id=None):
    """
    Get system sentence scores for all the sentences in all licenses in gold
    standard.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores are to
        be calculated. If None, the sentence scores for all the documents will
        be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of a
        document and values as a list of tuples of sentence id and scores for
        that attachment id.
    """
    gold_data = pd.read_excel(GOLD_STANDARD_PATH)
    gold_data = gold_data[["attachment_id", "sentence"]]
    # One list of gold sentences per document.
    sent_lists = gold_data.groupby("attachment_id")["sentence"].apply(list)
    # Restrict to the requested document when one is given; otherwise score
    # every document (previously two duplicated call sites).
    if attachment_id:
        targets = {attachment_id: sent_lists[attachment_id]}
    else:
        targets = dict(sent_lists)
    scores_dict = {
        att_id: get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=sentences
        )
        for att_id, sentences in targets.items()
    }
    return scores_dict
def preprocess_properties(cell):
    """
    Convert a license property string to title case, replacing hyphens and
    underscores with spaces and a double hyphen ("--") with " - ".

    Non-string cells (e.g. NaN) are returned unchanged.

    Parameters
    ----------
    cell : str
        A cell string in properties dataframe of a license.

    Returns
    -------
    cell : str
        The title-cased, de-hyphenated cell value (or the original value
        when it is not a string).
    """
    try:
        # "$" is a temporary placeholder so "--" survives the single-hyphen
        # replacement and comes back as " - ".
        cell = cell.replace("--", "$")
        cell = cell.replace("-", " ")
        cell = cell.replace("_", " ")
        cell = cell.replace("$", " - ").title()
    except AttributeError:
        # Non-string cell (e.g. NaN/float): leave it as-is.
        pass
    return cell
def get_labels_for_license(license_id, by_license_id=True):
    """
    Gets license properties for a given license_id.

    Parameters
    ----------
    license_id : str
        License id of the license for which properties are to be returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by license id
        or license name. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with properties of the license with id license_id,
        with columns "Property" and "Label".
    """
    # Column 0 indexes by license id, column 1 by license name.
    labels_data = pd.read_csv(LABELS_PATH, index_col=0 if by_license_id else 1)
    # The selected row becomes a one-column frame: property name -> label.
    properties = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    properties.columns = ["Property", "Label"]
    properties = properties.applymap(preprocess_properties)
    return properties