import math
from collections import Counter

import pandas as pd
import spacy

# Fall back to flat-layout imports when the module is executed directly from
# the source directory rather than as part of the ``src`` package.
try:
    from src.clean import clean_license_text
    from src.parameters import color, vocab
except ImportError:
    from clean import clean_license_text
    from parameters import color, vocab

# Gold-standard corpus used by get_system_scores() for evaluation.
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# Per-license property labels used by get_labels_for_license().
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Minimum number of whitespace-separated tokens for a sentence to be scored.
MIN_SENT_LEN = 3
# Default proportion of sentences retained in a summary.
SUMMARY_LEN = 0.3

nlp = spacy.load("en_core_web_sm")


def normalize_sentence_counter(counter):
    """
    Normalize sentence scores in the counter between 0 and 3.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw
        scores.

    Returns
    -------
    counter : dict
        The same dictionary, with values rescaled in place to [0, 3] and
        rounded to 3 decimal places. When every raw score is identical
        (zero spread) all scores become 0; an empty counter is returned
        unchanged.
    """
    if not counter:
        return counter
    vals = counter.values()
    min_val, max_val = min(vals), max(vals)
    spread = max_val - min_val
    for sent in counter:
        if spread:
            counter[sent] = round(3 * (counter[sent] - min_val) / spread, 3)
        else:
            # Zero spread would divide by zero; score everything 0 instead.
            counter[sent] = 0
    return counter


def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences.

    Paragraphs (separated by blank lines) are segmented with spaCy. A
    fragment following a very short sentence (<= 30 characters, e.g. a
    clause number or bullet) is merged into that previous sentence, and a
    paragraph break ("\\n\\n") is re-appended after each paragraph.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.
    """
    tokenized_sents = []
    for para in text.split("\n\n"):
        for sent in nlp(para).sents:
            sent_text = sent.text.replace("\n", "").strip()
            if tokenized_sents and len(tokenized_sents[-1]) <= 30:
                # Previous sentence is suspiciously short — glue this one on.
                tokenized_sents[-1] += f" {sent_text}"
            else:
                tokenized_sents.append(sent_text)
        # Restore the paragraph break; skip when no sentence exists yet
        # (e.g. leading empty paragraph).
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"
    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents


def lemmatize_tokens(sent):
    """
    Lemmatize tokens in the given sentence.

    Stopwords and bare negation words are dropped; a token preceded by a
    negation word (directly, or across a space/hyphen token) is fused with
    it, e.g. "not" + "liable" -> "not-liable".

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens longer than 2 characters.
    """
    lemmas = []
    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]
    for tok_i, token in enumerate(nlp_sent):
        # Skip empties, stopwords, and the negation words themselves (they
        # only survive fused onto the token they negate).
        if (not token or token in vocab.license_stopwords
                or token in vocab.negation_words):
            continue
        if tok_i > 0 and nlp_sent[tok_i - 1] in vocab.negation_words:
            lemmas.append(f"{nlp_sent[tok_i - 1]}-{token}")
        elif (tok_i > 1 and nlp_sent[tok_i - 1] in " -"
                and nlp_sent[tok_i - 2] in vocab.negation_words):
            lemmas.append(f"{nlp_sent[tok_i - 2]}-{token}")
        else:
            lemmas.append(token)
    return [lemma for lemma in lemmas if len(lemma) > 2]


def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Get sentence scores for all the cleaned sentences in a given
    license_text along with other extracted details such as definitions,
    exceptions, etc. and the cleaned license text itself.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be
        considered. The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending
        order by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_scores : dict
        A dictionary of sentence scores with keys as tuples of sentence
        and sentence id and values as their normalized scores.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    summary_len : int
        The number of sentences expected in the summary.
    """
    if not cleaned_license_sentences:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)
    else:
        # Pre-cleaned sentences were supplied, so nothing was extracted.
        definitions, exceptions = "", ""

    sent_scores = Counter()
    # Convert the summary-length proportion into a sentence count.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length:{summary_len}")

    for sent_i, sent in enumerate(cleaned_license_sentences):
        if len(sent.split()) < min_sent_len:
            continue
        score = 0
        lemmatized_tokens = lemmatize_tokens(sent)
        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
        word_count = Counter(lemmatized_tokens)
        # Score each license property whose vocabulary appears in the
        # sentence; each matched word adds that property's weight.
        for prop, prop_words in vocab.properties_dict.items():
            prop_score = 0
            imp_words = []
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += vocab.properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # Raw score is kept un-normalized here; scores are rescaled
        # globally by normalize_sentence_counter() below.
        sent_scores[(sent, sent_i)] = score
        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()

    sent_scores = normalize_sentence_counter(sent_scores)
    if debug:
        print(sent_scores)
    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len


def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Get sentence scores for all the sentences in a given license_text
    along with their sentence ids.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be
        considered. The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending
        order by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.
    """
    sent_scores, _, _, _, _ = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )
    # Drop the sentence text, keeping only (sentence id, score) pairs.
    return [(sent_i, score) for (_sent, sent_i), score in sent_scores.items()]


def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Returns summary / highlighted summary, definitions and exceptions for
    a given license_text.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be
        considered. The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending
        order by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    """
    sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )
    # Keep only the top-scoring sentences up to the summary length.
    sorted_sent_scores = sent_scores.most_common()[:summary_len]

    if summary_in_text_order:
        # Re-sort the selected sentences by their position in the text.
        sentences_in_text_order = sorted(sorted_sent_scores, key=lambda x: x[0][1])
        summary = "".join(sent.strip(". ")
                          for (sent, sent_i), score in sentences_in_text_order)
        selected_sent_ids = set(sent_i
                                for (_, sent_i), score in sentences_in_text_order)
    else:
        summary = "".join(sent.strip(". ")
                          for (sent, sent_i), score in sorted_sent_scores)
        selected_sent_ids = set(sent_i
                                for (_, sent_i), score in sorted_sent_scores)

    # NOTE(review): the f-string wrapper is a hook for highlight markup
    # around selected sentences; currently it emits the sentence unchanged.
    highlighted_license_text = " ".join(
        f"""{sent}""" if sent_i in selected_sent_ids else sent
        for sent_i, sent in enumerate(cleaned_license_sentences)
    )
    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)

    definitions = definitions.strip("\n.") + "."
    if return_summary_only:
        return summary, definitions, exceptions
    return highlighted_license_text, definitions, exceptions


def get_system_scores(attachment_id=None):
    """
    Get system sentence scores for all the sentences in all licenses in
    gold standard.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores
        are to be calculated. If None, the sentence scores for all the
        documents will be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of
        a document and values as a list of tuples of sentence id and
        scores for that attachment id.
    """
    gold_data = pd.read_excel(GOLD_STANDARD_PATH)
    gold_data = gold_data[["attachment_id", "sentence"]]
    # One list of gold sentences per attachment id.
    sent_lists = gold_data.groupby("attachment_id")["sentence"].apply(list)

    if attachment_id:
        return {
            attachment_id: get_sent_scores(
                "",
                summary_len=SUMMARY_LEN,
                cleaned_license_sentences=sent_lists[attachment_id]
            )
        }

    scores_dict = dict()
    for att_id, sentences in sent_lists.items():
        scores_dict[att_id] = get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=sentences
        )
    return scores_dict


def preprocess_properties(cell):
    """
    Converts license properties to title case and removes hyphens and
    underscores.

    A double hyphen ("--") becomes " - ", single hyphens and underscores
    become spaces, and the result is title-cased.

    Parameters
    ----------
    cell : str
        A cell string in properties dataframe of a license.

    Returns
    -------
    cell : str
        The reformatted, title-cased cell. Non-string cells (e.g. NaN)
        are returned unchanged.
    """
    try:
        # "$" is a placeholder so "--" survives the single-hyphen pass.
        cell = cell.replace("--", "$")
        cell = cell.replace("-", " ")
        cell = cell.replace("_", " ")
        cell = cell.replace("$", " - ").title()
    except AttributeError:
        # Non-string cells (NaN, numbers) carry no str methods; pass through.
        pass
    return cell


def get_labels_for_license(license_id, by_license_id=True):
    """
    Gets license properties for a given license_id.

    Parameters
    ----------
    license_id : str
        License id of the license for which properties are to be
        returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by
        license id or license name. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with properties of the license with id license_id,
        with columns "Property" and "Label".
    """
    # Column 0 holds license ids, column 1 license names.
    index_col = 0 if by_license_id else 1
    labels_data = pd.read_csv(LABELS_PATH, index_col=index_col)
    properties = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    properties.columns = ["Property", "Label"]
    properties = properties.applymap(preprocess_properties)
    return properties