"""Utilities for cleaning raw license texts.

Handles PHP/HTML/RTF/JSON wrappers, GNU preambles, definitions,
exceptions, author details, URLs/emails, and stray special characters.
"""

import re
import json
from collections import defaultdict

# Sentinel used to preserve paragraph boundaries across regex passes.
PARA_BREAK = "para___break"

# Console separator for verbose output.  NOTE: the historical
# misspelling ("seperator") is kept because other modules may import it.
seperator = "=" * 50

verbosity = 0


def extract_author_details(text, verbosity=0):
    """
    Extract author information (@author/@license/@copyright/@package
    lines) from the license text.

    Parameters
    ----------
    text : str
        Raw license text.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    text : str
        License text with author details removed.
    author_details : list
        A list of important author details.
    """
    author_details_pattern = r"(@(author|license|copyright|package).*)"
    author_details = []

    def _collect(match):
        # BUG FIX: re.sub() requires the replacement callable to return
        # a string.  The previous lambda returned None, so re.sub raised
        # TypeError as soon as the pattern matched.
        author_details.append(match.group(1))
        return ""

    text = re.sub(author_details_pattern, _collect, text)

    if author_details and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following author details were extracted:")
        print(seperator)
        print(author_details)
        print()

    return text, author_details


def php_cleaner(text):
    """
    Clean a license file in PHP format.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        The first /* ... */ comment block, or "" when none is found.
    """
    match = re.search(r"/\*[\S\s]*?\*/", text)
    return match.group(0) if match else ""


def html_cleaner(text):
    """
    Clean a license file in HTML format.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Visible text content, or "" when nothing could be extracted.
    """
    # Imported lazily so the module can be used without bs4 installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(text, features="html.parser")
    body = soup.body
    # BUG FIX: the old code did `soup.body.text`, which raised
    # AttributeError when the document had no <body> tag; fall back to
    # the whole document's text in that case.
    text = body.text if body is not None else soup.get_text()
    return text if text else ""


def json_cleaner(text_dict):
    """
    Normalize a JSON license dict to text.

    Only the "description" and "license" entries are kept, rendered in
    the same "key: value, " layout as the original implementation.

    Parameters
    ----------
    text_dict : dict
        Dictionary as read from the raw license file.

    Returns
    -------
    str
        Cleaned license text.
    """
    parts = []
    for key, value in text_dict.items():
        if key in ("description", "license"):
            parts.append(f"{key}: {value}, ")
    return "".join(parts)


def rtf_cleaner(text):
    """
    Clean a license file in RTF format.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned license text with RTF markup removed.
    """
    # Imported lazily so the module can be used without striprtf installed.
    from striprtf.striprtf import rtf_to_text

    return rtf_to_text(text)


def url_cleaner(text):
    """
    Remove URLs (optionally parenthesized) from the license text.
    """
    return re.sub(r"\(?http\S+\)?", "", text)


def email_cleaner(text):
    """
    Remove email addresses from the license text.
    """
    return re.sub(r"[\w\._-]+@\w{2,}\.\w+", "", text)


def var_cleaner(text):
    """
    Remove potential variable names ($vars and {...} templates) from
    the license text.
    """
    text = re.sub(r"\$\w+", "", text)
    text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
    return text


def character_cleaner(text):
    """
    Remove unnecessary special characters, URLs, emails and variable
    names from the license text.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    text : str
        Cleaned license text.
    """
    # Pad the paragraph sentinel so url_cleaner cannot eat it, then
    # restore it afterwards.
    text = text.replace(PARA_BREAK, f" {PARA_BREAK} ")
    text = url_cleaner(text)
    text = text.replace(f" {PARA_BREAK} ", PARA_BREAK)

    text = email_cleaner(text)
    text = var_cleaner(text)
    text = re.sub(r"\s*(;quot;|&)\s*", " ", text)
    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    # Collapse runs of dots/spaces into a sentence break.
    text = re.sub(r"[\. ]{2,}", ". \n", text)

    # Stray typographic characters left over from HTML-entity decoding.
    # BUG FIX: the original list contained an unescaped " element, which
    # is a syntax error; these are also literal characters, not regex
    # patterns, so use str.replace instead of re.sub.
    html_strs = [
        "’", "“", "·", "±", "…", "‚", "—", "'", "™", "‡", "•", "«",
        "′", '"', "‘", "≈", "″", "½", "§", "£", "¢", "¶", "»", "†",
        "”", "€", "©", "„", "–", "°", "®", "<", ">", "≤", "≥", "≠",
    ]
    for html_str in html_strs:
        text = text.replace(html_str, "")

    return text


def isEnglish(s):
    """
    Return True when the complete license text is ASCII-only (treated
    as "English"), False otherwise.
    """
    # Equivalent to the old encode("utf-8").decode("ascii") round-trip
    # wrapped in try/except UnicodeDecodeError.
    return s.isascii()


def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
    """
    Extract definitions and exceptions from the license text.

    Parameters
    ----------
    text : str
        Raw license text.
    remove_exceptions : bool
        True if exceptions should be removed from the license text.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    paras : list
        Paragraphs with definitions and (optionally) exceptions removed.
    definitions : str
        Definitions extracted from the license text.
    exceptions : list
        Paragraphs which contain exceptions.
    """
    definitions = ""
    if "Definitions" in text:
        def_pattern = (
            r"([S|s]ection )?[0-9] ?[\.|-|–]? ?"
            r"([A|a]dditional )?[D|d]efinitions"
        )
        # A newline is allowed before the capitalized word that starts
        # the section following the definitions block.
        after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? \n?[A-Z][a-z]+"

        # Explicit None checks instead of the old bare `except: pass`.
        def_match = re.search(def_pattern, text)
        if def_match is not None:
            def_pos = def_match.span()
            after_match = re.search(after_def_pattern, text[def_pos[1]:])
            if after_match is not None:
                other_start_pos = after_match.span()[0]
                definitions = (
                    text[def_pos[0]:def_pos[1] + other_start_pos].strip()
                    + "\n\n"
                )
                text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]

    paras, more_defs = extract_relevant_paras(
        split_paras(text, verbosity=verbosity), verbosity=verbosity
    )
    definitions += more_defs.strip()
    definitions = "\n\n".join(split_paras(definitions, verbosity=verbosity))

    paras, exceptions = get_exeptions(
        paras, remove_exceptions, verbosity=verbosity
    )

    return paras, definitions, exceptions


def discard_text_after_end_tnc(text):
    """
    Discard everything after "END OF TERMS AND CONDITIONS".
    """
    return text.split("END OF TERMS AND CONDITIONS")[0]


def clear_preamble(text):
    """
    Remove the Preamble section from a (GNU-style) license text.

    The preamble is only stripped when the text splits cleanly around
    "Preamble" and the closing "distribution and modification follow"
    phrase is found.
    """
    preamble_pattern = "Preamble"
    dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"

    if preamble_pattern in text:
        preamble_split = text.split(preamble_pattern)
        if len(preamble_split) != 2:
            return text
        pieces = re.split(dist_and_mod_pattern, preamble_split[1])
        # Only strip when the closing phrase was found and the text
        # before "Preamble" looks substantial (> 100 chars).
        if len(pieces) > 1 and len(preamble_split[0]) > 100:
            text = preamble_split[0] + pieces[1].strip()

    return text


def gnu_cleaner(text):
    """
    Clean GNU-style text: discard the Preamble and everything after the
    end of terms and conditions.
    """
    before_end_tnc = discard_text_after_end_tnc(text)
    return clear_preamble(before_end_tnc)


def preprocess_text(text):
    """
    Preprocess license text considering different license types.
    """
    if "GNU" in text or "Apache" in text:
        text = gnu_cleaner(text)
    return text


def clean_if_else(text):
    """
    Remove preprocessor-style #if ... #endif blocks from the text.
    """
    return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()


def clean_comments(text):
    """
    Remove triple-quoted / triple-backtick comment blocks from the text.
    """
    return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()


def script_cleaner(text):
    """
    Strip script wrappers (HTML, RTF, JSON) from the license text to
    extract the main content.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        Cleaned license text without scripts.
    """
    try:
        lowered = text.lower()
        # BUG FIX: the old check was `if "" in text`, which is always
        # true, so every input went through the HTML branch and the
        # RTF/JSON branches were unreachable.
        if "<html" in lowered or "<body" in lowered:
            text = html_cleaner(text)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)
        elif text and text[0] == "{" and text[-1] == "}":
            text = json_cleaner(json.loads(text))
    except Exception:
        # Best effort: keep the unmodified text on any parser error.
        pass

    if not text:
        return ""
    text = clean_if_else(text)
    text = clean_comments(text)
    return text


def split_paras(text, verbosity=0):
    """
    Split the text into paragraphs on the widest blank-line separator
    actually present in the text.

    Parameters
    ----------
    text : str
        Raw license text.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    paras : list
        A list of split paragraphs.
    """
    text = re.sub(r"\n{4,}", "\n" * 4, text)
    if len(re.findall("\n\n\n\n", text)) >= 2:
        paras = text.split("\n\n\n\n")
        paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
    elif len(re.findall("\n\n", text)) >= 2:
        paras = text.split("\n\n")
        paras = [re.sub(r"\n", " ", para) for para in paras]
    elif len(re.findall("\n", text)) >= 2:
        paras = text.split("\n")
    else:
        paras = [text]
    paras = [para.strip() for para in paras]

    if verbosity != 0:
        print(seperator)
        print(seperator)
        print("These are the split paras in the text:")
        for para in paras:
            if not para.strip():
                continue
            print(seperator)
            print(para)
        print()

    return paras


def extract_relevant_paras(paras, verbosity=0):
    """
    Drop definition-style paragraphs, keeping the relevant ones.

    Parameters
    ----------
    paras : list
        A list of split paragraphs.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    cleaned_paras : list
        A list of relevant paragraphs.
    definitions : str
        Definition paragraphs joined by blank lines, to be appended to
        any other definitions found in the license text.
    """
    cleaned_paras = []
    definition_paras = []
    # Matches e.g. `"Work" ... means/includes/refers`.
    clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""

    if verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following paragraphs were considered unnecessary and removed:")

    for para in paras:
        if not para.strip():
            continue
        if re.search(clean_definitions_pattern, para):
            definition_paras.append(para)
            if verbosity != 0:
                print(seperator)
                print(para)
        else:
            cleaned_paras.append(para)

    if verbosity != 0:
        print()

    definitions = "\n\n".join(definition_paras)
    return cleaned_paras, definitions


def get_all_caps(text, verbosity=0):
    """
    Extract all-caps runs (50+ chars without lowercase) from the text.

    Parameters
    ----------
    text : str
        Raw license text.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    text : str
        License text with all-caps sentences removed.
    all_caps : list
        A list of all-caps sentences from the license text.
    """
    all_caps_pattern = r"([^a-z\n]{50,})"
    all_caps = []

    def _collect(match):
        # BUG FIX: same None-returning-lambda problem as in
        # extract_author_details — the callable must return a string.
        all_caps.append(match.group(1))
        return ""

    text = re.sub(all_caps_pattern, _collect, text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    if all_caps and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following all caps were removed from the text:")
        print(all_caps)
        print()

    return text, all_caps


def get_exeptions(paras, remove_exceptions, verbosity=0):
    """
    Extract paragraphs mentioning exceptions from the license text.

    NOTE: the misspelled name is kept for backward compatibility.

    Parameters
    ----------
    paras : list
        A list of paragraphs from the license text.
    remove_exceptions : bool
        Whether exception paragraphs are dropped from the returned
        non-exception list.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    non_exception_paras : list
        Paragraphs kept for further processing.
    exceptions : list
        Paragraphs containing the word "exception".
    """
    non_exception_paras = []
    exceptions = []

    for para in paras:
        if "exception" in para.lower():
            exceptions.append(para)
            if not remove_exceptions:
                non_exception_paras.append(para)
        else:
            non_exception_paras.append(para)

    if exceptions and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following exceptions were found in the text:")
        for exception in exceptions:
            print(seperator)
            print(exception)
        print()

    return non_exception_paras, exceptions


def get_MIT_content(text):
    """
    Segregate the content of MIT-like licenses into categories
    (header, copyright, content, sentence, all_cap, ...).

    Parameters
    ----------
    text : str
        Cleaned MIT license text.

    Returns
    -------
    dict
        Maps a content category to the joined license text for it.
    """
    paras = split_paras(text)
    mit_content = defaultdict(list)

    for para in paras:
        para = para.strip()
        if not para:
            continue
        is_license_header = (
            len(para.split()) <= 10
            and ("Licens" in para or "licens" in para)
            and "Copyright" not in para
        )
        if is_license_header:
            mit_content["header"].append(para)
        elif "Copyright" in para:
            if "is hereby granted" in para:
                mit_content["copyright+content"].append(para)
            else:
                mit_content["copyright"].append(para)
        elif "Permission is hereby granted" in para:
            mit_content["content"].append(para)
        elif "The above copyright notice" in para or len(para.split()) < 18:
            mit_content["sentence"].append(para)
        elif get_all_caps(para)[1]:
            mit_content["all_cap"].append(para)
        else:
            mit_content["content"].append(para)

    # Flatten each category's paragraphs into a single string.
    for key, value in mit_content.items():
        mit_content[key] = "\n\n".join(value)

    return mit_content


def get_most_likely_license_type(text):
    """
    Return the most likely license type based on Doc2Vec similarity.

    Parameters
    ----------
    text : str
        Raw license text.

    Returns
    -------
    str
        The most likely license type, or "Not Found" when no score
        exceeds 0.9.
    """
    try:
        from src.doc2vec import inference
    except ImportError:
        from doc2vec import inference

    top1_result = inference(text).loc[0, :]
    if top1_result["Scores"] > 0.9:
        return top1_result["License"]
    return "Not Found"


def clean_license_text(text, remove_exceptions=False, verbosity=0):
    """
    Clean license text end-to-end.

    Parameters
    ----------
    text : str
        Raw license text.
    remove_exceptions : bool, optional
        Whether to remove exceptions from the cleaned license.
        The default is False.
    verbosity : int, optional
        The level of print statements on the output console.
        The default is 0.

    Returns
    -------
    text : str
        Cleaned license text ("" when the text is not English).
    definitions : str
        Definitions extracted from the license text.
    exceptions : str
        Exceptions extracted from the license text, space-joined.
    """
    if len(text) == 0:
        return text, "", ""

    text, author_details = extract_author_details(text, verbosity=verbosity)
    text = script_cleaner(text)
    text = preprocess_text(text)
    paras, definitions, exceptions = split_definitions_exceptions(
        text, remove_exceptions, verbosity=verbosity
    )
    text = PARA_BREAK.join(paras)
    text = character_cleaner(text)
    # PARA_BREAK contains no regex metacharacters, so a plain string
    # replace is equivalent to the old re.sub call.
    text = text.replace(PARA_BREAK, "\n\n")
    text = text.strip()

    # Non-English texts are rejected; the tail-words check gives texts
    # with a few stray non-ASCII characters a second chance.
    if not isEnglish(text):
        if not isEnglish(" ".join(text.split()[-5:-1])):
            return "", "", ""

    exceptions = " ".join(exceptions)

    return text, definitions, exceptions