Spaces:
Runtime error
Runtime error
| import re | |
| import json | |
| from bs4 import BeautifulSoup | |
| from striprtf.striprtf import rtf_to_text | |
| from collections import defaultdict | |
# Sentinel token inserted between paragraphs so they survive the character
# cleaning passes and can be re-split into paragraphs afterwards.
PARA_BREAK = "para___break"
# Console divider used by the verbosity printouts.
# NOTE(review): "seperator" is a historical misspelling of "separator" kept
# because other functions in this module reference it by this name.
seperator = "=" * 50
# Module-level default verbosity (0 = silent).
verbosity = 0
def extract_author_details(text, verbosity=0):
    """
    Extracts important author information from the license text.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with author details removed.
    author_details : list
        A list of important author details.
    """
    author_details_pattern = r"(@(author|license|copyright|package).*)"
    author_details = []

    def _collect(match):
        # Record the matched tag line and return "" so re.sub strips it.
        # (The previous lambda returned None — list.append's return value —
        # which makes re.sub raise TypeError whenever the pattern matches.)
        author_details.append(match.group(1))
        return ""

    text = re.sub(author_details_pattern, _collect, text)
    if author_details and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following author details were extracted:")
        print(seperator)
        print(author_details)
        print()
    return text, author_details
def php_cleaner(text):
    """
    Cleans the license file in PHP format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The first /* ... */ comment block in the text (where PHP files keep
        the license), or "" when no such block exists.
    """
    # Raw string: the old "\/\*..." literal contained invalid escape
    # sequences ("\/"), which are deprecated and slated to become errors.
    # re.search + explicit None check replaces the old bare
    # `except: return ""` around an IndexError on findall()[0].
    match = re.search(r"/\*[\S\s]*?\*/", text)
    return match.group(0) if match else ""
def html_cleaner(text):
    """
    Cleans the license file in HTML format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with HTML markup removed, or "" when the
        document has no <body> or the body has no text.
    """
    soup = BeautifulSoup(text, features="html.parser")
    body = soup.body
    # soup.body is None for fragments without a <body>; the old code
    # dereferenced it unconditionally and raised AttributeError.
    if body is None or not body.text:
        return ""
    return body.text
def json_cleaner(text_dict):
    """
    Cleans the license file in JSON format.

    Parameters
    ----------
    text_dict : dict
        Dictionary as read from the raw License file.

    Returns
    -------
    str
        Cleaned License text: the "description" and "license" entries
        rendered as "key: value, " fragments, concatenated in dict order.
    """
    relevant_keys = ("description", "license")
    fragments = [
        key + ": " + str(text_dict[key]) + ", "
        for key in text_dict
        if key in relevant_keys
    ]
    return "".join(fragments)
def rtf_cleaner(text):
    """
    Cleans the license file in RTF format.

    Thin wrapper around striprtf's converter; any parsing errors propagate
    to the caller (script_cleaner catches them).

    Parameters
    ----------
    text : str
        Raw License text in RTF markup.

    Returns
    -------
    str
        Cleaned License text with RTF markup removed.
    """
    return rtf_to_text(text)
def url_cleaner(text):
    """
    Removes URLs from the License text.

    Anything starting with "http" up to the next whitespace is dropped,
    together with an optional wrapping pair of parentheses.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with URLs removed.
    """
    url_pattern = re.compile(r"\(?http\S+\)?")
    return url_pattern.sub("", text)
def email_cleaner(text):
    """
    Removes emails from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with email addresses removed.
    """
    email_pattern = re.compile(r"[\w\._-]+@\w{2,}\.\w+")
    return email_pattern.sub("", text)
def var_cleaner(text):
    """
    Removes potential variable names from the License text.

    Two passes: PHP-style "$name" identifiers, then brace-delimited
    template/code snippets made of identifier-ish characters.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with variable names removed.
    """
    without_dollar_vars = re.sub(r"\$\w+", "", text)
    without_brace_blocks = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", without_dollar_vars)
    return without_brace_blocks
def character_cleaner(text):
    """
    Removes unnecessary special characters from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        Cleaned License text with some special characters removed.
    """
    # Pad the paragraph marker with spaces so url_cleaner's \S+ cannot
    # swallow it when a URL directly abuts a paragraph boundary.
    text = text.replace(PARA_BREAK, f" {PARA_BREAK} ")
    text = url_cleaner(text)
    text = text.replace(f" {PARA_BREAK} ", PARA_BREAK)
    text = email_cleaner(text)
    text = var_cleaner(text)
    # Raw strings throughout: the old non-raw "\s"/"\." literals were
    # invalid escape sequences (DeprecationWarning, future SyntaxError).
    text = re.sub(r"\s*(;quot;|&)\s*", " ", text)
    text = re.sub(r"\n{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[. ]{2,}", ". ", text)
    # Stray typographic/HTML-entity characters to strip. None of them are
    # regex metacharacters, so a single str.translate pass replaces the
    # previous loop of 35 separate re.sub calls with identical output.
    html_chars = "’“·±…‚—'™‡•«′\"‘≈″½§£¢¶»†”€©„–°®<>≤≥≠"
    text = text.translate({ord(ch): None for ch in html_chars})
    return text
def isEnglish(s):
    """
    Checks whether the License text is in English or not.

    Parameters
    ----------
    s : str
        Raw License text.

    Returns
    -------
    bool
        True if the complete License text is ASCII-only (treated as
        English), False otherwise.
    """
    # Encoding straight to ASCII fails exactly when a non-ASCII character
    # is present. Catching UnicodeEncodeError also covers lone surrogates,
    # which crashed the old encode("utf-8").decode("ascii") round-trip
    # (that raised UnicodeEncodeError, which was not the caught type).
    try:
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True
def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
    """
    Extract definitions and exceptions from the License text.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool
        True if we want to remove exceptions from the License text, False
        otherwise.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        A list of paragraphs from License text with definitions and
        exceptions removed.
    definitions : str
        Definitions extracted from the License text.
    exceptions : list
        A list of paragraphs which contain exceptions.
    """
    definitions = ""
    if "Definitions" in text:
        # Heading of a numbered "Definitions" section, and the heading of
        # whatever numbered section follows it.
        def_pattern = r"([S|s]ection )?[0-9] ?[\.|-|–]? ?([A|a]dditional )?[D|d]efinitions"
        after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? ?[A-Z][a-z]+"
        # Explicit None checks replace the old bare `except: pass`, which
        # silently swallowed the AttributeError raised when either search
        # found nothing.
        def_match = re.search(def_pattern, text)
        if def_match is not None:
            next_section = re.search(after_def_pattern, text[def_match.end():])
            if next_section is not None:
                def_start, def_end = def_match.span()
                next_start = next_section.span()[0]
                # Slice the definitions section out of the text.
                definitions = text[def_start: def_end + next_start].strip() + "\n\n"
                text = text[:def_start] + text[def_end + next_start:]
    paras, more_defs = extract_relevant_paras(
        split_paras(text, verbosity=verbosity),
        verbosity=verbosity
    )
    definitions += more_defs.strip()
    definitions = "\n\n".join(split_paras(definitions, verbosity=verbosity))
    paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
    return paras, definitions, exceptions
def discard_text_after_end_tnc(text):
    """
    Discards text after "END OF TERMS AND CONDITIONS".

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text with everything from the first
        "END OF TERMS AND CONDITIONS" marker onwards removed; the whole
        text when the marker is absent.
    """
    head, _, _ = text.partition("END OF TERMS AND CONDITIONS")
    return head
def clear_preamble(text):
    """
    Cleans Preamble from the License text.

    The preamble is assumed to run from the single "Preamble" heading up to
    the phrase "distribution and modification follow"; it is removed only
    when there is meaningful text (> 100 chars) before the heading.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        License text with Preamble removed, or unchanged when the expected
        markers are missing or ambiguous.
    """
    preamble_marker = "Preamble"
    # Raw string: the old non-raw "\s" escapes are deprecated.
    dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"
    if preamble_marker in text:
        preamble_split = text.split(preamble_marker)
        # Exactly one "Preamble" occurrence expected; bail out otherwise.
        if len(preamble_split) != 2:
            return text
        # Explicit length check replaces the old bare `except: pass`
        # around an IndexError when the end-of-preamble phrase is absent.
        tail_parts = re.split(dist_and_mod_pattern, preamble_split[1])
        if len(tail_parts) > 1 and len(preamble_split[0]) > 100:
            text = preamble_split[0] + tail_parts[1].strip()
    return text
def gnu_cleaner(text):
    """
    Cleans GNU-style license text: discards the Preamble and everything
    after "END OF TERMS AND CONDITIONS".

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text with the Preamble and the post-terms-and-conditions
        tail removed.
    """
    return clear_preamble(discard_text_after_end_tnc(text))
def preprocess_text(text):
    """
    Preprocesses License text considering different License types.

    GNU- and Apache-style licenses get their Preamble and
    post-terms-and-conditions text stripped; other texts pass through
    unchanged.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        Preprocessed License text.
    """
    if any(marker in text for marker in ("GNU", "Apache")):
        text = gnu_cleaner(text)
    return text
def clean_if_else(text):
    """
    Removes C-preprocessor-style #if ... #endif conditional blocks from
    the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with #if/#endif blocks removed and
        surrounding whitespace stripped.
    """
    if_else_pattern = re.compile(r"#\bif[\s\S]+?#endif\s*")
    return if_else_pattern.sub("", text).strip()
def clean_comments(text):
    """
    Removes fenced comment blocks (``` / ''' / \"\"\" style) from the
    License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with fenced comment blocks removed and
        surrounding whitespace stripped.
    """
    fence_pattern = re.compile(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}")
    return fence_pattern.sub("", text).strip()
def script_cleaner(text):
    """
    Cleans the script text from License text to extract the main content.

    Dispatches to the appropriate cleaner based on format markers
    (PHP, HTML, RTF, JSON), then strips #if/#endif blocks and fenced
    comments.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text without scripts, or "" when cleaning yields
        nothing.
    """
    # Best-effort: a cleaner that fails (bad JSON, malformed markup, ...)
    # leaves the text unchanged. `except Exception` replaces the old bare
    # except, which also trapped KeyboardInterrupt/SystemExit.
    try:
        if "<?php" in text:
            text = php_cleaner(text)
        elif "</html>" in text:
            text = html_cleaner(text)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)
        elif text.startswith("{") and text.endswith("}"):
            # startswith/endswith avoid the IndexError the old text[0]
            # indexing raised (and silently swallowed) on empty input.
            text = json_cleaner(json.loads(text))
    except Exception:
        pass
    if not text:
        return ""
    text = clean_if_else(text)
    text = clean_comments(text)
    return text
def split_paras(text, verbosity=0):
    """
    Splits the text into paragraphs.

    Tries progressively smaller separators (4 newlines, 2 newlines, 1
    newline); whichever occurs at least twice wins. Newlines left inside
    a paragraph are collapsed to spaces for the wider separators.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        A list of split, stripped paragraphs.
    """
    # Collapse any run of 4+ newlines down to exactly 4.
    text = re.sub(r"\n{4,}", "\n" * 4, text)
    if text.count("\n\n\n\n") >= 2:
        paras = [re.sub(r"\n{1,3}", " ", chunk) for chunk in text.split("\n\n\n\n")]
    elif text.count("\n\n") >= 2:
        paras = [chunk.replace("\n", " ") for chunk in text.split("\n\n")]
    elif text.count("\n") >= 2:
        paras = text.split("\n")
    else:
        paras = [text]
    paras = [chunk.strip() for chunk in paras]
    if verbosity != 0:
        print(seperator)
        print(seperator)
        print("These are the split paras in the text:")
        for chunk in paras:
            if not chunk.strip():
                continue
            print(seperator)
            print(chunk)
        print()
    return paras
def extract_relevant_paras(paras, verbosity=0):
    """
    Extracts relevant paragraphs from the list of all paragraphs.

    Paragraphs that look like term definitions ('"X" means/includes/refers
    ...') are diverted into a separate definitions string.

    Parameters
    ----------
    paras : list
        A list of split paragraphs.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    cleaned_paras : list
        A list of relevant (non-definition, non-empty) paragraphs.
    definitions : str
        Definition text as extracted by the "clean_definitions_pattern",
        which is to be appended to other definitions in the License text
        if any.
    """
    cleaned_paras = []
    definitions = ""
    clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
    if verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following paragraphs were considered unnecessary and removed:")
    for para in paras:
        if not para.strip():
            continue
        if re.search(clean_definitions_pattern, para) is None:
            cleaned_paras.append(para)
        else:
            definitions += para + "\n\n"
            if verbosity != 0:
                print(seperator)
                print(para)
    if verbosity != 0:
        print()
    return cleaned_paras, definitions.strip()
def get_all_caps(text, verbosity=0):
    """
    Extracts all-caps content (runs of 50+ characters with no lowercase
    letters) from the License text.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with all caps sentences removed.
    all_caps : list
        A list of all caps sentences from the License text.
    """
    all_caps_pattern = r"([^a-z\n]{50,})"
    all_caps = []

    def _collect(match):
        # Record the all-caps run and return "" so re.sub removes it.
        # (The previous lambda returned None — list.append's return value —
        # which makes re.sub raise TypeError whenever the pattern matched.)
        all_caps.append(match.group(1))
        return ""

    text = re.sub(all_caps_pattern, _collect, text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    if all_caps and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following all caps were removed from the text:")
        print(all_caps)
        print()
    return text, all_caps
def get_exeptions(paras, remove_exceptions, verbosity=0):
    """
    Extracts a list of exceptions from the License text.

    (The "exeptions" spelling is kept for compatibility with callers.)

    Parameters
    ----------
    paras : list
        A list of paragraphs from the License text.
    remove_exceptions : bool
        Toggles whether or not to remove exceptions from the cleaned
        license text before summarization.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    non_exception_paras : list
        Paragraphs to keep; excludes exception paragraphs only when
        remove_exceptions is True.
    exceptions : list
        A list of all paragraphs containing exceptions from the License
        text.
    """
    non_exception_paras = []
    exceptions = []
    for para in paras:
        mentions_exception = "exception" in para.lower()
        if mentions_exception:
            exceptions.append(para)
        if not (mentions_exception and remove_exceptions):
            non_exception_paras.append(para)
    if exceptions and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following exceptions were found in the text:")
        for exception in exceptions:
            print(seperator)
            print(exception)
        print()
    return non_exception_paras, exceptions
def get_MIT_content(text):
    """
    Returns the content of MIT-like licenses segregated into categories
    (header, copyright, content, sentence, all_cap).

    Parameters
    ----------
    text : str
        Cleaned MIT License text.

    Returns
    -------
    dictionary
        Maps each content category to the matching paragraphs joined with
        blank lines.
    """
    mit_content = defaultdict(list)
    for para in split_paras(text):
        para = para.strip()
        if not para:
            continue
        word_count = len(para.split())
        mentions_license = "Licens" in para or "licens" in para
        # Classify the paragraph into exactly one bucket; order matters.
        if word_count <= 10 and mentions_license and "Copyright" not in para:
            bucket = "header"
        elif "Copyright" in para:
            bucket = "copyright+content" if "is hereby granted" in para else "copyright"
        elif "Permission is hereby granted" in para:
            bucket = "content"
        elif "The above copyright notice" in para or word_count < 18:
            bucket = "sentence"
        elif get_all_caps(para)[1]:
            bucket = "all_cap"
        else:
            bucket = "content"
        mit_content[bucket].append(para)
    # Flatten each bucket's paragraph list into one blank-line-joined string.
    for key in mit_content:
        mit_content[key] = "\n\n".join(mit_content[key])
    return mit_content
def get_most_likely_license_type(text):
    """
    Returns the most likely license type based on Doc2Vec scores
    (similarity > 0.9).

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The type of the most likely license. "Not Found" if no license
        score is above 0.9.
    """
    # Import location differs depending on how the package is run;
    # `except ImportError` replaces the old bare excepts, which would have
    # masked unrelated errors raised while importing.
    try:
        from src.doc2vec import inference
    except ImportError:
        from doc2vec import inference
    top1_result = inference(text).loc[0, :]
    if top1_result["Scores"] > 0.9:
        return top1_result["License"]
    return "Not Found"
def clean_license_text(text, remove_exceptions=False, verbosity=0):
    """
    Cleans License text.

    Full pipeline: strip author tags, remove scripts/markup, preprocess
    GNU/Apache boilerplate, split out definitions and exceptions, then
    normalize characters paragraph by paragraph.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool, optional
        Toggles whether or not to remove exceptions from the cleaned
        license. The default is False.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        Cleaned License text.
    definitions : str
        Definitions extracted from the License text.
    exceptions : str
        Exceptions extracted from the License text.
    """
    if not text:
        return text, "", ""
    text, author_details = extract_author_details(text, verbosity=verbosity)
    text = script_cleaner(text)
    text = preprocess_text(text)
    paras, definitions, exceptions = split_definitions_exceptions(
        text, remove_exceptions, verbosity=verbosity
    )
    # Character cleaning runs on the joined text; the PARA_BREAK sentinel
    # survives it and is turned back into paragraph breaks afterwards.
    text = character_cleaner(PARA_BREAK.join(paras))
    text = re.sub(PARA_BREAK, "\n\n", text).strip()
    # Non-English detection: texts whose tail is also non-ASCII are dropped.
    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", "", ""
    return text, definitions, " ".join(exceptions)