import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
from collections import defaultdict
PARA_BREAK = "para___break"
seperator = "=" * 50
verbosity = 0
def extract_author_details(text, verbosity=0):
"""
Extracts important author information from the license text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with author details removed.
author_details : list
A list of important author details.
"""
author_details_pattern = r"(@(author|license|copyright|package).*)"
author_details = list()
    # append() returns None, so "or ''" keeps the re.sub replacement a string
    text = re.sub(author_details_pattern, lambda m: author_details.append(m.group(1)) or "", text)
if author_details and verbosity != 0:
print(seperator)
print(seperator)
print("Following author details were extracted:")
print(seperator)
print(author_details)
print()
return text, author_details
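# A minimal usage sketch (hypothetical input): "@author"-style tags are
# collected into a list and stripped from the returned text, e.g.
#   extract_author_details("@author Jane Doe\nPermission is granted.")
#   -> ("\nPermission is granted.", ["@author Jane Doe"])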
def php_cleaner(text):
"""
Cleans the license file in PHP format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with PHP script removed.
"""
try:
        return re.findall(r"/\*[\S\s]*?\*/", text)[0]
    except IndexError:
return ""
# return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
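# Illustrative sketch (hypothetical input): only the first /* ... */ comment
# block of a PHP-style file is kept, e.g.
#   php_cleaner("<?php\n/* MIT License */\necho 'x';\n?>")  ->  "/* MIT License */"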
def html_cleaner(text):
"""
Cleans the license file in HTML format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with HTML script removed.
"""
soup = BeautifulSoup(text, features="html.parser")
    # soup.body can be None if the markup has no <body> element
    if soup.body is None:
        return ""
    text = soup.body.text
    if not text:
        return ""
    return text
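# Illustrative sketch (hypothetical input): only the text inside <body> survives,
# e.g. html_cleaner("<html><body><p>MIT License</p></body></html>") -> "MIT License"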
def json_cleaner(text_dict):
"""
Cleans the license file in JSON format.
Parameters
----------
text_dict : dict
        Dictionary as read from the raw License file.
Returns
-------
text : str
Cleaned License text with JSON format normalized to text.
"""
text = ""
for key in text_dict.keys():
if key in ("description", "license"):
text += key
text += ": "
text += str(text_dict[key])
text += ", "
return text
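# Illustrative sketch (hypothetical input): only the "description" and "license"
# keys are flattened into text, e.g.
#   json_cleaner({"name": "pkg", "license": "MIT"})  ->  "license: MIT, "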
def rtf_cleaner(text):
"""
Cleans the license file in RTF format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with RTF script removed.
"""
return rtf_to_text(text)
def url_cleaner(text):
"""
Removes URLs from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with URLs removed.
"""
return re.sub(r"\(?http\S+\)?", "", text)
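# Hypothetical example: URL tokens (optionally in parentheses) are removed, e.g.
#   url_cleaner("See https://example.com/license for details")
#   -> "See  for details"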
def email_cleaner(text):
"""
Removes emails from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with emails removed.
"""
return re.sub(r"[\w\._-]+@\w{2,}\.\w+", "", text)
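# Hypothetical example: simple e-mail addresses are stripped, e.g.
#   email_cleaner("Contact jane.doe@example.org with questions")
#   -> "Contact  with questions"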
def var_cleaner(text):
"""
Removes potential variable names from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with variable names removed.
"""
text = re.sub(r"\$\w+", "", text)
text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
# text = re.sub(r"[a-zA-Z\(\)_'\"]+\.[a-zA-Z_]+", "", text)
return text
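# Hypothetical example: "$var" tokens and simple {placeholder} blocks are dropped:
#   var_cleaner("Copyright $year {holder_name}")  ->  "Copyright  "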
def character_cleaner(text):
"""
Removes unnecessary special characters from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
Cleaned License text with some special characters removed.
"""
text = text.replace(PARA_BREAK, f" {PARA_BREAK} ")
text = url_cleaner(text)
text = text.replace(f" {PARA_BREAK} ", PARA_BREAK)
text = email_cleaner(text)
text = var_cleaner(text)
    text = re.sub(r"\s*(;quot;|&amp)\s*", " ", text)
    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[\. ]{2,}", ". ", text)
html_strs = [
"&rsquo;",
"&ldquo;",
"&middot;",
"&plusmn;",
"&hellip;",
"&sbquo;",
"&mdash;",
"&apos;",
"&trade;",
"&Dagger;",
"&bull;",
"&laquo;",
"&prime;",
"&quot;",
"&lsquo;",
"&asymp;",
"&Prime;",
"&frac12;",
"&sect;",
"&pound;",
"&cent;",
"&para;",
"&raquo;",
"&dagger;",
"&rdquo;",
"&euro;",
"&copy;",
"&bdquo;",
"&ndash;",
"&deg;",
"&reg;",
"&lt;",
"&gt;",
"&le;",
"&ge;",
"&ne;"
]
for html_str in html_strs:
text = re.sub(html_str, "", text)
return text
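# Hypothetical example: URLs, e-mails, HTML entities and runs of punctuation are
# stripped while "para___break" markers are preserved, e.g.
#   character_cleaner("Visit http://example.com &copy; 2021")
# leaves roughly "Visit 2021" (modulo leftover spaces).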
def isEnglish(s):
"""
Checks whether the License text is in English or not.
Parameters
----------
s : str
Raw License text.
Returns
-------
bool
True if complete License text is in English, False otherwise.
"""
try:
s.encode(encoding="utf-8").decode("ascii")
except UnicodeDecodeError:
return False
else:
return True
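# Hypothetical example: the check is effectively "is the text pure ASCII", so
#   isEnglish("MIT License")  ->  True
#   isEnglish("Ëxample")      ->  False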
def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
"""
    Extracts definitions and exceptions from the License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool
        True to remove exceptions from the License text, False otherwise.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of paragraphs from License text with definitions and exceptions
removed.
definitions : str
Definitions extracted from the License text.
exceptions : list
        A list of paragraphs which contain exceptions.
"""
definitions = ""
if "Definitions" in text:
try:
            def_pattern = r"([Ss]ection )?[0-9] ?[.\-–]? ?([Aa]dditional )?[Dd]efinitions"
            after_def_pattern = r"\s+(Section )?[0-9]\.? [.\-–]? ?[A-Z][a-z]+"
def_pos = re.search(def_pattern, text).span()
other_start_pos = re.search(after_def_pattern, text[def_pos[1]:]).span()[0]
definitions = text[def_pos[0]: def_pos[1] + other_start_pos].strip() + "\n\n"
text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]
        except AttributeError:
pass
paras, more_defs = extract_relevant_paras(
split_paras(text, verbosity=verbosity),
verbosity=verbosity
)
definitions += more_defs.strip()
definitions = "\n\n".join(split_paras(definitions, verbosity=verbosity))
paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
return paras, definitions, exceptions
def discard_text_after_end_tnc(text):
"""
    Discards text after "END OF TERMS AND CONDITIONS".
Parameters
----------
text : str
Raw License text.
Returns
-------
str
        License text with irrelevant information after "END OF TERMS AND CONDITIONS" removed.
"""
return text.split("END OF TERMS AND CONDITIONS")[0]
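# Hypothetical example: everything after the first marker is dropped, e.g.
#   discard_text_after_end_tnc("terms...\nEND OF TERMS AND CONDITIONS\nHow to apply")
#   -> "terms...\n"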
def clear_preamble(text):
"""
    Removes the Preamble from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
License text with Preamble removed.
"""
preamble_pattern = "Preamble"
    dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"
if preamble_pattern in text:
preamble_split = text.split(preamble_pattern)
if len(preamble_split) != 2:
return text
try:
after_preamble_end = re.split(dist_and_mod_pattern, preamble_split[1])[1]
if len(preamble_split[0]) > 100:
text = preamble_split[0] + after_preamble_end.strip()
        except IndexError:
pass
return text
def gnu_cleaner(text):
"""
    Cleans GNU-style License text by discarding the Preamble and any text after
    the end of the terms and conditions.
Parameters
----------
text : str
Raw License text.
Returns
-------
preamble_cleared_text : str
        License text with irrelevant information in Preamble and text after end
of terms and conditions removed.
"""
before_end_tnc = discard_text_after_end_tnc(text)
preamble_cleared_text = clear_preamble(before_end_tnc)
return preamble_cleared_text
def preprocess_text(text):
"""
Preprocesses License text considering different License types.
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
        License text with irrelevant information in Preamble and text after end
of terms and conditions removed.
"""
if "GNU" in text or "Apache" in text:
text = gnu_cleaner(text)
return text
def clean_if_else(text):
"""
    Removes preprocessor-style "#if ... #endif" blocks from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with if-else conditions removed.
"""
return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()
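# Hypothetical example: preprocessor-style blocks are removed entirely, e.g.
#   clean_if_else("Header\n#if 0\ndebug\n#endif\nFooter")  ->  "Header\nFooter"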
def clean_comments(text):
"""
    Cleans specific comment formats from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
        Cleaned License text with comments removed.
"""
return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()
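# Hypothetical example: triple-quote / triple-backtick fenced blocks are removed:
#   clean_comments('Before\n"""block comment"""\nAfter')  ->  'Before\n\nAfter'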
def script_cleaner(text):
"""
Cleans the script text from License text to extract the main content.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text without scripts.
"""
try:
if "<?php" in text:
text = php_cleaner(text)
elif "</html>" in text:
text = html_cleaner(text)
elif "\\rtf" in text:
text = rtf_cleaner(text)
elif text[0] == "{" and text[-1] == "}":
text = json_cleaner(json.loads(text))
    except Exception:
pass
if not text:
return ""
text = clean_if_else(text)
text = clean_comments(text)
return text
def split_paras(text, verbosity=0):
"""
Splits the text into paragraphs.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of split paragraphs.
"""
text = re.sub(r"\n{4,}", "\n"*4, text)
if len(re.findall("\n\n\n\n", text)) >= 2:
paras = text.split("\n\n\n\n")
paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
elif len(re.findall("\n\n", text)) >= 2:
paras = text.split("\n\n")
paras = [re.sub(r"\n", " ", para) for para in paras]
elif len(re.findall("\n", text)) >= 2:
paras = text.split("\n")
else:
paras = [text]
paras = [para.strip() for para in paras]
if verbosity != 0:
print(seperator)
print(seperator)
print("These are the split paras in the text:")
for para in paras:
if not para.strip():
continue
print(seperator)
print(para)
print()
return paras
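# Hypothetical example: with two or more blank-line separators, the text is
# split on them and remaining single newlines are collapsed into spaces, e.g.
#   split_paras("Para one\nline two\n\nPara two\n\nPara three")
#   -> ["Para one line two", "Para two", "Para three"]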
def extract_relevant_paras(paras, verbosity=0):
"""
Extracts relevant paragraphs from the list of all paragraphs.
Parameters
----------
paras : list
A list of split paragraphs.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
cleaned_paras : list
A list of relevant paragraphs.
definitions : str
Definition text as extracted by the "clean_definitions_pattern", which
        is to be appended to other definitions in the License text, if any.
"""
cleaned_paras = list()
definitions = ""
clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
if verbosity != 0:
print(seperator)
print(seperator)
print("Following paragraphs were considered unnecessary and removed:")
for para in paras:
if not para.strip():
continue
if re.search(clean_definitions_pattern, para):
definitions += para + "\n\n"
if verbosity != 0:
print(seperator)
print(para)
else:
cleaned_paras.append(para)
if verbosity != 0:
print()
definitions = definitions.strip()
return cleaned_paras, definitions
def get_all_caps(text, verbosity=0):
"""
    Extracts all-caps content from the License text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with all caps sentences removed.
all_caps : list
A list of all caps sentences from the License text.
"""
all_caps_pattern = r"([^a-z\n]{50,})"
all_caps = list()
    # append() returns None, so "or ''" keeps the re.sub replacement a string
    text = re.sub(all_caps_pattern, lambda m: all_caps.append(m.group(1)) or "", text)
text = re.sub(r"\n{3,}", "\n\n", text)
if all_caps and verbosity != 0:
print(seperator)
print(seperator)
print("Following all caps were removed from the text:")
print(all_caps)
print()
return text, all_caps
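# Hypothetical example: runs of 50+ characters with no lowercase letters
# (typically warranty disclaimers) are pulled out of the text, e.g.
#   get_all_caps("Granted.\nTHE SOFTWARE IS PROVIDED WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED.\nEnd.")
#   -> ("Granted.\n\nEnd.", ["THE SOFTWARE IS PROVIDED WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED."])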
def get_exeptions(paras, remove_exceptions, verbosity=0):
"""
Extracts a list of exceptions from the License text.
Parameters
----------
paras : list
A list of paragraphs from the License text.
remove_exceptions : bool
Toggles whether or not to remove exceptions from the cleaned license
text before summarization.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
non_exception_paras : list
A list of all paragraphs not containing exceptions from the License text.
exceptions : list
A list of all paragraphs containing exceptions from the License text.
"""
non_exception_paras = list()
exceptions = list()
for para in paras:
if re.search("exception", para.lower()):
exceptions.append(para)
if not remove_exceptions:
non_exception_paras.append(para)
else:
non_exception_paras.append(para)
if exceptions and verbosity != 0:
print(seperator)
print(seperator)
print("Following exceptions were found in the text:")
for exception in exceptions:
print(seperator)
print(exception)
print()
return non_exception_paras, exceptions
def get_MIT_content(text):
"""
    Returns the content of MIT-like licenses segregated into categories like
Copyright, main content, etc.
Parameters
----------
text : str
Cleaned MIT License text.
Returns
-------
dictionary
        A dictionary of content from the MIT license. Keys are the types of
        content and values are the corresponding License text segments.
"""
paras = split_paras(text)
mit_content = defaultdict(list)
for para in paras:
para = para.strip()
if len(para) < 1:
continue
if len(para.split()) <= 10 and ("Licens" in para or "licens" in para) and "Copyright" not in para:
mit_content["header"].append(para)
elif "Copyright" in para:
if "is hereby granted" in para:
mit_content["copyright+content"].append(para)
else:
mit_content["copyright"].append(para)
elif "Permission is hereby granted" in para:
mit_content["content"].append(para)
elif "The above copyright notice" in para or len(para.split()) < 18:
mit_content["sentence"].append(para)
elif get_all_caps(para)[1]:
mit_content["all_cap"].append(para)
else:
mit_content["content"].append(para)
for key, value in mit_content.items():
mit_content[key] = "\n\n".join(value)
return mit_content
def get_most_likely_license_type(text):
"""
Returns the most likely license type based on Doc2Vec scores
(similarity > 0.9).
Parameters
----------
text : str
Raw License text.
Returns
-------
str
        The type of the most likely license. "Not Found" if no license score
        is above 0.9.
"""
try:
from src.doc2vec import inference
    except ImportError:
from doc2vec import inference
top1_result = inference(text).loc[0, :]
if top1_result["Scores"] > 0.9:
return top1_result["License"]
else:
return "Not Found"
def clean_license_text(text, remove_exceptions=False, verbosity=0):
"""
Cleans License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool, optional
Toggles whether or not to remove exceptions from the cleaned license.
The default is False.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
Cleaned License text.
definitions : str
Definitions extracted from the License text.
exceptions : str
Exceptions extracted from the License text.
"""
if len(text) == 0:
return text, "", ""
text, author_details = extract_author_details(text, verbosity=verbosity)
text = script_cleaner(text)
text = preprocess_text(text)
paras, definitions, exceptions = split_definitions_exceptions(
text, remove_exceptions, verbosity=verbosity
)
text = PARA_BREAK.join(paras)
text = character_cleaner(text)
text = re.sub(PARA_BREAK, "\n\n", text)
text = text.strip()
if not isEnglish(text):
if not isEnglish(" ".join(text.split()[-5:-1])):
return "", "", ""
exceptions = " ".join(exceptions)
return text, definitions, exceptions
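if __name__ == "__main__":
    # Minimal demo sketch with a hypothetical MIT-style notice; the name and
    # text below are made up purely for illustration.
    sample = (
        "Copyright (c) 2021 Jane Doe\n\n"
        "Permission is hereby granted, free of charge, to any person obtaining "
        "a copy of this software, to deal in the Software without restriction.\n\n"
        'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.'
    )
    cleaned, definitions, exceptions = clean_license_text(sample)
    print(cleaned)
    print(definitions)
    print(exceptions)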