Nihal D'Souza
Final app release
e41b03f
import numpy as np
import pandas as pd
import glob
from collections import defaultdict
# Locations of the data files, relative to the working directory.
data_directory = "data/"
gold_licenses_data = "data/gold_licenses/"

# Column of the labels CSV that is used as the dataframe index.
index_col = "license_name"

# Names of the label columns.
COMM_USE = "commercial-use"
DIST = "distribution"
MODS = "modifications"
PAT_USE = "patent-use"
PVT_USE = "private-use"
DISC_SRC = "disclose-source"
INCL_CPYRT = "include-copyright"
INCL_CPYRT_SRC = "include-copyright--source"
NW_USE_DISC = "network-use-disclose"
SAME_LIC = "same-license"
SAME_LIC_FILE = "same-license--file"
SAME_LIC_LIB = "same-license--library"
DOC_CHNG = "document-changes"
LIABILITY = "liability"
TRDMRK_USE = "trademark-use"
WRNTY = "warranty"

# Values a label column can take.
PERMISSIONS = "permissions"
CONDITIONS = "conditions"
LIMITATIONS = "limitations"

# Name of the column holding the license summary.
SUMMARY = "summary"

# Sentence describing each label. Every entry is a plain string except
# PAT_USE, whose sentence depends on whether the label is a permission or a
# limitation.
summary_terms_dict = {
    COMM_USE: "The licensed material and derivatives may be used for commercial purposes.",
    DIST: "The licensed material may be distributed.",
    MODS: "The licensed material may be modified.",
    PAT_USE: {
        PERMISSIONS: (
            "This license provides an express grant of patent rights from "
            "contributors."
        ),
        LIMITATIONS: (
            "This license explicitly states that it does NOT grant any rights "
            "in the patents of contributors."
        ),
    },
    PVT_USE: "The licensed material may be used and modified in private.",
    DISC_SRC: (
        "Source code must be made available when the licensed material is "
        "distributed."
    ),
    INCL_CPYRT: (
        "A copy of the license and copyright notice must be included with the "
        "licensed material."
    ),
    INCL_CPYRT_SRC: (
        "A copy of the license and copyright notice must be included with the "
        "licensed material in source form, but is not required for binaries."
    ),
    NW_USE_DISC: (
        "Users who interact with the licensed material via network are given "
        "the right to receive a copy of the source code."
    ),
    SAME_LIC: (
        "Modifications must be released under the same license when "
        "distributing the licensed material. In some cases a similar or "
        "related license may be used."
    ),
    SAME_LIC_FILE: (
        "Modifications of existing files must be released under the same "
        "license when distributing the licensed material. In some cases a "
        "similar or related license may be used."
    ),
    SAME_LIC_LIB: (
        "Modifications must be released under the same license when "
        "distributing the licensed material. In some cases a similar or "
        "related license may be used, or this condition may not apply to "
        "works that use the licensed material as a library."
    ),
    DOC_CHNG: "Changes made to the licensed material must be documented.",
    LIABILITY: "This license includes a limitation of liability.",
    TRDMRK_USE: (
        "This license explicitly states that it does NOT grant trademark "
        "rights, even though licenses without such a statement probably do "
        "not grant any implicit trademark rights."
    ),
    WRNTY: "This license explicitly states that it does NOT provide any warranty.",
}
def read_file(file_path):
    """
    Returns the complete text stored in a file.

    Parameters
    ----------
    file_path : str
        Location of the file to load.

    Returns
    -------
    content : str
        Everything the file contains, decoded as UTF-8.

    """
    with open(file_path, encoding="utf-8") as source:
        return source.read()
def augment_summary(license_data):
    """
    Augments summary with description of labels from choosealicense as stored
    in summary_terms_dict object.

    Every summary is stripped of surrounding whitespace; then, for each label
    column whose value equals the category that activates it, the matching
    sentence from summary_terms_dict is appended, separated by one space.

    Parameters
    ----------
    license_data : pandas.DataFrame
        The license data with original license texts and summaries. It must
        contain the summary column and every label column. The summary
        column is overwritten in place.

    Returns
    -------
    license_data : pandas.DataFrame
        The license data with original license texts and augmented summaries
        (the same object that was passed in).

    """
    # (label column, value that activates it, sentence to append), in the
    # order the sentences are appended.
    # network-use-disclose is a condition and trademark-use / warranty are
    # limitations (see the column groups in fix_labels), so they are matched
    # against those categories; comparing them with "permissions" could
    # never succeed.
    rules = [
        (COMM_USE, PERMISSIONS, summary_terms_dict[COMM_USE]),
        (DIST, PERMISSIONS, summary_terms_dict[DIST]),
        (MODS, PERMISSIONS, summary_terms_dict[MODS]),
        (PAT_USE, PERMISSIONS, summary_terms_dict[PAT_USE][PERMISSIONS]),
        (PAT_USE, LIMITATIONS, summary_terms_dict[PAT_USE][LIMITATIONS]),
        (PVT_USE, PERMISSIONS, summary_terms_dict[PVT_USE]),
        (DISC_SRC, CONDITIONS, summary_terms_dict[DISC_SRC]),
        (INCL_CPYRT, CONDITIONS, summary_terms_dict[INCL_CPYRT]),
        (INCL_CPYRT_SRC, CONDITIONS, summary_terms_dict[INCL_CPYRT_SRC]),
        (NW_USE_DISC, CONDITIONS, summary_terms_dict[NW_USE_DISC]),
        (SAME_LIC, CONDITIONS, summary_terms_dict[SAME_LIC]),
        (SAME_LIC_FILE, CONDITIONS, summary_terms_dict[SAME_LIC_FILE]),
        (SAME_LIC_LIB, CONDITIONS, summary_terms_dict[SAME_LIC_LIB]),
        (DOC_CHNG, CONDITIONS, summary_terms_dict[DOC_CHNG]),
        (LIABILITY, LIMITATIONS, summary_terms_dict[LIABILITY]),
        (TRDMRK_USE, LIMITATIONS, summary_terms_dict[TRDMRK_USE]),
        (WRNTY, LIMITATIONS, summary_terms_dict[WRNTY]),
    ]

    augmented_summaries = []
    for _, row in license_data.iterrows():
        summary = row[SUMMARY]
        # A license without a summary file ends up with a missing (NaN)
        # summary; start from an empty string instead of failing on .strip().
        summary = summary.strip() if isinstance(summary, str) else ""
        for column, trigger, sentence in rules:
            if row[column] == trigger:
                summary += f" {sentence}"
        augmented_summaries.append(summary)

    # The rows yielded by iterrows() may be copies, so writing to them is not
    # guaranteed to reach the dataframe; assign the whole column explicitly.
    license_data[SUMMARY] = augmented_summaries
    return license_data
def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
    """
    Reads the license texts and summaries from the gold licenses directory
    and joins them onto the labels read from the labels file.

    Both the gold licenses directory and the labels file are looked up
    relative to the current directory first and, failing that, relative to
    its parent directory.

    Parameters
    ----------
    labels_file : str, optional
        The filename containing labels for choosealicense licenses.
        The default is "choosealicense_appendix_labels.csv".
    drop_summary : bool, optional
        A flag to decide whether or not to drop summary before returning the
        data. The default is False.

    Returns
    -------
    merged_data : pandas.DataFrame or None
        Merged data with choosealicense licenses, summaries and labels,
        indexed by the index_col column of the labels file. None (after
        printing a message) when the gold licenses or the labels file cannot
        be found.

    """
    import os  # local import: only needed for the path handling below

    files = []
    for prefix in ("", "../"):
        files = glob.glob(f"{prefix}{gold_licenses_data}*")
        if files:
            break
    if not files:
        print("Gold licenses not found, please check the path again!")
        return None

    # "<name>.summary" and "<name>.txt" populate the "summary" and "text"
    # entries of license <name>; files with any other extension are ignored.
    suffix_to_field = {".summary": "summary", ".txt": "text"}
    data_dict = defaultdict(dict)
    for file_path in files:
        base_name = os.path.basename(file_path)
        for suffix, field in suffix_to_field.items():
            if base_name.endswith(suffix):
                license_key = base_name[:-len(suffix)]
                data_dict[license_key][field] = read_file(file_path)
                break
    # One row per license, one column per field.
    summary_df = pd.DataFrame(data_dict).T

    labels_df = None
    for prefix in ("", "../"):
        try:
            labels_df = pd.read_csv(f"{prefix}{data_directory}{labels_file}", index_col=index_col)
        except FileNotFoundError:
            # Only a missing file triggers the fallback; any other failure
            # (e.g. a malformed CSV) is a real error and must not be
            # reported as "not found".
            continue
        break
    if labels_df is None:
        print("Labels file not found, please check the path again!")
        return None

    # The "spdx_id" column is kept on purpose: read_license_text_data looks
    # licenses up by it.
    merged_data = labels_df.join(summary_df)
    if drop_summary:
        merged_data = merged_data.drop(columns=["summary"])
    return merged_data
def read_license_summary_data(aug_summary=False):
    """
    Reads the licenses and summaries from license files, augments summaries
    if required and return all data as a single object.

    Parameters
    ----------
    aug_summary : bool, optional
        A flag to decide whether or not to augment summaries with label
        descriptions before returning the data. The default is False.

    Returns
    -------
    license_data : pandas.DataFrame or None
        License data restricted to the "text" and "summary" columns. None
        when read_license_data could not locate its input files.

    """
    license_data = read_license_data()
    if license_data is None:
        # read_license_data has already printed what is missing; pass its
        # None on instead of failing with an unrelated TypeError below.
        return None
    if aug_summary:
        license_data = augment_summary(license_data)
    return license_data[["text", "summary"]]
def read_license_labels_data():
    """
    Reads licenses along with their labels.

    Returns
    -------
    pandas.DataFrame or None
        A dataframe with Licenses and their labels (everything
        read_license_data returns except the "summary" column). None when
        read_license_data could not locate its input files.

    """
    # drop_summary=True performs the same drop(columns=["summary"]) as doing
    # it here, but also lets a None result pass through instead of raising
    # AttributeError on it.
    return read_license_data(drop_summary=True)
def read_license_text_data(license_name):
    """
    Reads the license text for a given license.

    Parameters
    ----------
    license_name : str
        Identifier of the license; it is compared against the "spdx_id"
        column of the license data.

    Returns
    -------
    str or None
        License text of the first license whose "spdx_id" equals
        license_name. None when read_license_data could not locate its
        input files.

    Raises
    ------
    IndexError
        If no license has the given "spdx_id".

    """
    license_data = read_license_data(drop_summary=True)
    if license_data is None:
        # read_license_data has already printed what is missing.
        return None
    matching_texts = license_data[license_data["spdx_id"] == license_name]["text"].values
    if len(matching_texts) == 0:
        # Same exception type as indexing the empty result would raise, but
        # with a message that names the actual problem.
        raise IndexError(f"No license with spdx_id {license_name!r} found.")
    return matching_texts[0]
def fix_labels(license_data):
    """
    Replaces the textual labels of license_data with integer ids.

    Parameters
    ----------
    license_data : pandas.DataFrame
        License data containing all of the label columns listed below. The
        label columns are overwritten in place.

    Returns
    -------
    license_data : pandas.DataFrame
        The same dataframe, with its label columns converted to ids.

    """
    # Each entry pairs a group of label columns with the label -> id mapping
    # applied to that group.
    label_groups = (
        # Permission columns: only "permissions" is mapped (to 0); there is
        # no entry for missing values, so those are left untouched.
        (
            ["commercial-use", "distribution", "modifications", "private-use"],
            {"permissions": 0},
        ),
        # Condition columns: missing -> 0, "conditions" -> 1.
        (
            [
                "disclose-source",
                "document-changes",
                "include-copyright",
                "include-copyright--source",
                "network-use-disclose",
                "same-license",
                "same-license--file",
                "same-license--library",
            ],
            {np.nan: 0, "conditions": 1},
        ),
        # Limitation columns: missing -> 0, "limitations" -> 1.
        (
            ["liability", "trademark-use", "warranty"],
            {np.nan: 0, "limitations": 1},
        ),
        # Columns that can hold either a permission or a limitation.
        (
            ["patent-use"],
            {np.nan: 0, "permissions": 1, "limitations": 2},
        ),
    )
    # A further group for a "GTLC_Permissive" column (missing -> 0,
    # "permissive" -> 1, "not_permissive" -> 2) is currently disabled.

    for columns, label_ids in label_groups:
        license_data[columns] = license_data[columns].replace(label_ids)
    return license_data