import glob
from collections import defaultdict

import numpy as np
import pandas as pd

# Locations of the data files, relative to the working directory. The readers
# below also retry each location one directory up ("../").
data_directory = "data/"
gold_licenses_data = "data/gold_licenses/"
index_col = "license_name"

# Label column names.
COMM_USE = "commercial-use"
DIST = "distribution"
MODS = "modifications"
PAT_USE = "patent-use"
PVT_USE = "private-use"
DISC_SRC = "disclose-source"
INCL_CPYRT = "include-copyright"
INCL_CPYRT_SRC = "include-copyright--source"
NW_USE_DISC = "network-use-disclose"
SAME_LIC = "same-license"
SAME_LIC_FILE = "same-license--file"
SAME_LIC_LIB = "same-license--library"
DOC_CHNG = "document-changes"
LIABILITY = "liability"
TRDMRK_USE = "trademark-use"
WRNTY = "warranty"

# Values a label column can take.
PERMISSIONS = "permissions"
CONDITIONS = "conditions"
LIMITATIONS = "limitations"

SUMMARY = "summary"

# Human-readable description of every label. "patent-use" is the only label
# with two possible values, so its entry is keyed by that value.
summary_terms_dict = {
    COMM_USE: "The licensed material and derivatives may be used for commercial purposes.",
    DIST: "The licensed material may be distributed.",
    MODS: "The licensed material may be modified.",
    PAT_USE: {
        PERMISSIONS: "This license provides an express grant of patent rights from contributors.",
        LIMITATIONS: "This license explicitly states that it does NOT grant any rights in the patents of contributors.",
    },
    PVT_USE: "The licensed material may be used and modified in private.",
    DISC_SRC: "Source code must be made available when the licensed material is distributed.",
    INCL_CPYRT: "A copy of the license and copyright notice must be included with the licensed material.",
    INCL_CPYRT_SRC: "A copy of the license and copyright notice must be included with the licensed material in source form, but is not required for binaries.",
    NW_USE_DISC: "Users who interact with the licensed material via network are given the right to receive a copy of the source code.",
    SAME_LIC: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_FILE: "Modifications of existing files must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_LIB: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used, or this condition may not apply to works that use the licensed material as a library.",
    DOC_CHNG: "Changes made to the licensed material must be documented.",
    LIABILITY: "This license includes a limitation of liability.",
    TRDMRK_USE: "This license explicitly states that it does NOT grant trademark rights, even though licenses without such a statement probably do not grant any implicit trademark rights.",
    WRNTY: "This license explicitly states that it does NOT provide any warranty.",
}

# (label column, value that triggers its description), in the order in which
# the descriptions are appended to a summary. The value expected for each
# column matches the column grouping used by fix_labels().
_SUMMARY_RULES = (
    (COMM_USE, PERMISSIONS),
    (DIST, PERMISSIONS),
    (MODS, PERMISSIONS),
    (PAT_USE, PERMISSIONS),
    (PAT_USE, LIMITATIONS),
    (PVT_USE, PERMISSIONS),
    (DISC_SRC, CONDITIONS),
    (INCL_CPYRT, CONDITIONS),
    (INCL_CPYRT_SRC, CONDITIONS),
    (NW_USE_DISC, CONDITIONS),
    (SAME_LIC, CONDITIONS),
    (SAME_LIC_FILE, CONDITIONS),
    (SAME_LIC_LIB, CONDITIONS),
    (DOC_CHNG, CONDITIONS),
    (LIABILITY, LIMITATIONS),
    (TRDMRK_USE, LIMITATIONS),
    (WRNTY, LIMITATIONS),
)


def read_file(file_path):
    """
    Reads data from the given file path

    Parameters
    ----------
    file_path : str
        Path of file from where data is to be read.

    Returns
    -------
    content : str
        Data read from the file at given file_path (decoded as UTF-8).

    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    return content


def augment_summary(license_data):
    """
    Augments summary with description of labels from choosealicense as stored
    in summary_terms_dict object.

    Each summary is stripped of surrounding whitespace and then, for every
    label column whose value matches the entry in _SUMMARY_RULES, the
    corresponding description is appended, separated by a single space.
    Summaries that are not strings (e.g. NaN for a license without a summary
    file) are left untouched.

    Parameters
    ----------
    license_data : pandas.DataFrame
        The license data with original license texts and summaries. It must
        contain the summary column and every label column. The summary column
        is replaced in place.

    Returns
    -------
    license_data : pandas.DataFrame
        The license data with original license texts and augmented summaries
        (the same object that was passed in).

    """
    augmented = []
    for _, row in license_data.iterrows():
        summary = row[SUMMARY]
        if isinstance(summary, str):
            parts = [summary.strip()]
            for column, label in _SUMMARY_RULES:
                if row[column] == label:
                    description = summary_terms_dict[column]
                    if isinstance(description, dict):
                        description = description[label]
                    parts.append(description)
            summary = " ".join(parts)
        augmented.append(summary)

    # Rows yielded by iterrows() may be copies, so writing to them is not
    # guaranteed to reach the frame; assign the whole column once instead.
    license_data[SUMMARY] = augmented
    return license_data


def _base_name(file_path):
    """Return the last component of file_path, accepting both / and \\."""
    return file_path.replace("\\", "/").rsplit("/", 1)[-1]


def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
    """
    Reads the gold license texts and summaries together with the labels file
    and merges them into a single dataframe.

    Files are looked up under gold_licenses_data / data_directory relative to
    the working directory first and one directory up ("../") second.

    Parameters
    ----------
    labels_file : str, optional
        The filename containing labels for choosealicense licenses.
        The default is "choosealicense_appendix_labels.csv".
    drop_summary : bool, optional
        A flag to decide whether or not to drop summary before returning the
        data. The default is False.

    Returns
    -------
    merged_data : pandas.DataFrame or None
        Merged data with choosealicense licenses, summaries and labels, or
        None (after printing a message) when the gold licenses or the labels
        file cannot be found.

    """
    files = []
    for pattern in (gold_licenses_data + "*", f"../{gold_licenses_data}" + "*"):
        files = glob.glob(pattern)
        if files:
            break
    if not files:
        print("Gold licenses not found, please check the path again!")
        return None

    # license name -> {"summary": ..., "text": ...}
    data_dict = defaultdict(dict)
    for file_path in files:
        file_name = _base_name(file_path)
        if file_name.endswith(".summary"):
            data_dict[file_name[:-len(".summary")]]["summary"] = read_file(file_path)
        elif file_name.endswith(".txt"):
            data_dict[file_name[:-len(".txt")]]["text"] = read_file(file_path)

    # One row per license, one column per file kind.
    summary_df = pd.DataFrame(data_dict).T

    labels_df = None
    for base in (data_directory, f"../{data_directory}"):
        try:
            labels_df = pd.read_csv(base + labels_file, index_col=index_col)
        except FileNotFoundError:
            # Only a missing file triggers the fallback; parse errors propagate.
            continue
        break
    if labels_df is None:
        print("Labels file not found, please check the path again!")
        return None

    # TODO: Check if this is breaking anything
    # The "spdx_id" column is kept: read_license_text_data() filters on it.
    merged_data = labels_df.join(summary_df)

    if drop_summary:
        merged_data = merged_data.drop(columns=["summary"])

    return merged_data


def read_license_summary_data(aug_summary=False):
    """
    Reads the licenses and summaries from license files, augments summaries if
    required and return all data as a single object.

    Parameters
    ----------
    aug_summary : bool, optional
        A flag to decide whether or not to augment summaries with label
        descriptions before returning the data. The default is False.

    Returns
    -------
    license_data : pandas.DataFrame
        License data with choosealicense licenses and summaries (only the
        "text" and "summary" columns).

    """
    license_data = read_license_data()
    if aug_summary:
        license_data = augment_summary(license_data)
    license_data = license_data[["text", "summary"]]
    return license_data


def read_license_labels_data():
    """
    Reads licenses along with their labels.

    Returns
    -------
    pandas.DataFrame
        A dataframe with Licenses and their labels (the "summary" column is
        dropped).

    """
    return read_license_data().drop(columns=["summary"])


def read_license_text_data(license_name):
    """
    Reads License data for a given license_name

    Parameters
    ----------
    license_name : str
        Name of a license from choosealicense list. It is matched against the
        "spdx_id" column.

    Returns
    -------
    str
        License text for the given license_name (first match).

    """
    license_data = read_license_data(drop_summary=True)
    matches = license_data[license_data["spdx_id"] == license_name]
    return matches["text"].values[0]


def fix_labels(license_data):
    """
    Update labels in the given dataframe license_data with their integer ids.

    Parameters
    ----------
    license_data : pandas.DataFrame
        Dataframe consisting of license data with labels. The label columns
        are overwritten in place.

    Returns
    -------
    license_data : pandas.DataFrame
        Dataframe consisting of license data with updated labels.

    """
    # NOTE(review): unlike the other maps, "permissions" maps to 0 and NaN is
    # not mapped at all - confirm this is intended.
    permissions_map = {
        "permissions": 0
    }
    conditions_map = {
        np.nan: 0,
        "conditions": 1
    }
    limitations_map = {
        np.nan: 0,
        "limitations": 1
    }
    permissions_limitations_map = {
        np.nan: 0,
        "permissions": 1,
        "limitations": 2
    }
    # permissive_not_permissive_map = {
    #     np.nan: 0,
    #     "permissive": 1,
    #     "not_permissive": 2
    # }

    permissions_columns = [COMM_USE, DIST, MODS, PVT_USE]
    conditions_columns = [
        DISC_SRC,
        DOC_CHNG,
        INCL_CPYRT,
        INCL_CPYRT_SRC,
        NW_USE_DISC,
        SAME_LIC,
        SAME_LIC_FILE,
        SAME_LIC_LIB,
    ]
    limitations_columns = [LIABILITY, TRDMRK_USE, WRNTY]
    permissions_limitations_columns = [PAT_USE]
    # permissive_not_permissive_columns = [
    #     "GTLC_Permissive"
    # ]

    license_data[permissions_columns] = license_data[permissions_columns].replace(permissions_map)
    license_data[conditions_columns] = license_data[conditions_columns].replace(conditions_map)
    license_data[limitations_columns] = license_data[limitations_columns].replace(limitations_map)
    license_data[permissions_limitations_columns] = license_data[permissions_limitations_columns].replace(permissions_limitations_map)
    # license_data[permissive_not_permissive_columns] = license_data[permissive_not_permissive_columns].replace(permissive_not_permissive_map)

    return license_data