Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

clearlydefined_license_summarizer / src /read_data.py

Nihal D'Souza

Final app release

e41b03f almost 2 years ago

No virus

11.1 kB

	import numpy as np
	import pandas as pd

	import glob
	from collections import defaultdict


	# Relative locations of the data files. read_license_data first tries these
	# paths as given and then retries with a "../" prefix.
	data_directory = "data/"
	gold_licenses_data = "data/gold_licenses/"
	# Column of the labels CSV that read_license_data uses as the DataFrame index.
	index_col = "license_name"


	# Names of the label columns looked up on each license row.
	COMM_USE = "commercial-use"
	DIST = "distribution"
	MODS = "modifications"
	PAT_USE = "patent-use"
	PVT_USE = "private-use"
	DISC_SRC = "disclose-source"
	INCL_CPYRT = "include-copyright"
	INCL_CPYRT_SRC = "include-copyright--source"
	NW_USE_DISC = "network-use-disclose"
	SAME_LIC = "same-license"
	SAME_LIC_FILE = "same-license--file"
	SAME_LIC_LIB = "same-license--library"
	DOC_CHNG = "document-changes"
	LIABILITY = "liability"
	TRDMRK_USE = "trademark-use"
	WRNTY = "warranty"

	# Category values that the cells of the label columns are compared against.
	PERMISSIONS = "permissions"
	CONDITIONS = "conditions"
	LIMITATIONS = "limitations"

	# Name of the column holding the license summary text.
	SUMMARY = "summary"

	# Description sentence for each label column; augment_summary appends these
	# to a license summary. Every value is a plain string except PAT_USE, which
	# maps to a nested dict keyed by category because augment_summary handles
	# patent-use both as a permission and as a limitation.
	summary_terms_dict = {
	COMM_USE: "The licensed material and derivatives may be used for commercial purposes.",
	DIST: "The licensed material may be distributed.",
	MODS: "The licensed material may be modified.",
	PAT_USE: {
	PERMISSIONS: "This license provides an express grant of patent rights from contributors.",
	LIMITATIONS: "This license explicitly states that it does NOT grant any rights in the patents of contributors."
	},
	PVT_USE: "The licensed material may be used and modified in private.",
	DISC_SRC: "Source code must be made available when the licensed material is distributed.",
	INCL_CPYRT: "A copy of the license and copyright notice must be included with the licensed material.",
	INCL_CPYRT_SRC: "A copy of the license and copyright notice must be included with the licensed material in source form, but is not required for binaries.",
	NW_USE_DISC: "Users who interact with the licensed material via network are given the right to receive a copy of the source code.",
	SAME_LIC: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
	SAME_LIC_FILE: "Modifications of existing files must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
	SAME_LIC_LIB: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used, or this condition may not apply to works that use the licensed material as a library.",
	DOC_CHNG: "Changes made to the licensed material must be documented.",
	LIABILITY: "This license includes a limitation of liability.",
	TRDMRK_USE: "This license explicitly states that it does NOT grant trademark rights, even though licenses without such a statement probably do not grant any implicit trademark rights.",
	WRNTY: "This license explicitly states that it does NOT provide any warranty."
	}


	def read_file(file_path):
	    """
	    Return the full contents of a UTF-8 encoded text file.

	    Parameters
	    ----------
	    file_path : str
	        Location of the file to read.

	    Returns
	    -------
	    str
	        Everything stored in the file, decoded as UTF-8.

	    """
	    # Text mode is the default for open(); the handle is closed on exit
	    # from the with-block.
	    with open(file_path, encoding="utf-8") as handle:
	        return handle.read()


	def augment_summary(license_data):
	    """
	    Augment each license summary with the descriptions of its labels.

	    For every row, the stripped summary is extended with the sentence from
	    summary_terms_dict of each label column whose cell holds the category
	    expected for that label. Sentences are separated by a single space and
	    appended in a fixed order.

	    Parameters
	    ----------
	    license_data : pandas.DataFrame
	        License data containing the summary column and one column per label.

	    Returns
	    -------
	    license_data : pandas.DataFrame
	        The same DataFrame object, with its summary column replaced by the
	        augmented summaries.

	    """
	    # (label column, category that triggers the sentence, sentence to append).
	    # Categories follow the grouping used by fix_labels: network-use-disclose
	    # is a condition, trademark-use and warranty are limitations.
	    label_rules = (
	        (COMM_USE, PERMISSIONS, summary_terms_dict[COMM_USE]),
	        (DIST, PERMISSIONS, summary_terms_dict[DIST]),
	        (MODS, PERMISSIONS, summary_terms_dict[MODS]),
	        # patent-use is the only label with two possible categories; a cell
	        # holds at most one of them, so at most one sentence is added.
	        (PAT_USE, PERMISSIONS, summary_terms_dict[PAT_USE][PERMISSIONS]),
	        (PAT_USE, LIMITATIONS, summary_terms_dict[PAT_USE][LIMITATIONS]),
	        (PVT_USE, PERMISSIONS, summary_terms_dict[PVT_USE]),
	        (DISC_SRC, CONDITIONS, summary_terms_dict[DISC_SRC]),
	        (INCL_CPYRT, CONDITIONS, summary_terms_dict[INCL_CPYRT]),
	        (INCL_CPYRT_SRC, CONDITIONS, summary_terms_dict[INCL_CPYRT_SRC]),
	        (NW_USE_DISC, CONDITIONS, summary_terms_dict[NW_USE_DISC]),
	        (SAME_LIC, CONDITIONS, summary_terms_dict[SAME_LIC]),
	        (SAME_LIC_FILE, CONDITIONS, summary_terms_dict[SAME_LIC_FILE]),
	        (SAME_LIC_LIB, CONDITIONS, summary_terms_dict[SAME_LIC_LIB]),
	        (DOC_CHNG, CONDITIONS, summary_terms_dict[DOC_CHNG]),
	        (LIABILITY, LIMITATIONS, summary_terms_dict[LIABILITY]),
	        (TRDMRK_USE, LIMITATIONS, summary_terms_dict[TRDMRK_USE]),
	        (WRNTY, LIMITATIONS, summary_terms_dict[WRNTY]),
	    )

	    augmented = []
	    for _, row in license_data.iterrows():
	        summary = row[SUMMARY]
	        # A license without a summary file has a missing (non-string) summary
	        # after the join; start from nothing instead of failing on .strip().
	        parts = [summary.strip()] if isinstance(summary, str) else []
	        parts.extend(
	            sentence
	            for column, category, sentence in label_rules
	            if row[column] == category
	        )
	        augmented.append(" ".join(parts))

	    # Rows yielded by iterrows() may be copies, so writing to them is not
	    # guaranteed to reach the frame; assign the whole column instead.
	    license_data[SUMMARY] = augmented
	    return license_data


	def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
	    """
	    Read license texts, summaries and labels into a single DataFrame.

	    Every file in the gold licenses directory ending in ".txt" or ".summary"
	    is read; the file name without that suffix identifies the license. The
	    resulting texts and summaries are joined by index onto the labels read
	    from the labels CSV. Both the gold licenses directory and the labels
	    file are looked up relative to the current directory first and relative
	    to its parent ("../") second.

	    Parameters
	    ----------
	    labels_file : str, optional
	        The filename containing labels for choosealicense licenses.
	        The default is "choosealicense_appendix_labels.csv".
	    drop_summary : bool, optional
	        A flag to decide whether or not to drop the summary column before
	        returning the data. The default is False.

	    Returns
	    -------
	    merged_data : pandas.DataFrame or None
	        Labels joined with the "text" and "summary" columns, or None (after
	        printing a message) when the gold licenses or the labels file
	        cannot be found.

	    """
	    files = glob.glob(gold_licenses_data + "*")
	    if not files:
	        files = glob.glob(f"../{gold_licenses_data}" + "*")
	    if not files:
	        print("Gold licenses not found, please check the path again!")
	        return None

	    # Recognised file suffixes and the column each one feeds.
	    suffix_to_column = ((".summary", "summary"), (".txt", "text"))

	    data_dict = defaultdict(dict)
	    for file_path in files:
	        # Normalise separators so the base name is extracted correctly even
	        # when a path mixes "\\" and "/".
	        base_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
	        for suffix, column in suffix_to_column:
	            if base_name.endswith(suffix):
	                license_name = base_name[:-len(suffix)]
	                data_dict[license_name][column] = read_file(file_path)
	                break

	    # One row per license, one column per kind of file that was read.
	    summary_df = pd.DataFrame(data_dict).T

	    labels_df = None
	    for base_dir in (data_directory, f"../{data_directory}"):
	        try:
	            labels_df = pd.read_csv(base_dir + labels_file, index_col=index_col)
	        except OSError:
	            # Only a missing/unreadable file triggers the fallback; parsing
	            # errors propagate instead of being reported as "not found".
	            continue
	        break
	    if labels_df is None:
	        print("Labels file not found, please check the path again!")
	        return None

	    # NOTE: the "spdx_id" column is kept on purpose; read_license_text_data
	    # filters on it.
	    merged_data = labels_df.join(summary_df)

	    if drop_summary:
	        merged_data = merged_data.drop(columns=["summary"])

	    return merged_data


	def read_license_summary_data(aug_summary=False):
	    """
	    Load the license data and keep only the text and summary columns.

	    Parameters
	    ----------
	    aug_summary : bool, optional
	        When True, the summaries are passed through augment_summary before
	        the columns are selected. The default is False.

	    Returns
	    -------
	    pandas.DataFrame
	        The "text" and "summary" columns of the data returned by
	        read_license_data.

	    """
	    frame = read_license_data()
	    if aug_summary:
	        frame = augment_summary(frame)

	    return frame[["text", "summary"]]


	def read_license_labels_data():
	    """
	    Load the license data without the summary column.

	    Returns
	    -------
	    pandas.DataFrame
	        The data returned by read_license_data with its "summary" column
	        removed.

	    """
	    all_data = read_license_data()
	    return all_data.drop(columns=["summary"])


	def read_license_text_data(license_name):
	    """
	    Return the license text of the first row matching license_name.

	    Parameters
	    ----------
	    license_name : str
	        Value compared against the "spdx_id" column of the license data.

	    Returns
	    -------
	    str
	        Content of the "text" column for the first matching row.

	    Raises
	    ------
	    IndexError
	        If no row has the given value in its "spdx_id" column.

	    """
	    licenses = read_license_data(drop_summary=True)
	    is_match = licenses["spdx_id"] == license_name
	    matching_texts = licenses.loc[is_match, "text"].values
	    return matching_texts[0]


	def fix_labels(license_data):
	    """
	    Replace the textual labels in license_data with integer ids.

	    The label columns are handled in four groups, each with its own mapping
	    from cell value to integer. The given DataFrame is modified in place.

	    Parameters
	    ----------
	    license_data : pandas.DataFrame
	        Dataframe consisting of license data with labels; it must contain
	        every label column listed below.

	    Returns
	    -------
	    license_data : pandas.DataFrame
	        The same dataframe, with the label columns re-encoded.

	    """
	    missing = np.nan

	    # Each entry pairs a group of label columns with the value -> id mapping
	    # applied to that group.
	    encodings = (
	        # NOTE(review): unlike the other groups, this mapping has no entry
	        # for missing values and sends "permissions" to 0 - confirm that
	        # this is intended.
	        (
	            ["commercial-use", "distribution", "modifications", "private-use"],
	            {"permissions": 0},
	        ),
	        (
	            [
	                "disclose-source",
	                "document-changes",
	                "include-copyright",
	                "include-copyright--source",
	                "network-use-disclose",
	                "same-license",
	                "same-license--file",
	                "same-license--library",
	            ],
	            {missing: 0, "conditions": 1},
	        ),
	        (
	            ["liability", "trademark-use", "warranty"],
	            {missing: 0, "limitations": 1},
	        ),
	        # patent-use can carry either category, hence three ids.
	        (
	            ["patent-use"],
	            {missing: 0, "permissions": 1, "limitations": 2},
	        ),
	    )

	    for columns, mapping in encodings:
	        license_data[columns] = license_data[columns].replace(mapping)

	    return license_data