FusOn-pLM / fuson_plm /utils /data_cleaning.py

adding utility files used throughout FusOn-pLM training and benchmarking

ffaff91 26 days ago

4.99 kB

	import pandas as pd
	import numpy as np
	from fuson_plm.utils.logging import log_update

	def clean_rows_and_cols(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Drop rows and columns that contain no data, logging the counts before and after.

	    Args:
	        df (pd.DataFrame): input DataFrame to be cleaned

	    Returns:
	        pd.DataFrame: a DataFrame without the rows, and then without the columns,
	            whose entries are all missing. The input object is not modified,
	            because dropna is called without inplace=True.
	    """
	    # Delete rows with no data
	    log_update(f"\trow cleaning...\n\t\toriginal # rows: {len(df)}")
	    log_update("\t\tdropping rows where all entries are np.nan...")
	    df = df.dropna(how='all')
	    log_update(f"\t\tnew # rows: {len(df)}")

	    # Delete columns with no data
	    log_update(f"\tcolumn cleaning...\n\t\toriginal # columns: {len(df.columns)}")
	    log_update("\t\tdropping columns where all entries are np.nan...")
	    df = df.dropna(axis=1, how='all')
	    log_update(f"\t\tnew # columns: {len(df.columns)}")
	    # Column labels are not guaranteed to be strings (e.g. integer labels),
	    # and str.join raises TypeError on non-string items, so convert first.
	    log_update(f"\t\tcolumn names: {','.join(map(str, df.columns))}")

	    return df

	def check_columns_for_listlike(df: pd.DataFrame, cols_of_interest: list, delimiters: set):
	    """
	    Check whether each column of interest contains any listlike items.

	    Every value in a column is passed to check_item_for_listlike. A column is
	    considered listlike as soon as one value yields a truthy result, i.e. a
	    non-empty collection of delimiters or a type object.

	    Args:
	        df (pd.DataFrame): DataFrame to be investigated
	        cols_of_interest (list): columns in df to be investigated for list-containing potential
	        delimiters (set): set of potential delimiting strings to search for. A column with any of these strings is considered listlike.

	    Returns:
	        dict: maps each column name to the set of everything found in that column,
	            or to False when nothing was found.
	            e.g., { 'col1': {',', ';'},
	                    'col2': {'|'},
	                    'col3': False }
	            When check_item_for_listlike reports a type instead of delimiters,
	            that type object is placed in the set as-is.
	    """
	    # return the delimiters/listlike things found for each column
	    return_dict = {}

	    log_update("\tchecking if any of our columns of interest look listlike (contain list objects or delimiters)...")
	    for col in cols_of_interest:
	        found = set()

	        # Walk the values directly instead of going through value_counts():
	        # the per-item results are sets/dicts (unhashable), and the column that
	        # value_counts().reset_index() produces is not named 'index' on pandas >= 2.0.
	        for item in df[col]:
	            result = check_item_for_listlike(item, delimiters)
	            if isinstance(result, type):
	                # A type object cannot be unioned into a set; record it as one element.
	                found.add(result)
	            elif result:
	                found.update(result)

	        return_dict[col] = found if found else False

	        # display the return dict
	        log_update(f"\t\tcolumn name: {col}\tlistlike: {return_dict[col]}")

	    return return_dict

	def check_item_for_listlike(x, delimiters: set):
	    """
	    Check whether a single item looks like a list of items rather than an individual item.

	    Args:
	        x: the item to check. Any dtype.
	        delimiters (set): a set of delimiters to check for. e.g., {',', ';', '|', ':', '-', '/'}

	    Returns:
	        If x is a string: the result of find_delimiters(x, delimiters), i.e. the delimiters contained in the string
	        If x is None or a float NaN: an empty set, because a missing value has no delimiters
	        Otherwise: type(x). A type object is truthy, so a caller that tests the
	            result for truthiness treats such an item as listlike.
	    """
	    if isinstance(x, str):
	        return find_delimiters(x, delimiters)

	    if x is None:
	        # if it's None, it's not listlike, it's just empty.
	        return set()

	    # isinstance rather than type(x)==float, so that numpy float scalars
	    # (np.float64, np.float32, ...) holding NaN are also recognised as empty.
	    if isinstance(x, (float, np.floating)) and np.isnan(x):
	        return set()

	    return type(x)

	def find_delimiters(seq: str, delimiters: set) -> set:
	    """
	    Find and return the set of delimiters present in a sequence. Helper method for check_item_for_listlike.

	    The comparison is made character by character, so only single-character
	    delimiters can ever be reported.

	    Args:
	        seq (str): The sequence you wish to search for delimiters.
	        delimiters (set): the delimiters to check for, e.g., {',', ';', '|', ':', '-', '/'}.
	            Any iterable of strings is accepted.

	    Returns:
	        set: The delimiters that occur in the sequence. Always a set; empty when none occur.
	    """
	    # set(seq) holds the unique characters; e.g. {A, C} for protein="AAACCC".
	    # set.intersection accepts any iterable and always returns a set.
	    return set(seq).intersection(delimiters)

	def find_invalid_chars(seq: str, valid_chars: set) -> set:
	    """
	    Report which characters of a sequence fall outside an allowed alphabet.

	    Args:
	        seq (str): The sequence to inspect.
	        valid_chars (set): The characters that are allowed.

	    Returns:
	        The set of characters that occur in seq but not in valid_chars.
	        When every character is valid, the empty string '' is returned
	        instead of an empty set.
	    """
	    # e.g. seq="AAX" with valid_chars={A, C}: {A, X} - {A, C} = {X}
	    leftover = set(seq).difference(valid_chars)

	    if leftover:
	        return leftover
	    # nothing left over means every character of seq is in valid_chars
	    return ''