File size: 4,985 Bytes
ffaff91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import pandas as pd
import numpy as np
from fuson_plm.utils.logging import log_update
def clean_rows_and_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Deletes empty rows and columns, logging the row/column counts before and after.

    Args:
        df (pd.DataFrame): input DataFrame to be cleaned

    Returns:
        pd.DataFrame: DataFrame without the rows and columns whose entries are all missing
    """
    # Delete rows with no data
    log_update(f"\trow cleaning...\n\t\toriginal # rows: {len(df)}")
    log_update("\t\tdropping rows where all entries are np.nan...")
    df = df.dropna(how='all')
    log_update(f"\t\tnew # rows: {len(df)}")

    # Delete columns with no data
    log_update(f"\tcolumn cleaning...\n\t\toriginal # columns: {len(df.columns)}")
    log_update("\t\tdropping columns where all entries are np.nan...")
    df = df.dropna(axis=1, how='all')
    log_update(f"\t\tnew # columns: {len(df.columns)}")
    # Column labels are not guaranteed to be strings (e.g. integer labels),
    # and str.join raises TypeError on non-strings, so convert each label first.
    log_update(f"\t\tcolumn names: {','.join(map(str, df.columns))}")
    return df
def check_columns_for_listlike(df: pd.DataFrame, cols_of_interest: list, delimiters: set):
    """
    Checks if a column contains any listlike items

    Args:
        df (pd.DataFrame): DataFrame to be investigated
        cols_of_interest (list): columns in df to be investigated for list-containing potential
        delimiters (set): set of potential delimiting strings to search for. A column with any of these strings is considered listlike.

    Returns:
        dict: maps each column of interest to False when nothing listlike was found,
            otherwise to a set of everything listlike that was found: the delimiters
            present in the column's strings, plus the type of each non-string,
            non-missing value.
            e.g., { 'col1': {',',';'},
                    'col2': {'|'},
                    'col3': False }
    """
    # return the delimiters/listlike things found for each column
    return_dict = {}
    log_update("\tchecking if any of our columns of interest look listlike (contain list objects or delimiters)...")

    for col in cols_of_interest:
        found = set()
        # Walk the values directly instead of going through value_counts():
        # the per-item results are sets/types, and the values themselves may be
        # lists, none of which can be counted because they are not all hashable.
        for item in df[col].dropna():
            result = check_item_for_listlike(item, delimiters)
            if isinstance(result, type):
                # non-string item: the helper reports its type; record the type itself
                found.add(result)
            else:
                # string (or missing) item: the helper reports a possibly-empty
                # collection of delimiters
                found.update(result)

        return_dict[col] = found if found else False

        # display the result for this column
        log_update(f"\t\tcolumn name: {col}\tlistlike: {return_dict[col]}")

    return return_dict
def check_item_for_listlike(x, delimiters: set):
    """
    Checks if an item looks like it contains a list of items, rather than an individual item, based on string delimiters.

    Args:
        x: the item to check. Any dtype.
        delimiters: a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}

    Returns:
        If x is a string: the set (may be empty) of delimiters contained in the string
        If x is None or a float NaN: an empty set, because a missing value is empty rather than listlike
        Otherwise: the type of x
    """
    if isinstance(x, str):
        # set(...) guarantees a real (possibly empty) set even if the helper
        # signals "nothing found" with some other empty container.
        return set(find_delimiters(x, delimiters))

    # if it's None, it's not listlike, it's just empty: no delimiters.
    if x is None:
        return set()

    # if it's nan, it's not listlike, it's just empty: no delimiters.
    # isinstance (rather than an exact type comparison) also covers NumPy float scalars.
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return set()

    return type(x)
def find_delimiters(seq: str, delimiters: set) -> set:
    """
    Find and return the set of delimiters present in a sequence. Helper method for check_item_for_listlike.

    Args:
        seq (str): The sequence you wish to search for delimiters.
        delimiters (set): a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}
            Any iterable of strings is accepted. Only single-character delimiters
            can match, because the sequence is compared character by character.

    Returns:
        set: The delimiters that occur in the sequence; an empty set if none do.
    """
    # intersection() iterates seq character by character, so this yields exactly
    # the delimiters that appear in seq, and a genuine empty set when none do.
    return set(delimiters).intersection(seq)
def find_invalid_chars(seq: str, valid_chars: set) -> set:
    """
    Find and return a set of invalid characters in a sequence.

    Args:
        seq (str): The sequence you wish to search for invalid characters.
        valid_chars (set): A set of valid characters.

    Returns:
        set: A set of characters in the sequence that are not in the set of valid characters.
            Empty when every character of the sequence is valid.
    """
    # e.g. seq="AAACCC" -> unique characters {A, C}; with valid_chars={A, C} nothing is left over.
    # e.g. seq="AAX"    -> unique characters {A, X}; {A, X} - valid_chars = {X}.
    # The result is always a set, so callers get one consistent type for both cases.
    return set(seq).difference(valid_chars)