|
import pandas as pd |
|
import numpy as np |
|
from fuson_plm.utils.logging import log_update |
|
|
|
def clean_rows_and_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops rows and columns that consist entirely of missing values.

    Rows are dropped first, then columns. Progress (row/column counts before
    and after, plus the surviving column names) is reported via log_update.

    Args:
        df (pd.DataFrame): input DataFrame to be cleaned

    Returns:
        pd.DataFrame: cleaned DataFrame without the all-NaN rows and columns
    """
    log_update(f"\trow cleaning...\n\t\toriginal # rows: {len(df)}")
    log_update("\t\tdropping rows where all entries are np.nan...")
    df = df.dropna(how='all')
    log_update(f"\t\tnew # rows: {len(df)}")

    log_update(f"\tcolumn cleaning...\n\t\toriginal # columns: {len(df.columns)}")
    log_update("\t\tdropping columns where all entries are np.nan...")
    df = df.dropna(axis=1,how='all')
    log_update(f"\t\tnew # columns: {len(df.columns)}")
    # Column labels are not guaranteed to be strings (e.g. integer labels),
    # so convert each one before joining to avoid a TypeError while logging.
    log_update(f"\t\tcolumn names: {','.join(map(str, df.columns))}")

    return df
|
|
|
def check_columns_for_listlike(df: pd.DataFrame, cols_of_interest: list, delimiters: set):
    """
    Checks which delimiters, if any, occur in the values of each column of interest.

    Args:
        df (pd.DataFrame): DataFrame to be investigated
        cols_of_interest (list): columns in df to be investigated for list-containing potential
        delimiters (set): set of potential delimiting strings to search for. A column with any of these strings is considered listlike.

    Returns:
        dict: maps each column name to the set of all delimiters found in that
            column, or to False when no delimiter was found
            e.g., { 'col1': {',',';'},
                    'col2': {'|'},
                    'col3': False }
    """
    return_dict = {}

    log_update("\tchecking if any of our columns of interest look listlike (contain list objects or delimiters)...")
    for col in cols_of_interest:
        # Missing values carry no delimiters, and each distinct value only
        # needs to be inspected once.
        unique_vals = df[col].dropna().unique()

        unique_found_delims = set()
        for val in unique_vals:
            result = check_item_for_listlike(val, delimiters)
            # Only a set result describes delimiters found in a string; any
            # other result (empty placeholder, or the type of a non-string
            # value) contributes no delimiters.
            if isinstance(result, set):
                unique_found_delims |= result

        return_dict[col] = unique_found_delims if unique_found_delims else False

        log_update(f"\t\tcolumn name: {col}\tlistlike: {return_dict[col]}")

    return return_dict
|
|
|
def check_item_for_listlike(x, delimiters: set):
    """
    Checks if an item looks like it contains a list of items, rather than an individual item, based on string delimiters.

    Args:
        x: the item to check. Any dtype.
        delimiters: a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}

    Returns:
        If x is a string: the set (may be empty) of delimiters contained in the string
        If x is None or a float NaN: an empty set
        Otherwise: the type of x
    """
    if isinstance(x, str):
        # Coerce to a set so the result is always a real set, even when empty.
        return set(find_delimiters(x, delimiters))

    # Missing values carry no delimiters.
    if x is None:
        return set()
    # isinstance (rather than an exact type comparison) also covers float
    # subclasses such as np.float64.
    if isinstance(x, float) and np.isnan(x):
        return set()

    return type(x)
|
|
|
def find_delimiters(seq: str, delimiters: set) -> set:
    """
    Find and return the set of delimiters present in a sequence. Helper method for check_item_for_listlike.

    The sequence is examined character by character, so only single-character
    delimiters can be detected.

    Args:
        seq (str): The sequence you wish to search for delimiters.
        delimiters (set): a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}

    Returns:
        set: The delimiters that occur in the sequence (an empty set if none do).
    """
    # set.intersection accepts any iterable, so the characters of seq are
    # compared directly; building the set from `delimiters` also lets callers
    # pass a list or tuple.
    return set(delimiters).intersection(seq)
|
|
|
def find_invalid_chars(seq: str, valid_chars: set) -> set:
    """
    Collect the characters of a sequence that are not allowed.

    Args:
        seq (str): The sequence to inspect.
        valid_chars (set): The characters that are considered valid.

    Returns:
        The set of characters in seq that are missing from valid_chars.
        When every character is valid, an empty string '' is returned
        instead of an empty set.
    """
    invalid = set(seq).difference(valid_chars)
    if invalid:
        return invalid
    return ''