| import itertools
|
| import re
|
|
|
| from numpy import array
|
| from pandas import DataFrame
|
|
|
|
|
class SearchError(Exception):
    """Signal that a search returned misaligned results."""
|
|
|
|
|
|
|
def search_columns(df: DataFrame,
                   patterns: list,
                   columns: list,
                   return_as: str = "indicator_column",
                   return_column: str = "indicator",
                   re_flags=re.I | re.X) -> DataFrame:
    """Search string columns of a dataframe for regex patterns.

    A row counts as a match when ANY of the (pattern, column) pairs matches.
    Pairs are built in one of two ways:

    * ``len(patterns) == len(columns)``: patterns and columns are paired
      positionally (first pattern against first column, and so on).
    * ``len(patterns) == 1``: the single pattern is searched in every column.

    Matching is delegated to ``Series.str.contains`` with ``case=False``, so
    it is always case-insensitive regardless of ``re_flags``. Note that the
    default flags include ``re.X`` (verbose mode), under which unescaped
    whitespace inside a pattern is ignored by the regex engine.

    Missing values and non-string cells never match.

    Args:
        df (DataFrame): Input data. It is never modified; a copy is returned.
        patterns (list): Regex pattern strings to search for.
        columns (list): Names of the columns to search.
        return_as (str, optional): "indicator_column" returns a copy of `df`
            with a 0/1 column flagging matching rows; "filtered_df" returns a
            copy of only the matching rows. Defaults to "indicator_column".
        return_column (str, optional): Name of the 0/1 column written when
            `return_as` is "indicator_column". Defaults to "indicator".
        re_flags (optional): Regex flags forwarded to ``str.contains``.
            Defaults to re.I | re.X.

    Raises:
        TypeError: If `patterns` or `columns` is not a list.
        ValueError: If `patterns` or `columns` is empty, or their lengths
            neither match nor consist of a single pattern.
        ValueError: If `return_as` is not one of the supported values.

    Returns:
        DataFrame: Copy of `df` with the indicator column added, or a copy of
        the rows of `df` that matched.
    """
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    length_error = (
        "Length of inputs are incorrect. Lengths of 'patterns' and 'columns' "
        "can either match or a single pattern can map to multiple columns."
    )

    # With nothing to search there is no per-row mask to build; the reduction
    # below would collapse to a scalar and be misread as a row label by .loc.
    if not patterns or not columns:
        raise ValueError(length_error)

    if len(patterns) == len(columns):
        pairs = zip(patterns, columns)
    elif len(patterns) == 1:
        pairs = itertools.product(patterns, columns)
    else:
        raise ValueError(length_error)

    # Fail fast on a bad mode instead of after scanning every column.
    if return_as not in ("indicator_column", "filtered_df"):
        raise ValueError("Incorrect input for 'return_as' parameter.")

    # na=False turns missing / non-string cells into False, and to_numpy
    # gives a plain boolean array whatever dtype backs the column.
    masks = [
        df[column]
        .str.contains(pattern, regex=True, case=False, flags=re_flags, na=False)
        .to_numpy(dtype=bool)
        for pattern, column in pairs
    ]

    # Row-wise OR across all (pattern, column) masks.
    filter_bool = array(masks).any(axis=0)

    if return_as == "filtered_df":
        return df.loc[filter_bool, :].copy(deep=True)

    results = df.copy(deep=True)
    results.loc[:, return_column] = 0
    results.loc[filter_bool, return_column] = 1
    return results
|
|
|