|
import itertools |
|
import re |
|
|
|
from numpy import array |
|
from pandas import DataFrame |
|
|
|
|
|
class SearchError(Exception):
    """Signal that a search returned misaligned results.

    Adds no behaviour of its own; it is a plain ``Exception`` subclass that
    exists so this failure can be caught by name.
    """
|
|
|
|
|
|
|
def search_columns(
    df: DataFrame,
    patterns: list,
    columns: list,
    return_as: str = "indicator_column",
    return_column: str = "indicator",
    re_flags: int = re.I | re.X,
) -> DataFrame:
    """Search dataframe columns for regex patterns.

    A row counts as a match when at least one (pattern, column) pair matches.
    Matching is always case-insensitive (``case=False`` is passed to
    ``Series.str.contains``) in addition to whatever ``re_flags`` requests.
    Missing values never match.

    Args:
        df (DataFrame): Input data. It is never modified; a copy is returned.
        patterns (list): Regex pattern strings. Either one pattern per entry
            in ``columns`` (paired positionally) or a single pattern that is
            applied to every column.
        columns (list): Names of the columns to search.
        return_as (str, optional): ``"indicator_column"`` returns a copy of
            ``df`` with a 0/1 match column; ``"filtered_df"`` returns a copy
            of only the matching rows. Defaults to ``"indicator_column"``.
        return_column (str, optional): Name of the 0/1 column written when
            ``return_as="indicator_column"``. Defaults to ``"indicator"``.
        re_flags (optional): Regex flags forwarded to ``Series.str.contains``.
            Defaults to ``re.I | re.X``. Note that ``re.X`` makes the regex
            engine ignore unescaped whitespace inside patterns.

    Raises:
        TypeError: If ``patterns`` or ``columns`` is not a list.
        ValueError: If the lists are empty, or their lengths neither match
            nor consist of a single pattern with one or more columns.
        ValueError: If ``return_as`` is not a recognised option.

    Returns:
        DataFrame: Copy of ``df`` with the indicator column added, or a copy
        of the rows that matched.
    """
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    # Empty inputs are rejected here as well: with nothing to search, the
    # combined mask would collapse to a scalar and be misused as a row label.
    if patterns and len(patterns) == len(columns):
        pairs = list(zip(patterns, columns))
    elif len(patterns) == 1 and columns:
        pairs = list(itertools.product(patterns, columns))
    else:
        raise ValueError("Length of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match or a single pattern can map to multiple columns.")

    # Fail fast on a bad mode instead of after doing all the searching.
    if return_as not in ("indicator_column", "filtered_df"):
        raise ValueError("Incorrect input for 'return_as' parameter.")

    # na=False turns missing / non-string cells into "no match", so every
    # per-column result converts cleanly to a plain boolean NumPy array.
    masks = [
        df[column]
        .str.contains(pattern, regex=True, case=False, flags=re_flags, na=False)
        .to_numpy(dtype=bool)
        for pattern, column in pairs
    ]

    # Stack to shape (n_pairs, n_rows) and OR across pairs to get one flag per row.
    filter_bool = array(masks).any(axis=0)

    if return_as == "filtered_df":
        return df.loc[filter_bool, :].copy(deep=True)

    result = df.copy(deep=True)
    result.loc[:, return_column] = 0
    result.loc[filter_bool, return_column] = 1
    return result
|
|