File size: 3,790 Bytes
bb93e21
 
 
 
 
 
 
 
 
 
 
 
 
fe4f734
 
 
 
 
 
 
 
bb93e21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import itertools
import re

from numpy import array
from pandas import DataFrame


class SearchError(Exception):
    """Search returned misaligned results."""
    pass


# Defining a function to search for string patterns within dataframe columns
def search_columns(
        df: DataFrame, 
        patterns: list, 
        columns: list, 
        return_as: str = "indicator_column", 
        return_column: str = "indicator", 
        re_flags = re.I | re.X
    ):
    """Search columns for string patterns within dataframe columns.

    Args:
        df (DataFrame): Input data in format of pandas dataframe.
        patterns (list): List of string patterns to input, compatible with regex.
        columns (list): List of column names to search for input patterns.
        return_as (str, optional): Return a DataFrame with indicator column ("indicator_column") or filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
        re_flags (optional): Regex flags to use. Defaults to re.I | re.X.

    Raises:
        TypeError: Raises exception when `patterns` or `columns` parameters are not lists.
        ValueError: Raises exception when `patterns` or `columns` parameters have incorrect length.
        ValueError: Raises exception when `return_as` parameter receives an incorrect value.

    Returns:
        DataFrame: DataFrame with "indicator" column or filtered by search terms.
    """
    # create list object for appending boolean arrays
    bool_list = []
    
    # ensure that input patterns and columns are formatted as lists
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')
        
    if len(patterns) == len(columns):
        # create list of inputs in format [(pattern1, column1),(pattern2, column2), ...]
        inputs = list(zip(patterns,columns))
        
        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)
        
    elif (len(patterns) == 1) and (len(patterns) != len(columns)):
        # create list of inputs in format [(pattern, column1),(pattern, column2), ...]
        inputs = list(itertools.product(patterns, columns))
        
        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)
           
    else:  # eg, patterns formatted as a list of len(n>1) but does not match len(columns)
        raise ValueError("Length of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match or a single pattern can map to multiple columns.")

    # combine each "searchbool" array elementwise
    # we want a positive match for any column to evaluate as True
    # equivalent to (bool_list[0] | bool_list[1] | bool_list[2] | ... | bool_list[n-1])
    filter_bool = array(bool_list).any(axis=0)

    if return_as == "indicator_column":
        dfResults = df.copy(deep=True)
        dfResults.loc[:, return_column] = 0
        dfResults.loc[filter_bool, return_column] = 1
        #print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
        return dfResults
    
    elif return_as == "filtered_df":
        # filter results
        dfResults = df.loc[filter_bool, :].copy(deep=True)
        #print(f"Count {return_column}: {len(dfResults)}")
        return dfResults
    
    else:
        raise ValueError("Incorrect input for 'return_as' parameter.")