File size: 5,736 Bytes
94bbd2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers,line_plot,summary
import numpy as np
import re

def sanitize_key(key, prefix=""):
    """Return ``prefix`` followed by ``key`` stripped to ASCII letters and digits.

    Args:
        key: Text to clean; every character outside ``a-z``, ``A-Z`` and
            ``0-9`` (including spaces and underscores) is removed.
        prefix: Text placed in front of the cleaned key.

    Returns:
        The prefixed, cleaned key as a string.
    """
    # Splitting on each disallowed character and re-joining the pieces keeps
    # only the allowed characters, in their original order.
    kept_pieces = re.split(r'[^a-zA-Z0-9]', key)
    cleaned = ''.join(kept_pieces)
    return "{}{}".format(prefix, cleaned)


def check_box(options, ad_stock_value,lag_value,num_columns=4, prefix=""):
    """Render a grid of checkboxes with per-option adstock/lag sliders.

    Options are laid out row by row in ``st.columns(num_columns)``. For every
    ticked option a lag slider is shown, preceded by an adstock slider when
    ``ad_stock_value`` is not ``0``. Unticked options keep the default values
    that were passed in.

    Unlike the previous implementation, ``options`` is not modified: the
    function works on a copy, so the caller's list is still intact after the
    call (and on the next Streamlit rerun).

    Args:
        options: Labels to show, one checkbox each.
        ad_stock_value: Default for the adstock slider. ``0`` disables the
            adstock slider and omits the ``'adstock'`` entry from the result.
        lag_value: Default for the lag slider.
            NOTE(review): ``apply_lag`` in this file indexes the stored lag
            with ``[0]`` and ``[1]``, so callers presumably pass a
            ``(min, max)`` pair here - confirm against the call sites.
        num_columns: Number of checkboxes per row.
        prefix: Prefix for the generated widget keys.

    Returns:
        A tuple ``(selected_options, adstock_info)``: the ticked labels in
        display order, and a dict mapping every label to ``{'lag': ...}`` or
        ``{'adstock': ..., 'lag': ...}``.
    """
    pending = list(options)  # copy: never consume the caller's list
    num_rows = -(-len(pending) // num_columns)  # ceiling division
    with_adstock = ad_stock_value != 0

    selected_options = []
    adstock_info = {}  # adstock/lag settings for every option, ticked or not
    for row in range(num_rows):
        cols = st.columns(num_columns)
        row_options = pending[row * num_columns:(row + 1) * num_columns]
        # zip stops at the shorter side, so a partially filled last row
        # simply leaves its trailing columns empty.
        for col, option in zip(cols, row_options):
            key = sanitize_key(f"{option}_{row}", prefix=prefix)
            if col.checkbox(option, key=key):
                selected_options.append(option)
                option_info = {}
                if with_adstock:
                    option_info['adstock'] = col.slider(
                        'Select Adstock Range', 0.0, 1.0, ad_stock_value,
                        step=0.05, format="%.2f", key=f"adstock_{key}")
                option_info['lag'] = col.slider(
                    'Select Lag Range', 0, 7, lag_value, step=1,
                    key=f"lag_{key}")
            elif with_adstock:
                option_info = {'adstock': ad_stock_value, 'lag': lag_value}
            else:
                option_info = {'lag': lag_value}
            adstock_info[option] = option_info

    return selected_options, adstock_info

def apply_lag(X, features,lag_dict):
    """Add lagged copies of the given columns to ``X`` in place.

    For each name in ``features`` the bounds ``lag_dict[name]['lag'][0]`` and
    ``lag_dict[name]['lag'][1]`` are read, and for every positive lag in that
    inclusive range a column ``'<name>_lag<lag>'`` is written holding the
    original column shifted down by ``lag`` rows, with the vacated rows set
    to 0.

    Args:
        X: DataFrame holding the feature columns; it is modified in place.
        features: Column names to lag.
        lag_dict: Mapping ``name -> {'lag': (first, last)}``.

    Returns:
        The same ``X`` object, now including the lag columns.
    """
    for feature in features:
        bounds = lag_dict[feature]['lag']
        first_lag, last_lag = bounds[0], bounds[1]
        for periods in range(first_lag, last_lag + 1):
            if periods <= 0:
                # A non-positive lag would not add a new lagged column.
                continue
            lagged_name = f'{feature}_lag{periods}'
            X[lagged_name] = X[feature].shift(periods=periods, fill_value=0)
    return X

def apply_adstock(X, variable_name, decay):
    """Compute the carry-over (adstock) series of one column.

    The first output value equals the first input value; every later value is
    the current input plus ``decay`` times the previous output.

    Args:
        X: Table supporting ``X[variable_name].values``.
        variable_name: Name of the column to transform.
        decay: Multiplier applied to the previous output value.

    Returns:
        A new NumPy float array with one value per input row; ``X`` itself is
        not modified.
    """
    series = X[variable_name].values
    n_obs = len(series)
    carried = np.zeros(n_obs)

    if n_obs:
        # Nothing precedes the first observation, so it passes through as is.
        carried[0] = series[0]
    for idx in range(1, n_obs):
        carried[idx] = series[idx] + carried[idx - 1] * decay

    return carried

def top_correlated_features(df,target,media_data):
    """Tabulate, per media name, how its matching columns correlate with ``target``.

    For each entry of ``media_data`` the columns of ``df`` (target excluded)
    whose name contains that entry are correlated with ``df[target]`` and
    listed in descending order of correlation. The per-media tables are placed
    side by side under a two-level column header
    ``(media, 'Feature_name')`` / ``(media, 'Correlation')``.

    Previously each loop iteration overwrote the result, so only the table of
    the last media entry was returned and an empty ``media_data`` raised
    ``UnboundLocalError``. All tables are now returned, and an empty
    ``media_data`` yields an empty DataFrame.

    NOTE: this file defines another function with the same name further down
    (parameters ``df, variables, target``); that later definition rebinds the
    name when the module is imported.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target: Name of the target column.
        media_data: Iterable of substrings used to select feature columns.

    Returns:
        A DataFrame with one ``Feature_name``/``Correlation`` column pair per
        media entry, in ``media_data`` order. Shorter tables are padded with
        NaN by the column-wise concatenation.
    """
    features = df.drop(target, axis=1)
    tables = []
    for media in media_data:
        matched = pd.concat([features.filter(like=media), df[target]], axis=1)
        to_target = matched.corr()[target].sort_values(ascending=False)
        # Remove the target's correlation with itself by label.
        to_target = to_target.drop(target, axis=0)
        table = pd.DataFrame({'Feature_name': to_target.index,
                              'Correlation': to_target.values})
        table.columns = pd.MultiIndex.from_product(
            [[media], ['Feature_name', 'Correlation']])
        tables.append(table)

    if not tables:
        return pd.DataFrame()
    return pd.concat(tables, axis=1)

def top_correlated_features(df,variables,target):
    """Build side-by-side correlation tables, one per entry of ``variables``.

    For each entry, the columns of ``df`` whose name contains it are
    correlated with ``df[target]`` and listed in descending order of
    correlation under the two-level header ``(entry, 'Variable')`` /
    ``(entry, 'Correlation')``.

    The target's self-correlation is removed by label. The earlier
    ``.iloc[1:]`` assumed the target always sorts first, which is not
    guaranteed when a feature is perfectly correlated with it (a tie at 1.0),
    and could drop a real feature while keeping the target row. The target
    column is also excluded from the name match so it cannot be picked up
    twice.

    Args:
        df: DataFrame containing the feature columns and the target column.
        variables: Iterable of substrings used to select feature columns.
        target: Name of the target column.

    Returns:
        A DataFrame whose column groups appear in reverse ``variables`` order
        (the most recently processed entry first, as before). Shorter tables
        are padded with NaN. Empty when ``variables`` is empty.
    """
    # errors='ignore' keeps a missing target surfacing as the usual KeyError
    # from df[target] below rather than from this preparatory step.
    features = df.drop(columns=[target], errors='ignore')
    tables = []
    for col in variables:
        matched = pd.concat([features.filter(like=col), df[target]], axis=1)
        d = matched.corr()[target].drop(target)
        d = d.sort_values(ascending=False)
        corr_df = pd.DataFrame({'Media_channel': d.index, 'Correlation': d.values})
        corr_df.columns = pd.MultiIndex.from_tuples(
            [(col, 'Variable'), (col, 'Correlation')])
        tables.append(corr_df)

    if not tables:
        return pd.DataFrame()
    # One concat instead of one per iteration; reversing reproduces the
    # original "newest table on the left" column order.
    return pd.concat(tables[::-1], axis=1)

def top_correlated_feature(df,variable,target):
    """Rank the columns matching ``variable`` by absolute correlation with ``target``.

    Columns of ``df`` whose name contains ``variable`` are correlated with
    ``df[target]``. The adstock and lag settings are parsed out of each column
    name: the text after ``'_adst'`` and the number after ``'_lag'`` (the
    latter matches the ``'<col>_lag<n>'`` names produced by ``apply_lag``).

    Fixes over the previous version:
      * The target's self-correlation is dropped by label. ``.iloc[1:]``
        assumed it always sorts first, which fails when a feature ties with
        it at 1.0.
      * The lag is read as a whole number, so ``'_lag12'`` yields ``'12'``
        rather than only its first character ``'1'``.

    Args:
        df: DataFrame containing the feature columns and the target column.
        variable: Substring used to select feature columns.
        target: Name of the target column.

    Returns:
        A DataFrame with columns ``Media_channel``, ``Adstock``, ``Lag`` and
        ``Correlation`` (rounded to 2 decimals), ordered by descending
        absolute correlation. ``'-'`` marks a setting absent from the name.
        The index holds each row's rank in the plain descending-correlation
        order, as before.
    """
    def _adstock_of(name):
        # Text between the first '_adst' and the next one (or end of name).
        parts = name.split('_adst')
        return parts[1] if len(parts) > 1 else '-'

    def _lag_of(name):
        # Leading digits after the first '_lag'; falls back to the first
        # character (previous behaviour) when no digits follow.
        _, marker, tail = name.partition('_lag')
        if not marker:
            return '-'
        digits = re.match(r'\d+', tail)
        if digits:
            return digits.group(0)
        return tail[:1] or '-'

    # errors='ignore' keeps a missing target surfacing as the usual KeyError
    # from df[target]; excluding it here stops the name match picking it up.
    features = df.drop(columns=[target], errors='ignore')
    matched = pd.concat([features.filter(like=variable), df[target]], axis=1)
    d = matched.corr()[target].drop(target)
    d = d.sort_values(ascending=False)

    names = pd.Series(d.index)
    corr_df = pd.DataFrame({
        'Media_channel': names,
        'Adstock': names.map(_adstock_of),
        'Lag': names.map(_lag_of),
        'Correlation': np.round(d.values, 2),
    })
    by_strength = corr_df['Correlation'].abs().sort_values(ascending=False).index
    sorted_corr_df = corr_df.loc[by_strength]
    return sorted_corr_df