File size: 3,662 Bytes
a3171a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import streamlit as st
from utilities.template_helpers import upload_data
from types import NoneType
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def _fill_nulls(df):
    """Return a copy of ``df`` with null values filled column by column.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value. A column that offers no usable fill
    value (for example one that is entirely null) is left unchanged.
    """
    filled = df.copy()
    for col in filled.columns:
        series = filled[col]
        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.mean()
            if pd.isna(fill_value):
                # Mean of an all-null column is NaN; nothing to fill with.
                continue
        else:
            modes = series.mode()
            if modes.empty:
                continue
            # mode() returns a Series (ties are possible); use the first
            # entry as a scalar so fillna does not align on the index.
            fill_value = modes.iloc[0]
        # fillna returns a new Series, so the result must be assigned back.
        filled[col] = series.fillna(fill_value)
    return filled


def _scale_numeric(df):
    """Return a copy of ``df`` with each numeric column standardized."""
    scaled = df.copy()
    if df.empty:
        # StandardScaler cannot be fitted on zero rows (e.g. after every
        # row was dropped for containing a null).
        return scaled
    scaler = StandardScaler()
    for col in df.select_dtypes('number').columns:
        # fit_transform refits the scaler, so every column is standardized
        # independently; ravel() flattens the (n, 1) result to one column.
        scaled[col] = scaler.fit_transform(df[[col]]).ravel()
    return scaled


def _encode_categoricals(df):
    """Return a copy of ``df`` with object columns replaced by integer codes.

    Within each object column the distinct values are numbered 1..k in order
    of first appearance.
    """
    encoded = df.copy()
    object_mask = encoded.dtypes == 'object'
    if not object_mask.any():
        return encoded
    encoded.loc[:, object_mask] = encoded.loc[:, object_mask].apply(
        lambda x: x.replace(x.unique(), list(range(1, 1 + len(x.unique())))))
    return encoded


def render():
    """Render the preprocessing page.

    The page is laid out in three columns:

    1. upload a dataset and preview it together with per-column dtypes and
       null counts;
    2. optionally drop rows with nulls, fill nulls, and standardize the
       numeric columns;
    3. choose the columns to keep, optionally integer-encode object
       columns, and download the result as CSV.

    Returns early, rendering nothing further, when ``upload_data()`` yields
    ``None`` or a frame with no rows and no columns.
    """
    st.title("PREPROCESSING")

    col1, col2, col3 = st.columns([1,1,1])

    with col1.container():
        df = upload_data()
        if df is None:
            return
        if df.shape == (0,0):
            return
        info = pd.DataFrame({'dtypes': df.dtypes, 'null': df.isna().sum()})

        tab1, tab2 = st.tabs(['Dataframe','Info'])
        with tab1:
            st.dataframe(df, use_container_width=True, height=300)
        with tab2:
            st.dataframe(info,use_container_width=True,height=300)

    with col2.container():
        ### DROP NA ###
        st.write('\n\n')
        st.markdown('#### Drop Null Values')
        st.write('Drop any row containing null values')
        drop_null = st.checkbox('Drop')
        if drop_null:
            # Rebind instead of dropping in place so the object handed back
            # by upload_data() is never mutated.
            df = df.dropna()

        ### FILL NA ####
        st.write("\n\n")
        st.markdown('#### Fill Null Values')
        st.write("""Replace null values with mean of the column for numerical variables,
                     and mode for categorical variables""")
        fill_null = st.checkbox('Fill')
        if fill_null:
            df = _fill_nulls(df)

        ### SCALING ###
        st.write('\n\n')
        st.markdown("#### Scaling")
        st.write("Standardize numerical features by removing the mean and scaling to unit variance.")
        scale = st.checkbox('Scale')
        if scale:
            df = _scale_numeric(df)


    with col3.container():
        ### SELECT COLUMNS
        st.write("\n\n")
        st.markdown("#### Choose columns")
        cols = st.multiselect('Select columns to use',options=list(df.columns),default=list(df.columns))
        df = df[cols]

        st.write("\n\n")
        # NOTE: despite the heading text, this step turns object (categorical)
        # columns into integer codes.
        st.markdown("#### Encode Numerical values")
        enc = st.checkbox('Encode')
        if enc:
            df = _encode_categoricals(df)

        st.write('\n\n')
        st.markdown("#### Download Preprocessed data")
        st.download_button("Download Results",
                            df.to_csv(index=False),
                            "preprocessed.csv",
                            "text/csv",
                            key="download-csv")




#def res_session():
#    st.session_state['drop_na'] = False
#    st.session_state['fill_na'] = False
#    st.session_state['scale'] = False
#    st.session_state['']