File size: 5,521 Bytes
d391513
 
 
 
 
4ee3c5a
d391513
 
 
 
 
cbb2e1a
 
 
 
 
d3a3a91
 
d391513
 
 
 
 
 
6912c4c
d391513
 
37ccd3e
d391513
 
 
 
 
 
 
 
 
 
37ccd3e
d391513
 
 
 
 
ef746ab
 
d391513
16621db
d391513
ef746ab
 
 
 
d3a3a91
ef746ab
d391513
 
16621db
d391513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16621db
d391513
 
 
 
 
275f86e
37ccd3e
c66e5fb
d391513
16621db
d391513
d3a3a91
c66e5fb
ef746ab
 
 
 
 
 
 
 
 
c66e5fb
ef746ab
 
c66e5fb
 
ef746ab
 
 
 
5f55710
d391513
4ee3c5a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
import pathlib
import json
import pandas as pd

# Page-wide configuration and introductory copy.
st.set_page_config(layout="wide")
st.header("Time Series Preprocessing Pipeline")
st.markdown("Users can load their time-series data and select a set of transformations to prepare a training set for univariate or multivariate time-series classification. Go ahead and use the sidebar on the left to upload your data files in *.json* format and start exploring and transforming it!")
# Two-column layout: col1 hosts the pipeline controls, col2 is spare space.
col1, col2 = st.columns(2)


@st.experimental_memo
def convert_df(df):
    """Serialize *df* to UTF-8 encoded CSV bytes (index column omitted).

    Memoized by Streamlit so repeated downloads of the same frame are free.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

    
# Load and prepare data: collect each uploaded file's name and raw bytes.
file_names = []
file_bytes = []
with st.sidebar:
    files = st.file_uploader("Load files", accept_multiple_files=True)
    if files:
        for file in files:
            file_names.append(file.name)
            file_bytes.append(file.getvalue())
        st.success("Your data has been successfully loaded! 🤗")

# Flatten every trial record from all uploaded JSON files into one
# column-oriented dict. The schema is fixed: a record containing any key
# other than these four raises KeyError (fail-fast on unexpected input).
data_dict = {'trial_id': [], 'pupil_dilation': [], 'baseline': [], 'rating': []}
with st.spinner("Building base dictionary..."):
    for file_data in file_bytes:
        data = json.loads(file_data)
        # BUG FIX: the original reused `k` both as the outer file-level key
        # and as the record field name, shadowing the outer loop variable.
        # Distinct names (and consistent 4-space indentation) avoid that.
        for top_key in data:
            for record in data[top_key]:
                for field, value in record.items():
                    data_dict[field].append(value)

df_base = pd.DataFrame()
with col1:
    if file_bytes:
        with st.spinner("Building base dataframe..."):
            df_base = pd.DataFrame.from_dict(data_dict)
            # Keep only the digits of the trial identifier (e.g. "trial_07" -> "07").
            df_base["trial_id"] = df_base.trial_id.map(lambda s: "".join(c for c in s if c.isdigit()))
            # Per-trial series lengths help spot truncated recordings.
            # `len` replaces the redundant `lambda l: len(l)` wrappers.
            df_base["len_pupil_dilation"] = df_base.pupil_dilation.map(len)
            df_base["len_baseline"] = df_base.baseline.map(len)
            st.info(f"number of files: {len(file_names)}")
            # NOTE(review): cached only once per session — re-uploading
            # different files will NOT refresh st.session_state['df_base'];
            # confirm this is the intended caching behavior.
            if 'df_base' not in st.session_state:
                st.session_state['df_base'] = df_base
    else:
        st.caption("Upload your data using the sidebar to start :sunglasses:")

# Display the enriched original data whenever a cached copy exists.
if 'df_base' in st.session_state:
    st.markdown("Your original data with some extra information about the length of the time-series fields")
    st.dataframe(st.session_state['df_base'])
    
# Cleaning starts
with col1:
    if not df_base.empty:
        st.markdown("**Cleaning actions**")
        detect_blinking = st.button("I want to clean my data 🤗")
        number_of_blinks = 0
        if detect_blinking:
            # Cache the working copy once so later actions operate on it.
            if 'df' not in st.session_state:
                st.session_state['df'] = df_base
            # A sample of exactly 0.0 marks an eye blink in either series.
            number_of_blinks = sum(
                f == 0.0
                for column in ('pupil_dilation', 'baseline')
                for ser in df_base[column]
                for f in ser
            )
            # BUG FIX: refresh the cached count on every detection (the
            # original stored it only once, so re-running detection kept a
            # stale value). Storing it only when blinks were found also keeps
            # the "no blinking values" caption below reachable, which the
            # original made unreachable once 'blinks' was ever cached.
            if number_of_blinks:
                st.session_state['blinks'] = number_of_blinks

        if "blinks" in st.session_state:
            # BUG FIX: report the session-cached count. The local
            # `number_of_blinks` resets to 0 on every Streamlit rerun, so the
            # original message showed 0 after the first interaction.
            st.info(f"blinking values (0.0) were found in {st.session_state['blinks']} time-steps in all your data")
            remove_blinking = st.button("Remove blinking 🧹")
            # df in column 2
            if remove_blinking:
                df_right = st.session_state.df.copy(deep=True)
                # Drop every 0.0 sample from both time-series columns.
                df_right.pupil_dilation = df_right.pupil_dilation.map(lambda ser: [f for f in ser if f != 0.0])
                df_right.baseline = df_right.baseline.map(lambda ser: [f for f in ser if f != 0.0])
                st.session_state['df'] = df_right.copy(deep=True)
                st.success("Blinking values have been removed!")
                st.session_state.df_base = df_right
        elif detect_blinking and not number_of_blinks:
            st.caption("No blinking values were found in your data! ")

# Add calculated fields
if 'df' in st.session_state or 'df_right' in st.session_state:
    df_right = st.session_state.df.copy(deep=True)
    if "baseline" in list(df_right.keys()):
        st.markdown(f"A **baseline** feature has been found on your data, do you want to merge it with any of the other features in a new calculated field?")
        option = st.multiselect('Select a feature to create relative calculated feature ➕', [k for k in list(df_right.keys()) if k != 'baseline'], [[k for k in list(df_right.keys()) if k != 'baseline'][-4]])
        relative_key = f"relative_{option[0]}"
        add_relative = st.button(f"Add {relative_key}")
        if add_relative:
            baseline_mean = [sum(s)/len(s) for s in df_right['baseline']]
            df_right[relative_key] = [[field_value - baseline_mean[i] for field_value in df_right[option[0]][i]] for i in range(len(df_right))]
            st.markdown("After adding calculated fields and removing blinking values (when applied)")
            st.dataframe(df_right)
            csv = convert_df(df_right)
            if 'df_right' not in st.session_state:
                st.session_state['df_right'] = df_right
            # Save transformations to disk
            downl = st.download_button("Download CSV 💾", csv, "file.csv", "text/csv", key='download-csv')
            if downl:
                st.info("Your data has been downloaded, you can visualize and detect outliers in the 'Plotting' and 'Detect Outliers' pages on the sidebar.")

# Closing reminder, shown in the left column whenever data has been loaded.
if not df_base.empty:
    with col1:
        st.warning("Consider running outlier detection to clean your data!", icon="⚠️")