Szymon Woźniak committed on
Commit
329333f
1 Parent(s): eef0f47

add filtering dataframe

Browse files
filter_dataframe.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit.components.v1 as components
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ from pandas.api.types import (
7
+ is_categorical_dtype,
8
+ is_datetime64_any_dtype,
9
+ is_numeric_dtype,
10
+ is_object_dtype,
11
+ )
12
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    A checkbox toggles the filter UI. When it is unchecked the input
    dataframe is returned untouched. Otherwise the function works on a copy,
    lets the viewer pick columns to filter on, and renders one widget per
    picked column depending on the column's content:

    * categorical dtype or fewer than 10 unique values -> multiselect
    * numeric dtype -> range slider
    * datetime dtype -> date range picker (both end days inclusive)
    * anything else -> text box matched as substring / regex

    Filters are applied cumulatively, in the order the columns were picked.

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    import re  # local import: only needed to report invalid user regexes

    modify = st.checkbox("Add filters")

    if not modify:
        return df

    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                # Deliberate best-effort: a column that cannot be parsed as
                # dates is simply left as it is and filtered as text/category.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            left.write("↳")
            # isinstance check replaces the deprecated is_categorical_dtype.
            is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
            # Treat columns with < 10 unique values as categorical
            if is_categorical or df[column].nunique() < 10:
                options = df[column].unique()
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    options,
                    default=list(options),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    _min,
                    _max,
                    (_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                # The widget yields fewer than two dates while the viewer is
                # still picking the range; only filter on a complete range.
                if len(user_date_input) == 2:
                    start_date, end_date = map(pd.to_datetime, user_date_input)
                    # The picker returns dates (midnight). Use a half-open
                    # interval ending at the start of the following day so
                    # that rows later on the last selected day are kept.
                    end_exclusive = end_date + pd.Timedelta(days=1)
                    in_range = (df[column] >= start_date) & (
                        df[column] < end_exclusive
                    )
                    df = df.loc[in_range]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    try:
                        # na=False: missing values never match, and the mask
                        # stays purely boolean so it can be used for indexing.
                        matches = df[column].str.contains(
                            user_text_input, na=False
                        )
                    except re.error as err:
                        # Input is not a valid regex: report it in the UI and
                        # leave this column unfiltered instead of crashing.
                        right.error(f"Invalid regular expression: {err}")
                    else:
                        df = df[matches]

    return df
pages/1_Language_Typology.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  import time
3
  import numpy as np
4
  import pandas as pd
 
5
 
6
 
7
  @st.cache_data
@@ -20,4 +21,4 @@ st.write(
20
 
21
  df = get_typology_df()
22
 
23
- st.dataframe(df)
 
2
  import time
3
  import numpy as np
4
  import pandas as pd
5
+ from filter_dataframe import filter_dataframe
6
 
7
 
8
  @st.cache_data
 
21
 
22
  df = get_typology_df()
23
 
24
+ st.dataframe(filter_dataframe(df))