Szymon Woźniak committed on
Commit
4a15609
1 Parent(s): 329333f

Add language statistics page, make inferring categorical for numeric columns optional, and use an integer slider when the column dtype is integer

Browse files
.gitignore CHANGED
@@ -159,4 +159,5 @@ cython_debug/
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
 
162
- *.ipynb
 
 
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
 
162
+ *.ipynb
163
+ *.code-workspace
data/language_stats.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e52fca5ff80ab2c16ba8bbd99244f7cbe5e2a988f45443fe48c1b2a176f98e9c
3
+ size 9087
filter_dataframe.py CHANGED
@@ -1,3 +1,4 @@
 
1
 
2
  import streamlit.components.v1 as components
3
  import pandas as pd
@@ -7,14 +8,16 @@ from pandas.api.types import (
7
  is_categorical_dtype,
8
  is_datetime64_any_dtype,
9
  is_numeric_dtype,
 
10
  is_object_dtype,
11
  )
12
- def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
13
  """
14
  Adds a UI on top of a dataframe to let viewers filter columns
15
 
16
  Args:
17
  df (pd.DataFrame): Original dataframe
 
18
 
19
  Returns:
20
  pd.DataFrame: Filtered dataframe
@@ -45,17 +48,35 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
45
  left, right = st.columns((1, 20))
46
  left.write("↳")
47
  # Treat columns with < 10 unique values as categorical
48
- if is_categorical_dtype(df[column]) or df[column].nunique() < 10:
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  user_cat_input = right.multiselect(
50
  f"Values for {column}",
51
  df[column].unique(),
52
  default=list(df[column].unique()),
53
  )
54
  df = df[df[column].isin(user_cat_input)]
55
- elif is_numeric_dtype(df[column]):
56
- _min = float(df[column].min())
57
- _max = float(df[column].max())
58
- step = (_max - _min) / 100
 
 
 
 
 
59
  user_num_input = right.slider(
60
  f"Values for {column}",
61
  _min,
 
1
+ # https://blog.streamlit.io/auto-generate-a-dataframe-filtering-ui-in-streamlit-with-filter_dataframe/
2
 
3
  import streamlit.components.v1 as components
4
  import pandas as pd
 
8
  is_categorical_dtype,
9
  is_datetime64_any_dtype,
10
  is_numeric_dtype,
11
+ is_integer_dtype,
12
  is_object_dtype,
13
  )
14
+ def filter_dataframe(df: pd.DataFrame, numeric_as_categorical: bool = True) -> pd.DataFrame:
15
  """
16
  Adds a UI on top of a dataframe to let viewers filter columns
17
 
18
  Args:
19
  df (pd.DataFrame): Original dataframe
20
+ numeric_as_categorical (bool, optional): Whether to treat numeric columns with a low number of unique values as categorical. Defaults to True.
21
 
22
  Returns:
23
  pd.DataFrame: Filtered dataframe
 
48
  left, right = st.columns((1, 20))
49
  left.write("↳")
50
  # Treat columns with < 10 unique values as categorical
51
+ low_nunique = df[column].nunique() < 10
52
+ is_categorical = is_categorical_dtype(df[column])
53
+ is_numeric = is_numeric_dtype(df[column])
54
+
55
+ treat_as_categorical = False
56
+ if is_categorical:
57
+ treat_as_categorical = True
58
+ elif low_nunique:
59
+ if is_numeric:
60
+ treat_as_categorical = numeric_as_categorical
61
+ else:
62
+ treat_as_categorical = True
63
+
64
+ if treat_as_categorical:
65
  user_cat_input = right.multiselect(
66
  f"Values for {column}",
67
  df[column].unique(),
68
  default=list(df[column].unique()),
69
  )
70
  df = df[df[column].isin(user_cat_input)]
71
+ elif is_numeric:
72
+ if is_integer_dtype(df[column]):
73
+ _min = int(df[column].min())
74
+ _max = int(df[column].max())
75
+ step = 1
76
+ else:
77
+ _min = float(df[column].min())
78
+ _max = float(df[column].max())
79
+ step = (_max - _min) / 100
80
  user_num_input = right.slider(
81
  f"Values for {column}",
82
  _min,
pages/2_Language_Statistics.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from filter_dataframe import filter_dataframe
4
+
5
+
6
+ @st.cache_data
7
+ def get_language_stats_df():
8
+ return pd.read_parquet("data/language_stats.parquet")
9
+
10
+
11
+
12
+ st.set_page_config(page_title="Language Statistics", page_icon="📈")
13
+
14
+ st.markdown("# Language Statistics")
15
+ st.sidebar.header("Language Statistics")
16
+ st.write(
17
+ """TODO: Description"""
18
+ )
19
+
20
+ df = get_language_stats_df()
21
+
22
+ st.dataframe(filter_dataframe(df, numeric_as_categorical=False))
pages/3_Dataset_Statistics.py ADDED
File without changes