Explore dataframe filtering
Browse files- appStore/target.py +101 -2
appStore/target.py
CHANGED
@@ -16,6 +16,11 @@ from utils.config import get_classifier_params
|
|
16 |
from io import BytesIO
|
17 |
import xlsxwriter
|
18 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
# Declare all the necessary variables
|
21 |
classifier_identifier = 'target'
|
@@ -132,8 +137,12 @@ def target_display():
|
|
132 |
# st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
|
133 |
# hits = hits.reset_index(drop =True)
|
134 |
st.write('----------------')
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
137 |
df_xlsx = to_excel(df,hits)
|
138 |
|
139 |
with st.sidebar:
|
@@ -142,6 +151,96 @@ def target_display():
|
|
142 |
data=df_xlsx ,
|
143 |
file_name= os.path.splitext(os.path.basename(st.session_state['filename']))[0]+'.xlsx')
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
# else:
|
146 |
# st.info("🤔 No Targets found")
|
147 |
# count_df = df['Target Label'].value_counts()
|
|
|
16 |
from io import BytesIO
|
17 |
import xlsxwriter
|
18 |
import plotly.express as px
|
19 |
+
from pandas.api.types import (
|
20 |
+
is_categorical_dtype,
|
21 |
+
is_datetime64_any_dtype,
|
22 |
+
is_numeric_dtype,
|
23 |
+
is_object_dtype,)
|
24 |
|
25 |
# Declare all the necessary variables
|
26 |
classifier_identifier = 'target'
|
|
|
137 |
# st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
|
138 |
# hits = hits.reset_index(drop =True)
|
139 |
st.write('----------------')
|
140 |
+
|
141 |
+
|
142 |
+
st.title("Auto Filter Dataframes in Streamlit")
|
143 |
+
st.dataframe(filter_dataframe(hits))
|
144 |
+
# st.write('Explore the data')
|
145 |
+
# AgGrid(hits)
|
146 |
df_xlsx = to_excel(df,hits)
|
147 |
|
148 |
with st.sidebar:
|
|
|
151 |
data=df_xlsx ,
|
152 |
file_name= os.path.splitext(os.path.basename(st.session_state['filename']))[0]+'.xlsx')
|
153 |
|
154 |
+
# st.write(
|
155 |
+
# """This app accomodates the blog [here](https://blog.streamlit.io/auto-generate-a-dataframe-filtering-ui-in-streamlit-with-filter_dataframe/)
|
156 |
+
# and walks you through one example of how the Streamlit
|
157 |
+
# Data Science Team builds add-on functions to Streamlit.
|
158 |
+
# """
|
159 |
+
# )
|
160 |
+
|
161 |
+
|
162 |
+
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns.

    Renders an "Add filters" checkbox; when checked, shows a multiselect of
    columns and, per selected column, a type-appropriate widget:
    categorical / low-cardinality -> multiselect, numeric -> range slider,
    everything else -> substring/regex text input.

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe (the original object is never
        mutated — a copy is taken before filtering)
    """
    modify = st.checkbox("Add filters")

    if not modify:
        return df

    # Work on a copy so widget-driven filtering never mutates the caller's frame.
    df = df.copy()

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            left.write("↳")
            # Treat columns with < 10 unique values as categorical
            if is_categorical_dtype(df[column]) or df[column].nunique() < 10:
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # BUGFIX: a constant column gives _max == _min, so the old
                # step of (_max - _min) / 100 was 0, which st.slider rejects.
                step = (_max - _min) / 100 or 1.0
                user_num_input = right.slider(
                    f"Values for {column}",
                    _min,
                    _max,
                    (_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    # BUGFIX: na=False — NaN cells otherwise yield NaN in the
                    # boolean mask and boolean indexing raises.
                    df = df[df[column].str.contains(user_text_input, na=False)]

    return df
|
237 |
+
|
238 |
+
|
239 |
+
# df = pd.read_csv(
|
240 |
+
# "https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins.csv"
|
241 |
+
# )
|
242 |
+
|
243 |
+
|
244 |
# else:
|
245 |
# st.info("🤔 No Targets found")
|
246 |
# count_df = df['Target Label'].value_counts()
|