"""Streamlit app for uploading tabular data files and exploring them interactively.

Pages (chosen from the sidebar menu): Home, Upload Data, Add Features,
Analyze Data and Iframe. Uploaded files and the DataFrames loaded from them
are kept in ``st.session_state`` under the keys ``data_files``,
``data_frames`` and ``filtered_data``; the bivariate chart choice is kept
under ``chart_type``.
"""
import operator
import os

import pandas as pd
import pandas_profiling  # imported for its side effect: provides DataFrame.profile_report()
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots
from streamlit_ace import st_ace
from streamlit_option_menu import option_menu
from streamlit_pandas_profiling import st_profile_report

UPLOAD_DIR_NAME = "uploaded_files"
ADD_FEATURE_STUB = "def add_feature(row):"

# Extensions the uploader accepts but that cannot be turned into a DataFrame here.
_NON_TABULAR_EXTENSIONS = ('.pkl', '.pdf')
_EXCEL_EXTENSIONS = ('.xls', '.xlsx')

# Filter operations that compare a column against a single typed-in value.
_COMPARISONS = {
    'Equals': operator.eq,
    'Greater Than': operator.gt,
    'Less Than': operator.lt,
}


def _session_dict(key):
    """Return the dict stored in session state under *key*, creating it if absent."""
    if key not in st.session_state:
        st.session_state[key] = {}
    return st.session_state[key]


def _load_dataframe(file_path):
    """Load *file_path* into a DataFrame based on its extension.

    Excel files go through ``pd.read_excel``; everything else is read as CSV,
    except pickle/PDF uploads, which are rejected.

    Raises:
        ValueError: if the extension is one that cannot be loaded as a table.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension in _NON_TABULAR_EXTENSIONS:
        # NOTE: pickle is deliberately not loaded - unpickling an uploaded
        # file would execute arbitrary code.
        raise ValueError(f"'{extension}' files cannot be loaded as tabular data")
    if extension in _EXCEL_EXTENSIONS:
        return pd.read_excel(file_path)
    return pd.read_csv(file_path)


def set_data_files_session_object(file_name, file_path):
    """Remember the on-disk *file_path* of an uploaded file under *file_name*."""
    _session_dict('data_files')[file_name] = file_path


def set_filtered_data_session_object(df, file_name):
    """Store the filtered DataFrame *df* for *file_name* in session state."""
    _session_dict('filtered_data')[file_name] = df


def set_dataframe_session_object(file_name, file_path):
    """Load *file_path* into a DataFrame and cache it under *file_name*.

    Raises:
        ValueError: if the file type cannot be loaded as a table.
    """
    _session_dict('data_frames')[file_name] = _load_dataframe(file_path)


def save_file(file_object):
    """Write an uploaded file to the upload directory and register it.

    The file is always written and recorded in ``data_files``. A DataFrame is
    cached as well when the file can be parsed; otherwise a warning is shown
    instead of crashing the page.
    """
    # Only the final path component is used so a crafted upload name cannot
    # escape the upload directory.
    file_name = os.path.basename(file_object.name)
    file_path = os.path.join(os.getcwd(), UPLOAD_DIR_NAME, file_name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())
    set_data_files_session_object(file_name, file_path)
    try:
        set_dataframe_session_object(file_name, file_path)
    except (ValueError, ImportError) as err:
        # ValueError also covers pandas parser/decoding errors; ImportError
        # covers a missing optional Excel engine.
        st.warning(f"'{file_name}' was saved but could not be loaded as a table: {err}")


def create_upload_file_component():
    """Render the file uploader and persist every uploaded file."""
    uploaded_files = st.file_uploader(
        "Upload one file at a time.",
        type=['csv', 'xls', 'xlsx', 'pkl', 'pdf'],
        accept_multiple_files=True,
    )
    if uploaded_files:
        os.makedirs(os.path.join(os.getcwd(), UPLOAD_DIR_NAME), mode=0o777, exist_ok=True)
        for uploaded_file in uploaded_files:
            save_file(uploaded_file)


def create_component_to_add_target_func(selected_files, dfs, i):
    """Render the target-variable input and a demo of dynamically defined code.

    *selected_files* and *dfs* are accepted for signature parity with the
    other per-file components but are not used here.
    """
    st.text_input("Name of the target variable", key="target_var" + str(i))
    code = "def f1(x): return str(x * 3)"
    # exec() inside a function cannot create a local that is callable by bare
    # name afterwards, so run the code in an explicit namespace.
    namespace = {}
    exec(code, namespace)
    st.write(namespace['f1'](3))


def _coerce_filter_value(column, raw_value):
    """Convert text-box input to a value comparable with *column*.

    Returns a float for numeric columns (``None`` if the text is not a
    number) and the raw string otherwise.
    """
    if pd.api.types.is_numeric_dtype(column):
        try:
            return float(raw_value)
        except ValueError:
            return None
    return raw_value


def set_filtered_data(df, selected_files, i):
    """Render filter widgets for file *i* and store the filtered DataFrame.

    When no filter value is active, any previously stored filtered frame for
    the file is discarded so analysis falls back to the unfiltered data.
    """
    action = "data_filter"
    file_name = selected_files[i]
    col_to_filter = st.selectbox("Select the field to Filter on ", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation ",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    column = df[col_to_filter]
    # One key per operation: the value widget changes type with the operation.
    value_key = f"{action}_col_filter_val_{filter_operation}_{i}"
    mask = None

    if filter_operation == 'In':
        chosen_values = st.multiselect("Select Values to Filter on ", column.unique(), key=value_key)
        if chosen_values:
            mask = column.isin(chosen_values)
    elif filter_operation in _COMPARISONS:
        raw_value = st.text_input("Enter a numeric value", key=value_key)
        if raw_value:
            value = _coerce_filter_value(column, raw_value)
            if value is None:
                st.warning(f"'{raw_value}' is not a number; no filter applied.")
            else:
                mask = _COMPARISONS[filter_operation](column, value)
    elif filter_operation == 'In Between':
        options = sorted(column.dropna().unique().tolist())
        if len(options) < 2:
            st.warning("The selected field needs at least two distinct values for a range filter.")
        else:
            # A two-element default makes select_slider return a (low, high) pair.
            low, high = st.select_slider("Select range", options=options,
                                         value=(options[0], options[-1]), key=value_key)
            mask = column.between(low, high)

    if mask is None:
        _session_dict('filtered_data').pop(file_name, None)
        return
    set_filtered_data_session_object(df[mask], file_name)
    st.write('data filtered', st.session_state['filtered_data'][file_name].shape)


def _render_profile_report(df_for_analysis, i):
    """Render the pandas-profiling report for all columns or a chosen subset."""
    full_data_check = st.checkbox("Report on all columns", key="full_profile_check_" + str(i))
    if full_data_check:
        st.warning("This might take a lot of time to generate the report depending on the size of the data.Select a subset of columns")
        if st.button("Run on full data", key="run_full_profile_" + str(i)):
            st_profile_report(df_for_analysis.profile_report())
        return
    col_subset = st.multiselect("Select subset of columns", df_for_analysis.columns.values,
                                key='filter_subset_' + str(i))
    if col_subset:
        st_profile_report(df_for_analysis[col_subset].profile_report())


def _render_univariate_analysis(df_for_analysis, i):
    """Plot each chosen column: scatter for many-valued numerics, else a distribution."""
    cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                       options=df_for_analysis.columns.values,
                                       key='univariate_columns_' + str(i))
    for col in cols_for_analysis:
        series = df_for_analysis[col]
        if pd.api.types.is_numeric_dtype(series) and series.nunique() > 10:
            fig = px.scatter(x=df_for_analysis.index, y=series, labels=dict(x="Index", y=col))
            st.plotly_chart(fig, use_container_width=True)
            continue

        # Top 20 values with both their share and their absolute count.
        value_dist_df = series.value_counts(normalize=True).head(20).reset_index()
        value_dist_df.columns = [col, '% Distribution']
        value_dist_df_counts = series.value_counts().head(20).reset_index()
        value_dist_df_counts.columns = [col, 'Count']
        value_dist_df = value_dist_df.merge(value_dist_df_counts, on=col)

        count_bars = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                            marker=dict(color='rgb(34,163,192)'))
        share_line = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                                name='% Distribution', yaxis='y2')
        fig = make_subplots(specs=[[{"secondary_y": True}]])
        fig.add_trace(count_bars)
        fig.add_trace(share_line, secondary_y=True)
        fig.update_layout(height=600, width=800, title=f"{col} data distribution",
                          xaxis=dict(tickangle=-90))
        st.plotly_chart(fig, use_container_width=True)


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    """Render the analysis controls and outputs for ``selected_files[i]``."""
    file_name = selected_files[i]
    st.subheader(file_name)
    df = dfs[file_name]
    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))
    if filter_data:
        set_filtered_data(df, selected_files, i)
    analysis_actions = st.multiselect(
        "What analysis do you wish to do?",
        ['Summary of Data', 'Sample Data', 'Get Profile', 'Univariate Analysis', 'Bivariate Analysis'],
        key='analysis_action_' + str(i),
    )
    if not analysis_actions:
        return

    df_for_analysis = df
    if filter_data:
        # Fall back to the full frame until a filter has actually been applied.
        df_for_analysis = st.session_state.get('filtered_data', {}).get(file_name, df)

    for action in analysis_actions:
        if action == 'Sample Data':
            clear_chart_type_session_var()
            # sample() raises if asked for more rows than exist.
            st.write(df_for_analysis.sample(min(10, len(df_for_analysis))))
        elif action == 'Get Profile':
            clear_chart_type_session_var()
            _render_profile_report(df_for_analysis, i)
        elif action == 'Summary of Data':
            clear_chart_type_session_var()
            st.write(df_for_analysis.describe())
        elif action == 'Univariate Analysis':
            clear_chart_type_session_var()
            _render_univariate_analysis(df_for_analysis, i)
        elif action == "Bivariate Analysis":
            add_chart_options_to_sidebar(key="chart_type_radio_" + str(i))
            create_for_bivariate_analysis(selected_files, df_for_analysis, i)


def clear_chart_type_session_var():
    """Forget the bivariate chart type stored in session state, if any."""
    if 'chart_type' in st.session_state:
        del st.session_state['chart_type']


def add_chart_options_to_sidebar(key=None):
    """Render the chart-type radio in the sidebar and mirror it into session state.

    Args:
        key: optional widget key, needed when the radio is rendered more than
            once in the same run (one per analysed file).
    """
    with st.sidebar:
        viz_type = st.radio("Graph Type", ('None', 'Cross Tab', 'Pivot Table', 'Box Plot'), key=key)
    if viz_type and viz_type != 'None':
        st.session_state['chart_type'] = viz_type
    else:
        clear_chart_type_session_var()


def create_for_bivariate_analysis(selected_files, df, i):
    """Render target/column pickers and the chart chosen in the sidebar for *df*."""
    target_column = st.selectbox("Select the target column ", df.columns.values,
                                 key="bivariate_target_column_" + str(i))
    bivariate_columns = st.multiselect("Select the columns to analyse ", df.columns.values,
                                       key="bivariate_analysis_columns_" + str(i))
    if not bivariate_columns:
        return
    col_vals = [df[col] for col in bivariate_columns]
    chart_type = st.session_state.get('chart_type')
    if chart_type == 'Cross Tab':
        if len(col_vals) > 3:
            st.warning("Too many columns to split on. Please consider reducing the no of columns")
        crosstab_df = pd.crosstab(df[target_column], col_vals, margins=True)
        st.write(crosstab_df.to_html(), unsafe_allow_html=True)
    elif chart_type in ('Pivot Table', 'Box Plot'):
        st.info(f"{chart_type} is not implemented yet.")


def create_component_for_data_analysis():
    """Render the file picker and one analysis column per loadable selected file."""
    if 'data_files' not in st.session_state:
        st.write("Upload a file to start analysis")
        return
    data_files = st.session_state['data_files']
    selected_files = st.multiselect("Select the File(S) to analyze", list(data_files))
    if not selected_files:
        return

    data_frames = _session_dict('data_frames')
    dfs = {}
    for selected_file in selected_files:
        if selected_file not in data_frames:
            try:
                data_frames[selected_file] = _load_dataframe(data_files[selected_file])
            except (ValueError, ImportError) as err:
                st.warning(f"'{selected_file}' could not be loaded as a table: {err}")
                continue
        dfs[selected_file] = data_frames[selected_file]

    loadable_files = [name for name in selected_files if name in dfs]
    if not loadable_files:
        return
    for i, col in enumerate(st.columns(len(loadable_files))):
        with col:
            create_component_for_analysis_for_single_df(loadable_files, dfs, i)


def _render_add_features_page():
    """Let the user define ``add_feature(row)`` and apply it as a new column."""
    if 'data_files' not in st.session_state:
        return
    selected_file = st.selectbox("Select the File(S) to analyze", list(st.session_state['data_files']))
    if not selected_file:
        return
    df = _session_dict('data_frames').get(selected_file)
    if df is None:
        st.warning(f"'{selected_file}' is not available as a table.")
        return

    st.header("Enter the function definiton to create a new feature")
    feature_name = st.text_input("Enter the New Feature Name")
    st.warning("please retain the function signature as 'add_feature(row)'")
    content = st_ace(language="python", value=ADD_FEATURE_STUB)
    if content and content != ADD_FEATURE_STUB:
        if not feature_name:
            st.warning("Enter a name for the new feature before applying the function.")
        else:
            # SECURITY: this executes code typed into the editor with the full
            # privileges of the app process; only expose it to trusted users.
            # The code runs in a copy of the module globals so it can use the
            # same names (pd, os, ...) while its definitions stay retrievable.
            namespace = dict(globals())
            try:
                exec(content, namespace)
                add_feature = namespace.get('add_feature')
                if not callable(add_feature):
                    raise NameError("the editor content does not define add_feature(row)")
                df[feature_name] = df.apply(add_feature, axis=1)
            except Exception as err:  # UI boundary: surface user-code errors on the page
                st.exception(err)
            else:
                st.session_state['data_frames'][selected_file] = df
    st.write(df.columns.values)


def main():
    """Render the page title, the sidebar menu and the page it selects."""
    st.title("Model Results Analyzer")
    with st.sidebar:
        selected_menu = option_menu(
            None,
            ["Home", "Upload Data", "Add Features", "Analyze Data", "Iframe"],
            icons=['house', 'cloud-upload', "list-task", 'gear'],
            menu_icon="cast",
            default_index=0,
            orientation="vertical",
            styles={
                "container": {"padding": "0!important", "background-color": "#fafafa"},
                "icon": {"color": "orange", "font-size": "15px"},
                "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                             "--hover-color": "#eee"},
                "nav-link-selected": {"background-color": "green"},
            },
        )

    if selected_menu == "Home":
        st.markdown('**This is to analyse models performance.**')
    elif selected_menu == "Upload Data":
        create_upload_file_component()
        if 'data_files' in st.session_state:
            st.write(pd.DataFrame(data={"File Name": list(st.session_state['data_files'])}))
    elif selected_menu == "Analyze Data":
        create_component_for_data_analysis()
    elif selected_menu == "Add Features":
        _render_add_features_page()
    elif selected_menu == "Iframe":
        st.components.v1.iframe("https://docs.streamlit.io/en/latest", width=None, height=None,
                                scrolling=False)


# Streamlit runs the script with __name__ == "__main__", so the guard keeps the
# app working while allowing the module to be imported without rendering.
if __name__ == "__main__":
    main()