llm-arch / pages /020_Data_Browser.py
alfraser's picture
Added page level comments to describe the purpose of each page
4f07f72
"""
This page allows users to browse the test data set. Mainly this is just to get a sense of the size,
content and composition of the dataset behind the project.
"""
import streamlit as st
import pandas as pd
import plotly.express as px
from src.st_helpers import st_setup
from src.datatypes import *
def show_db_selector_and_summary_in_container(container) -> None:
    """
    Renders the database selector and the headline dataset counts into the given container.

    The selector is only drawn when more than one database is available. Below it the
    number of categories, products, features and reviews is listed, followed by a
    button which asks the DataLoader to load the data again and then reruns the page.
    """
    with container:
        available = DataLoader.available_dbs()
        if len(available) > 1:
            st.write('**:1234: Database Selector**')
            # Pre-select whichever database is currently active
            current_position = available.index(DataLoader.active_db)
            chosen_db = st.selectbox("Connected to:", available, index=current_position,
                                     label_visibility="collapsed")
            DataLoader.set_db_name(chosen_db)

        st.write('**:1234: Summary Statistics**')
        bullet_lines = [
            f'- **{len(Category.all):,}** categories',
            f'- **{len(Product.all):,}** products',
            f'- **{len(Feature.all):,}** features',
            f'- **{len(Review.all):,}** reviews',
        ]
        st.markdown('\n'.join(bullet_lines))

        reload_requested = st.button('Force data reload')
        if reload_requested:
            DataLoader.load_data(True)
            st.rerun()
def _show_category_box_plot(cats, value_label, value_of) -> None:
    """
    Draws a box plot into the current streamlit context with one box per category,
    built from a single value taken from every product in that category.

    :param cats: the categories to plot; each is read for its name and its products
    :param value_label: used as both the dataframe column name and the y axis of the plot
    :param value_of: callable which returns the value to plot for a given product
    """
    rows = [[c.name, value_of(p)] for c in cats for p in c.products]
    df = pd.DataFrame(rows, columns=['Category', value_label])
    fig = px.box(df, x="Category", y=value_label)
    # Rotate the category tick labels by 90 degrees
    fig.update_xaxes(tickangle=-90)
    st.plotly_chart(fig, use_container_width=True)


def show_data_summary_charts_in_container(container) -> None:
    """
    Renders three expanders of per-category summary charts into the given container:
    a bar chart of total review counts, a box plot of the products' average ratings
    and a box plot of the products' prices.
    """
    with container:
        cats = Category.all_sorted()

        with st.expander("**Review Counts**"):
            # One row per category, holding the review count summed over its products
            rows = [(c.name, sum(p.review_count for p in c.products)) for c in cats]
            df = pd.DataFrame(rows, columns=["Category", "Review Count"])
            st.bar_chart(df, x="Category", y="Review Count")

        with st.expander("**Product Ratings**"):
            _show_category_box_plot(cats, 'Mean Product Rating', lambda p: p.average_rating)

        with st.expander("**Product Prices**"):
            _show_category_box_plot(cats, 'Price', lambda p: p.price)
def show_top_section() -> None:
    """
    Writes the top section of the streamlit page: a narrow left column holding the
    database selector and summary numbers, and a wider right column holding the
    summary charts.
    """
    with st.container():
        # 1:3 split between the summary numbers and the charts
        left_col, right_col = st.columns([1, 3])
        show_db_selector_and_summary_in_container(left_col)
        show_data_summary_charts_in_container(right_col)
def get_user_selected_category(container) -> Category:
    """
    Renders a radio-button list of the sorted category names into the given container
    and returns the category looked up by the name the user has currently selected.
    """
    with container:
        st.write('**Category**')
        names = [f"{cat.name}" for cat in Category.all_sorted()]
        # The radio's own label is collapsed as the heading is written just above it
        picked_name = st.radio("**Category**", names, label_visibility="collapsed")
    return Category.by_name(picked_name)
def show_category_datatable_in_container(category, container) -> None:
    """
    Displays a category of products into a given container. Shows the products themselves
    as a table and then, if any products are ticked to show their reviews, displays the
    associated reviews as a second table.

    :param category: the category to display; read for its name, singular_name, features and products
    :param container: the streamlit container to render into
    """
    show_reviews_col = 'Show Reviews?'

    with container:
        features = sorted(f.name for f in category.features)
        st.write(f"**{category.singular_name} Features ({len(features)}):**")
        st.write('; '.join(features))

        # Single source of truth for the table columns - the read-only list below is derived from it
        prod_columns = ['Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating',
                        show_reviews_col, 'Description']
        prod_index = [p.id for p in category.products]
        prod_data = [[p.name, p.price, p.feature_count, ', '.join(str(f) for f in p.features), p.review_count,
                      p.average_rating, False, p.description] for p in category.products]
        prod_df = pd.DataFrame(prod_data, index=prod_index, columns=prod_columns)

        total_reviews = sum(p.review_count for p in category.products)
        st.write(f"**{category.name} ({len(prod_index)}). Having {total_reviews} reviews in total:**")

        # Only the tick box column is editable; every other column is locked
        read_only_columns = [col for col in prod_columns if col != show_reviews_col]
        edited_df = st.data_editor(prod_df, disabled=read_only_columns)

        # Rows the user has ticked, filtered once and reused for the counts and the ids
        selected_rows = edited_df[edited_df[show_reviews_col]]
        selected_product_count = edited_df[show_reviews_col].sum()
        selected_review_count = selected_rows['Review Count'].sum()
        st.write(f"**{category.singular_name} Reviews ({selected_review_count} from {selected_product_count} products):**")

        if selected_review_count > 0:
            # The dataframe index holds the product ids, so it can be used to look the products up
            products = Product.for_ids(list(selected_rows.index))
            rev_index = []
            rev_data = []
            for p in products:
                for r in p.reviews:
                    rev_index.append(r.id)
                    rev_data.append([p.name, r.rating, r.review_text])
            rev_columns = ['Product', 'Review Rating', 'Review Text']
            rev_df = pd.DataFrame(rev_data, index=rev_index, columns=rev_columns)
            # NOTE(review): the very large width is presumably there to make the table fill
            # the available space - confirm before changing
            st.dataframe(rev_df, width=10000)
        else:
            # NOTE(review): this prompt is also shown when products are ticked but have no reviews
            st.write("Check boxes in the table above to see reviews for products.")
def show_bottom_section() -> None:
    """
    Writes the bottom section of the streamlit page: a heading naming the selected
    category, with the category picker in a narrow left column and the product and
    review tables for that category in a wider right column.
    """
    # The heading slot is created before the columns so that it sits above them, even
    # though its text is only known once the picker has been drawn and read
    heading_slot = st.container()
    picker_col, table_col = st.columns([1, 3])

    chosen = get_user_selected_category(picker_col)
    heading_slot.write(f'### {chosen.name}')
    show_category_datatable_in_container(chosen, table_col)
# Page entry point - streamlit runs this module top to bottom on every rerun.
# NOTE(review): the page is only drawn when st_setup returns a truthy value; what that
# signifies is defined in src.st_helpers - confirm there.
if st_setup('LLM Arch'):
    # Load the data only if the DataLoader does not already report it as loaded
    if not DataLoader.loaded:
        DataLoader.load_data()

    st.write("# Data Browser")
    show_top_section()
    show_bottom_section()