Spaces:

alfraser
/

llm-arch

Runtime error

App Files Files Community

alfraser committed on Nov 28, 2023

Commit

53dc0ac

•

1 Parent(s): cad56b0

Migrated the databrowser code into the repo for hugging face.

Browse files

Files changed (4) hide show

pages/020_Data_Browser.py +114 -0
requirements.txt +2 -0
src/common.py +4 -0
src/datatypes.py +164 -0

pages/020_Data_Browser.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import streamlit as st
+import pandas as pd
+import plotly.express as px
+from src.st_helpers import st_setup
+from src.datatypes import *
+if st_setup('LLM Arch'):
+    if not DataLoader.loaded:
+        DataLoader.load_data()
+    summary = st.container()
+    with summary:
+        sumcol1, sumcol2 = st.columns([1, 3])
+        with sumcol1:
+            dbs = DataLoader.available_dbs()
+            if len(dbs) > 1:
+                st.write(f'**:1234: Database Selector**')
+                idx = dbs.index(DataLoader.active_db)
+                DataLoader.set_db_name(st.selectbox("Connected to:", dbs, index=idx, label_visibility="collapsed"))
+            st.write(f'**:1234: Summary Statistics**')
+            summary = f'- **{len(Category.all):,}** categories'
+            summary += f'\n- **{len(Product.all):,}** products'
+            summary += f'\n- **{len(Feature.all):,}** features'
+            summary += f'\n- **{len(Review.all):,}** reviews'
+            st.markdown(summary)
+            if st.button('Force data reload'):
+                DataLoader.load_data(True)
+                st.rerun()
+        with sumcol2:
+            cats = Category.all_sorted()
+            with st.expander("**Review Counts**"):
+                category_names = [c.name for c in cats]
+                category_review_counts = [sum([p.review_count for p in c.products]) for c in cats]
+                data = zip(category_names, category_review_counts)
+                df = pd.DataFrame(data, columns=["Category", "Review Count"])
+                st.bar_chart(df, x="Category", y="Review Count")
+            with st.expander("**Product Ratings**"):
+                data = []
+                for c in cats:
+                    for p in c.products:
+                        data.append([c.name, p.average_rating])
+                df = pd.DataFrame(data, columns=['Category', 'Mean Product Rating'])
+                fig = px.box(df, x="Category", y="Mean Product Rating")
+                fig.update_xaxes(tickangle=-90)
+                st.plotly_chart(fig, use_container_width=True)
+            with st.expander("**Product Prices**"):
+                data = []
+                for c in cats:
+                    for p in c.products:
+                        data.append([c.name, p.price])
+                df = pd.DataFrame(data, columns=['Category', 'Price'])
+                fig = px.box(df, x="Category", y="Price")
+                fig.update_xaxes(tickangle=-90)
+                st.plotly_chart(fig, use_container_width=True)
+    subhead = st.container()
+    col1, col2 = st.columns([1, 3])
+    with col1:
+        st.write('**Category**')
+        cats = Category.all_sorted()
+        options = [f"{c.name}" for c in cats]
+        selection = st.radio("**Category**", options, label_visibility="collapsed")
+    selected_category = Category.by_name(selection)
+    with subhead:
+        st.write(f'### {selection}')
+    with col2:
+        features = [f.name for f in selected_category.features]
+        features.sort()
+        st.write(f"**{selection[:-1]} Features ({len(features)}):**")
+        st.write('; '.join(features))
+        prod_index = [p.id for p in selected_category.products]
+        prod_data = [[p.name, p.price, p.feature_count, ', '.join([str(f) for f in p.features]), p.review_count,
+                      p.average_rating, False, p.description] for p in selected_category.products]
+        prod_columns = ['Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating', 'Show Reviews?',
+                        'Description']
+        prod_df = pd.DataFrame(prod_data, index=prod_index, columns=prod_columns)
+        total_reviews = sum([p.review_count for p in selected_category.products])
+        st.write(f"**{selection} ({len(prod_index)}). Having {total_reviews} reviews in total:**")
+        edited_df = st.data_editor(prod_df, disabled=(
+        'Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating', 'Description'))
+        selected_product_count = edited_df['Show Reviews?'].sum()
+        selected_review_count = edited_df[edited_df['Show Reviews?']]['Review Count'].sum()
+        st.write(f"**{selection[:-1]} Reviews ({selected_review_count} from {selected_product_count} products):**")
+        if selected_review_count > 0:
+            selected_products = list(edited_df[edited_df['Show Reviews?']].index)
+            products = Product.for_ids(selected_products)
+            rev_data = []
+            rev_index = []
+            for p in products:
+                for r in p.reviews:
+                    rev_index.append(r.id)
+                    rev_data.append([p.name, r.rating, r.review_text])
+            rev_columns = ['Product', 'Review Rating', 'Review Text']
+            rev_df = pd.DataFrame(rev_data, index=rev_index, columns=rev_columns)
+            st.dataframe(rev_df, width=10000)
+        else:
+            st.write("Check boxes in the table above to see reviews for products.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ pandas==2.1.1
2	+ plotly==5.17.0

src/common.py ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ import os
2	+
3	+
4	+ data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')  # `data` directory one level above this module's directory

src/datatypes.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import sqlite3
+from typing import List
+from src.common import *
+class DataLoader:
+    active_db = "all_products"
+    db_file = os.path.join(data_dir, 'sqlite', f"{active_db}.db")
+    loaded = False
+    @classmethod
+    def set_db_name(cls, name: str):
+        if name != cls.active_db:
+            new_file = os.path.join(data_dir, 'sqlite', f"{name}.db")
+            print(f"Switching database file from {cls.db_file} to {new_file}")
+            cls.db_file = os.path.join(data_dir, f"{name}.db")
+            DataLoader.load_data(reload=True)
+            cls.active_db = name
+    @staticmethod
+    def current_db() -> str:
+        return [f[:-3] for f in os.listdir(data_dir) if f.startswith('products') and f.endswith('.db')]
+    @staticmethod
+    def available_dbs() -> List[str]:
+        return [f[:-3] for f in os.listdir(data_dir) if f.startswith('products') and f.endswith('.db')]
+    @staticmethod
+    def load_data(reload=False):
+        if DataLoader.loaded and not reload:
+            return
+        # Wipe out any prior data
+        Review.all = {}
+        Feature.all = {}
+        Product.all = {}
+        Category.all = {}
+        print(f"Loading {DataLoader.db_file}")
+        con = sqlite3.connect(DataLoader.db_file)
+        cur = con.cursor()
+        categories = cur.execute('SELECT * FROM categories').fetchall()
+        for c in categories:
+            Category.all[c[0]] = Category(c[0], c[1])
+        features = cur.execute('SELECT * FROM features').fetchall()
+        for f in features:
+            feat = Feature(f[0], f[1], Category.all[f[2]])
+            Feature.all[f[0]] = feat
+            Category.all[f[2]].features.append(feat)
+        products = cur.execute('SELECT * FROM products').fetchall()
+        for p in products:
+            prod = Product(p[0], p[1], p[2], p[3], Category.all[p[4]])
+            Product.all[p[0]] = prod
+            Category.all[p[4]].products.append(prod)
+        prod_feats = cur.execute('SELECT * FROM product_features').fetchall()
+        for pf in prod_feats:
+            Product.all[pf[1]].features.append(Feature.all[pf[2]])
+            Feature.all[pf[2]].products.append(Product.all[pf[1]])
+        reviews = cur.execute('SELECT * FROM reviews').fetchall()
+        for r in reviews:
+            rev = Review(r[0], r[2], r[3], Product.all[r[1]])
+            Review.all[r[0]] = rev
+            Product.all[r[1]].reviews.append(rev)
+        print("Data loaded")
+        DataLoader.loaded = True
+class Category:
+    all = {}
+    @staticmethod
+    def all_sorted():
+        all_cats = list(Category.all.values())
+        all_cats.sort(key=lambda x: x.name)
+        return all_cats
+    @staticmethod
+    def by_name(name: str):
+        all_cats = list(Category.all.values())
+        for c in all_cats:
+            if c.name == name:
+                return c
+    def __init__(self, id, name):
+        self.id = id
+        self.name = name
+        self.features = []
+        self.products = []
+    @property
+    def feature_count(self):
+        return len(self.features)
+    @property
+    def product_count(self):
+        return len(self.products)
+class Feature:
+    all = {}
+    def __init__(self, id, name, category):
+        self.id = id
+        self.name = name
+        self.category = category
+        self.products = []
+    @property
+    def product_count(self):
+        return len(self.products)
+    def __repr__(self):
+        return self.name
+class Product:
+    all = {}
+    def __init__(self, id, name, description, price, category):
+        self.id = id
+        self.name = name
+        self.description = description
+        self.price = round(price, 2)
+        self.category = category
+        self.features = []
+        self.reviews = []
+    @property
+    def feature_count(self):
+        return len(self.features)
+    @property
+    def review_count(self):
+        return len(self.reviews)
+    @property
+    def average_rating(self, decimals=2):
+        if self.review_count == 0:
+            return 0.0
+        return float(round(sum([r.rating for r in self.reviews]) / self.review_count, decimals))
+    @staticmethod
+    def for_ids(ids: List[str]):
+        return[Product.all[i] for i in ids]
+class Review:
+    all = {}
+    def __init__(self, id, rating, review_text, product):
+        self.id = id
+        self.rating = rating
+        self.review_text = review_text
+        self.product = product
+if __name__ == "__main__":
+    DataLoader.load_data()
+    print('test')