alfraser committed on
Commit
53dc0ac
1 Parent(s): cad56b0

Migrated the data browser code into the repo for Hugging Face.

Browse files
Files changed (4) hide show
  1. pages/020_Data_Browser.py +114 -0
  2. requirements.txt +2 -0
  3. src/common.py +4 -0
  4. src/datatypes.py +164 -0
pages/020_Data_Browser.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
+ from src.st_helpers import st_setup
6
+ from src.datatypes import *
7
+
8
# Data browser page: a summary panel (database selector, entity counts and
# per-category charts) followed by a category drill-down listing features,
# products and, on request, the reviews of the ticked products.
if st_setup('LLM Arch'):
    # Load the in-memory model on first use; DataLoader.loaded guards repeats.
    if not DataLoader.loaded:
        DataLoader.load_data()

    summary = st.container()
    with summary:

        sumcol1, sumcol2 = st.columns([1, 3])

        with sumcol1:
            dbs = DataLoader.available_dbs()
            if len(dbs) > 1:
                st.write('**:1234: Database Selector**')
                # The active database is not guaranteed to be one of the listed
                # files, so fall back to the first entry instead of letting
                # list.index raise ValueError.
                if DataLoader.active_db in dbs:
                    idx = dbs.index(DataLoader.active_db)
                else:
                    idx = 0
                DataLoader.set_db_name(st.selectbox("Connected to:", dbs, index=idx, label_visibility="collapsed"))

            st.write('**:1234: Summary Statistics**')
            # Kept in its own name so the `summary` container is not overwritten.
            summary_md = '\n'.join([
                f'- **{len(Category.all):,}** categories',
                f'- **{len(Product.all):,}** products',
                f'- **{len(Feature.all):,}** features',
                f'- **{len(Review.all):,}** reviews',
            ])
            st.markdown(summary_md)
            if st.button('Force data reload'):
                DataLoader.load_data(True)
                st.rerun()

        with sumcol2:
            cats = Category.all_sorted()

            with st.expander("**Review Counts**"):
                # One bar per category: total reviews across its products.
                category_names = [c.name for c in cats]
                category_review_counts = [sum(p.review_count for p in c.products) for c in cats]
                data = zip(category_names, category_review_counts)
                df = pd.DataFrame(data, columns=["Category", "Review Count"])
                st.bar_chart(df, x="Category", y="Review Count")

            with st.expander("**Product Ratings**"):
                # One row per product so the box plot shows the spread per category.
                data = [[c.name, p.average_rating] for c in cats for p in c.products]
                df = pd.DataFrame(data, columns=['Category', 'Mean Product Rating'])
                fig = px.box(df, x="Category", y="Mean Product Rating")
                fig.update_xaxes(tickangle=-90)
                st.plotly_chart(fig, use_container_width=True)

            with st.expander("**Product Prices**"):
                data = [[c.name, p.price] for c in cats for p in c.products]
                df = pd.DataFrame(data, columns=['Category', 'Price'])
                fig = px.box(df, x="Category", y="Price")
                fig.update_xaxes(tickangle=-90)
                st.plotly_chart(fig, use_container_width=True)

    # Placeholder created before the columns so the heading renders above them
    # even though the selection is only known after the radio widget runs.
    subhead = st.container()

    col1, col2 = st.columns([1, 3])

    with col1:

        st.write('**Category**')
        cats = Category.all_sorted()
        options = [f"{c.name}" for c in cats]
        selection = st.radio("**Category**", options, label_visibility="collapsed")

    selected_category = Category.by_name(selection)
    if selected_category is None:
        # Category.by_name returns None when no category matches (for example
        # when nothing has been loaded); stop here rather than failing on
        # attribute access below.
        st.warning('No categories are available in the current database.')
        st.stop()

    with subhead:
        st.write(f'### {selection}')

    with col2:
        features = sorted(f.name for f in selected_category.features)
        # selection[:-1] drops the final character of the category name for the heading.
        st.write(f"**{selection[:-1]} Features ({len(features)}):**")
        st.write('; '.join(features))

        prod_index = [p.id for p in selected_category.products]
        # 'Show Reviews?' starts unticked (False) and is the only editable column.
        prod_data = [[p.name, p.price, p.feature_count, ', '.join([str(f) for f in p.features]), p.review_count,
                      p.average_rating, False, p.description] for p in selected_category.products]
        prod_columns = ['Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating', 'Show Reviews?',
                        'Description']
        prod_df = pd.DataFrame(prod_data, index=prod_index, columns=prod_columns)
        total_reviews = sum(p.review_count for p in selected_category.products)
        st.write(f"**{selection} ({len(prod_index)}). Having {total_reviews} reviews in total:**")
        edited_df = st.data_editor(prod_df, disabled=(
            'Name', 'Price', 'Feature Count', 'Features', 'Review Count', 'Average Rating', 'Description'))

        selected_product_count = edited_df['Show Reviews?'].sum()
        selected_review_count = edited_df[edited_df['Show Reviews?']]['Review Count'].sum()

        st.write(f"**{selection[:-1]} Reviews ({selected_review_count} from {selected_product_count} products):**")
        if selected_review_count > 0:
            # The data frame index holds product ids, so ticked rows map straight to products.
            selected_products = list(edited_df[edited_df['Show Reviews?']].index)
            products = Product.for_ids(selected_products)
            rev_data = []
            rev_index = []
            for p in products:
                for r in p.reviews:
                    rev_index.append(r.id)
                    rev_data.append([p.name, r.rating, r.review_text])
            rev_columns = ['Product', 'Review Rating', 'Review Text']
            rev_df = pd.DataFrame(rev_data, index=rev_index, columns=rev_columns)
            st.dataframe(rev_df, width=10000)
        else:
            st.write("Check boxes in the table above to see reviews for products.")
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas==2.1.1
2
+ plotly==5.17.0
src/common.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+
3
+
4
# Location of the project's data directory: the `data` folder that sits next
# to the directory containing this module.
_module_dir = os.path.dirname(__file__)
data_dir = os.path.join(_module_dir, '..', 'data')
src/datatypes.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from typing import List
3
+
4
+ from src.common import *
5
+
6
+
7
class DataLoader:
    """Load a SQLite product database into the in-memory model classes.

    All state is held at class level: the name and path of the database in
    use, and a flag recording whether its contents have been loaded into the
    ``all`` registries of Category, Feature, Product and Review.
    """
    active_db = "all_products"
    db_file = os.path.join(data_dir, 'sqlite', f"{active_db}.db")
    loaded = False

    @classmethod
    def set_db_name(cls, name: str):
        """Switch to the database called *name* and reload everything from it.

        Does nothing when *name* is already the active database.
        """
        if name != cls.active_db:
            # The target is resolved directly under data_dir, which is the
            # directory available_dbs() lists. The log line reports this same
            # path so it matches the file that is actually opened.
            # NOTE(review): the default db_file lives under data_dir/sqlite -
            # confirm which directory alternative databases are meant to use.
            new_file = os.path.join(data_dir, f"{name}.db")
            print(f"Switching database file from {cls.db_file} to {new_file}")
            cls.db_file = new_file
            DataLoader.load_data(reload=True)
            cls.active_db = name

    @staticmethod
    def current_db() -> str:
        """Return the name of the database currently in use."""
        return DataLoader.active_db

    @staticmethod
    def available_dbs() -> List[str]:
        """Return the names (without the ``.db`` suffix) of the selectable databases.

        These are the files directly inside ``data_dir`` whose names start
        with ``products`` and end with ``.db``.
        """
        return [f[:-3] for f in os.listdir(data_dir) if f.startswith('products') and f.endswith('.db')]

    @staticmethod
    def load_data(reload=False):
        """Read every table from ``db_file`` and rebuild the in-memory model.

        Args:
            reload: when true, discard and re-read the data even if it has
                already been loaded.
        """
        if DataLoader.loaded and not reload:
            return

        # Wipe out any prior data. The flag is cleared first so that a failed
        # load is not mistaken for a successful one on the next call.
        DataLoader.loaded = False
        Review.all = {}
        Feature.all = {}
        Product.all = {}
        Category.all = {}

        print(f"Loading {DataLoader.db_file}")
        con = sqlite3.connect(DataLoader.db_file)
        try:
            cur = con.cursor()

            # Rows are read positionally; tables are loaded parents-first so
            # each row can be linked to objects that already exist.
            categories = cur.execute('SELECT * FROM categories').fetchall()
            for c in categories:
                Category.all[c[0]] = Category(c[0], c[1])

            features = cur.execute('SELECT * FROM features').fetchall()
            for f in features:
                feat = Feature(f[0], f[1], Category.all[f[2]])
                Feature.all[f[0]] = feat
                Category.all[f[2]].features.append(feat)

            products = cur.execute('SELECT * FROM products').fetchall()
            for p in products:
                prod = Product(p[0], p[1], p[2], p[3], Category.all[p[4]])
                Product.all[p[0]] = prod
                Category.all[p[4]].products.append(prod)

            # Link table: column 1 is used as the product id, column 2 as the
            # feature id; both sides of the relationship are recorded.
            prod_feats = cur.execute('SELECT * FROM product_features').fetchall()
            for pf in prod_feats:
                Product.all[pf[1]].features.append(Feature.all[pf[2]])
                Feature.all[pf[2]].products.append(Product.all[pf[1]])

            reviews = cur.execute('SELECT * FROM reviews').fetchall()
            for r in reviews:
                rev = Review(r[0], r[2], r[3], Product.all[r[1]])
                Review.all[r[0]] = rev
                Product.all[r[1]].reviews.append(rev)
        finally:
            # Always release the connection, including when a query fails.
            con.close()

        print("Data loaded")
        DataLoader.loaded = True
73
+
74
+
75
class Category:
    """A product category together with the features and products attached to it."""

    # Registry of categories keyed by id.
    all = {}

    @staticmethod
    def all_sorted():
        """Return every registered category as a list ordered by name."""
        return sorted(Category.all.values(), key=lambda cat: cat.name)

    @staticmethod
    def by_name(name: str):
        """Return the first registered category called *name*, or None if there is none."""
        matches = (cat for cat in list(Category.all.values()) if cat.name == name)
        return next(matches, None)

    def __init__(self, id, name):
        self.id, self.name = id, name
        # Filled in by whoever builds the model; both start empty.
        self.features = []
        self.products = []

    @property
    def feature_count(self):
        """Number of features attached to this category."""
        return len(self.features)

    @property
    def product_count(self):
        """Number of products attached to this category."""
        return len(self.products)
104
+
105
+
106
class Feature:
    """A named feature belonging to a category and shared by any number of products."""

    # Registry of features keyed by id.
    all = {}

    def __init__(self, id, name, category):
        self.products = []  # products that have this feature; starts empty
        self.category = category
        self.name = name
        self.id = id

    def __repr__(self):
        # The bare name, so str(feature) can be joined straight into display text.
        return self.name

    @property
    def product_count(self):
        """Number of products carrying this feature."""
        return len(self.products)
121
+
122
class Product:
    """A product with its price, category, features and reviews."""

    # Registry of products keyed by id.
    all = {}

    def __init__(self, id, name, description, price, category):
        self.id, self.name, self.description = id, name, description
        # Prices are stored rounded to two decimal places.
        self.price = round(price, 2)
        self.category = category
        self.features, self.reviews = [], []

    @property
    def feature_count(self):
        """Number of features attached to this product."""
        return len(self.features)

    @property
    def review_count(self):
        """Number of reviews attached to this product."""
        return len(self.reviews)

    @property
    def average_rating(self, decimals=2):
        """Mean review rating rounded to *decimals* places; 0.0 when there are no reviews."""
        count = self.review_count
        if count == 0:
            return 0.0
        total = sum(review.rating for review in self.reviews)
        return float(round(total / count, decimals))

    @staticmethod
    def for_ids(ids: List[str]):
        """Return the registered products for *ids*, in the same order.

        Raises KeyError if any id is not in the registry.
        """
        found = []
        for product_id in ids:
            found.append(Product.all[product_id])
        return found
151
+
152
class Review:
    """A single review: a rating and free text attached to one product."""

    # Registry of reviews keyed by id.
    all = {}

    def __init__(self, id, rating, review_text, product):
        self.id, self.rating = id, rating
        self.review_text, self.product = review_text, product
160
+
161
+
162
# Manual smoke test: running this module directly loads the default database
# and prints a marker once loading has returned.
if __name__ == "__main__":
    DataLoader.load_data()
    print('test')