Spaces:
Runtime error
Runtime error
""" | |
This script contains the second step of data synthesis - loading all of the data which was generated | |
in the json files into an sqlite rdbms structure for further analysis and usage. This results in the | |
exhaustive dataset - not a trimmed down dataset for a particular usage. | |
""" | |
import os | |
import sqlite3 | |
from src.common import data_dir | |
from src.data_synthesis.generate_data import get_categories_and_features, products_for_category | |
def db_file() -> str: | |
output_file_name = f"01_all_products_dataset.db" | |
return os.path.join(data_dir, 'sqlite', output_file_name) | |
def setup_db_tables() -> None: | |
""" | |
Drop all the tables in the database and then re-build the structure empty | |
""" | |
con = sqlite3.connect(db_file()) | |
tables = ['reviews', 'product_features', 'features', 'products', 'categories'] | |
for t in tables: | |
con.execute(f'DROP TABLE IF EXISTS {t}') | |
# Create categories table | |
sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);" | |
con.execute(sql) | |
# Create features | |
sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))" | |
con.execute(sql) | |
# Create products | |
sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))" | |
con.execute(sql) | |
# Create product / feature link | |
sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))" | |
con.execute(sql) | |
# Create reviews | |
sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))" | |
con.execute(sql) | |
def insert_data() -> None: | |
""" | |
Insert the data from the json data files into the data structure | |
""" | |
con = sqlite3.connect(db_file()) | |
cur = con.cursor() | |
cats_and_features = get_categories_and_features() | |
for cat, features in cats_and_features.items(): | |
sql = f"INSERT INTO categories('name') VALUES ('{cat}')" | |
cat_id = con.execute(sql).lastrowid | |
con.commit() | |
sql = f"INSERT INTO features('name', 'category_id') VALUES " | |
values = [f"('{f}', {cat_id})" for f in features] | |
sql += ', '.join(values) | |
con.execute(sql) | |
con.commit() | |
for prod in products_for_category(cat): | |
sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)" | |
cur.execute(sql, (prod.name, prod.description, prod.price, cat_id)) | |
prod_id = cur.lastrowid | |
con.commit() | |
for feat in prod.features: | |
sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}" | |
cur.execute(sql) | |
rows = cur.fetchall() | |
if len(rows) == 0: | |
print(f"Feature {feat} not found in category {cat} but used for {prod.name}") | |
sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)" | |
cur.execute(sql, (feat, cat_id)) | |
feat_id = cur.lastrowid | |
else: | |
feat_id = rows[0][0] | |
sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})" | |
con.execute(sql) | |
for review in prod.reviews: | |
sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)" | |
cur.execute(sql, (prod_id, review.stars, review.review_text)) | |
con.commit() | |
if __name__ == "__main__": | |
setup_db_tables() | |
insert_data() | |