llm-arch / src /data_synthesis /data_loader.py
alfraser's picture
Updated naming convention for dataset databases to be clearer.
8a677b0
raw
history blame
4.06 kB
"""
This script contains the second step of data synthesis - loading all of the data which was generated
in the json files into an sqlite rdbms structure for further analysis and usage. This results in the
exhaustive dataset - not a trimmed down dataset for a particular usage.
"""
import os
import sqlite3
from src.common import data_dir
from src.data_synthesis.generate_data import get_categories_and_features, products_for_category
def db_file() -> str:
output_file_name = f"01_all_products_dataset.db"
return os.path.join(data_dir, 'sqlite', output_file_name)
def setup_db_tables() -> None:
"""
Drop all the tables in the database and then re-build the structure empty
"""
con = sqlite3.connect(db_file())
tables = ['reviews', 'product_features', 'features', 'products', 'categories']
for t in tables:
con.execute(f'DROP TABLE IF EXISTS {t}')
# Create categories table
sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);"
con.execute(sql)
# Create features
sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create products
sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create product / feature link
sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))"
con.execute(sql)
# Create reviews
sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))"
con.execute(sql)
def insert_data() -> None:
"""
Insert the data from the json data files into the data structure
"""
con = sqlite3.connect(db_file())
cur = con.cursor()
cats_and_features = get_categories_and_features()
for cat, features in cats_and_features.items():
sql = f"INSERT INTO categories('name') VALUES ('{cat}')"
cat_id = con.execute(sql).lastrowid
con.commit()
sql = f"INSERT INTO features('name', 'category_id') VALUES "
values = [f"('{f}', {cat_id})" for f in features]
sql += ', '.join(values)
con.execute(sql)
con.commit()
for prod in products_for_category(cat):
sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)"
cur.execute(sql, (prod.name, prod.description, prod.price, cat_id))
prod_id = cur.lastrowid
con.commit()
for feat in prod.features:
sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}"
cur.execute(sql)
rows = cur.fetchall()
if len(rows) == 0:
print(f"Feature {feat} not found in category {cat} but used for {prod.name}")
sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)"
cur.execute(sql, (feat, cat_id))
feat_id = cur.lastrowid
else:
feat_id = rows[0][0]
sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})"
con.execute(sql)
for review in prod.reviews:
sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)"
cur.execute(sql, (prod_id, review.stars, review.review_text))
con.commit()
if __name__ == "__main__":
setup_db_tables()
insert_data()