llm-arch / src /data_synthesis /data_loader.py
alfraser's picture
Updated the dataloader to ignore case on product features. Note have deliberately not harmonised product feature capitalisation or similar features as this "messiness" is representative of real organisational data.
b5d446f
raw
history blame
4.05 kB
"""
This script contains the second step of data synthesis - loading all of the data which was generated
in the json files into an sqlite rdbms structure for further analysis and usage. This results in the
exhaustive dataset - not a trimmed down dataset for a particular usage.
"""
import os
import sqlite3
from src.common import data_dir
from src.data_synthesis.generate_data import get_categories_and_features, products_for_category
def db_file() -> str:
output_file_name = f"products_dataset.db"
return os.path.join(data_dir, 'sqlite', output_file_name)
def setup_db_tables() -> None:
"""
Drop all the tables in the database and then re-build the structure empty
"""
con = sqlite3.connect(db_file())
tables = ['reviews', 'product_features', 'features', 'products', 'categories']
for t in tables:
con.execute(f'DROP TABLE IF EXISTS {t}')
# Create categories table
sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);"
con.execute(sql)
# Create features
sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create products
sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
con.execute(sql)
# Create product / feature link
sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))"
con.execute(sql)
# Create reviews
sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))"
con.execute(sql)
def insert_data() -> None:
"""
Insert the data from the json data files into the data structure
"""
con = sqlite3.connect(db_file())
cur = con.cursor()
cats_and_features = get_categories_and_features()
for cat, features in cats_and_features.items():
sql = f"INSERT INTO categories('name') VALUES ('{cat}')"
cat_id = con.execute(sql).lastrowid
con.commit()
sql = f"INSERT INTO features('name', 'category_id') VALUES "
values = [f"('{f}', {cat_id})" for f in features]
sql += ', '.join(values)
con.execute(sql)
con.commit()
for prod in products_for_category(cat):
sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)"
cur.execute(sql, (prod.name, prod.description, prod.price, cat_id))
prod_id = cur.lastrowid
con.commit()
for feat in prod.features:
sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}"
cur.execute(sql)
rows = cur.fetchall()
if len(rows) == 0:
print(f"Feature {feat} not found in category {cat} but used for {prod.name}")
sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)"
cur.execute(sql, (feat, cat_id))
feat_id = cur.lastrowid
else:
feat_id = rows[0][0]
sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})"
con.execute(sql)
for review in prod.reviews:
sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)"
cur.execute(sql, (prod_id, review.stars, review.review_text))
con.commit()
if __name__ == "__main__":
setup_db_tables()
insert_data()