Spaces:

alfraser
/

llm-arch

Runtime error

llm-arch / src /data_synthesis /data_loader.py

Updated the dataloader to ignore case on product features. Note have deliberately not harmonised product feature capitalisation or similar features as this "messiness" is representative of real organisational data.

b5d446f over 1 year ago

raw

history blame

4.05 kB

	"""
	This script contains the second step of data synthesis - loading all of the data which was generated
	in the json files into an sqlite rdbms structure for further analysis and usage. This results in the
	exhaustive dataset - not a trimmed down dataset for a particular usage.
	"""

	import os
	import sqlite3

	from src.common import data_dir
	from src.data_synthesis.generate_data import get_categories_and_features, products_for_category


	def db_file() -> str:
	output_file_name = f"products_dataset.db"
	return os.path.join(data_dir, 'sqlite', output_file_name)


	def setup_db_tables() -> None:
	"""
	Drop all the tables in the database and then re-build the structure empty
	"""
	con = sqlite3.connect(db_file())
	tables = ['reviews', 'product_features', 'features', 'products', 'categories']
	for t in tables:
	con.execute(f'DROP TABLE IF EXISTS {t}')

	# Create categories table
	sql = "CREATE TABLE categories (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);"
	con.execute(sql)

	# Create features
	sql = "CREATE TABLE features (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
	con.execute(sql)

	# Create products
	sql = "CREATE TABLE products (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, description TEXT NOT NULL, price REAL NOT NULL, category_id INTEGER NOT NULL, FOREIGN KEY (category_id) REFERENCES categories (id))"
	con.execute(sql)

	# Create product / feature link
	sql = "CREATE TABLE product_features (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, feature_id INTEGER NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id), FOREIGN KEY (feature_id) REFERENCES features (id))"
	con.execute(sql)

	# Create reviews
	sql = "CREATE TABLE reviews (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, rating INTEGER NOT NULL, review_text TEXT NOT NULL, FOREIGN KEY (product_id) REFERENCES products (id))"
	con.execute(sql)


	def insert_data() -> None:
	"""
	Insert the data from the json data files into the data structure
	"""
	con = sqlite3.connect(db_file())
	cur = con.cursor()

	cats_and_features = get_categories_and_features()

	for cat, features in cats_and_features.items():
	sql = f"INSERT INTO categories('name') VALUES ('{cat}')"
	cat_id = con.execute(sql).lastrowid
	con.commit()
	sql = f"INSERT INTO features('name', 'category_id') VALUES "
	values = [f"('{f}', {cat_id})" for f in features]
	sql += ', '.join(values)
	con.execute(sql)
	con.commit()

	for prod in products_for_category(cat):
	sql = f"INSERT INTO products('name', 'description', 'price', 'category_id') VALUES (?, ?, ?, ?)"
	cur.execute(sql, (prod.name, prod.description, prod.price, cat_id))
	prod_id = cur.lastrowid
	con.commit()

	for feat in prod.features:
	sql = f"SELECT id from features WHERE lower(name)='{feat.lower()}' AND category_id={cat_id}"
	cur.execute(sql)
	rows = cur.fetchall()
	if len(rows) == 0:
	print(f"Feature {feat} not found in category {cat} but used for {prod.name}")
	sql = f"INSERT INTO features('name', 'category_id') VALUES (?, ?)"
	cur.execute(sql, (feat, cat_id))
	feat_id = cur.lastrowid
	else:
	feat_id = rows[0][0]
	sql = f"INSERT INTO product_features('product_id', 'feature_id') VALUES ({prod_id}, {feat_id})"
	con.execute(sql)

	for review in prod.reviews:
	sql = f"INSERT INTO reviews('product_id', 'rating', 'review_text') VALUES (?, ?, ?)"
	cur.execute(sql, (prod_id, review.stars, review.review_text))
	con.commit()


	if __name__ == "__main__":
	setup_db_tables()
	insert_data()