Spaces:
Runtime error
Runtime error
import sqlite3
import zlib

import numpy as np
# DDL for the `sources` table: one row per document source, keyed by an
# auto-incrementing `id`, with `name` required and unique.
SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL,
    display_name TEXT,
    note TEXT,
    UNIQUE(name)
)"""
# DDL for the `versions` table: one row per (version, source, parser)
# combination, with `source` referencing `sources.id`.
# The comma after the PRIMARY KEY clause separates the two table constraints;
# SQLite tolerates its absence but standard SQL does not.
VERSION_TABLE = r"""CREATE TABLE IF NOT EXISTS versions (
    source INTEGER,
    version INTEGER,
    parser TEXT,
    note TEXT,
    PRIMARY KEY (version, source, parser),
    FOREIGN KEY (source) REFERENCES sources (id)
)"""
# DDL for the `chunkings` table: one row per distinct chunking configuration
# (size, overlap, strategy, chunker) applied to a given source version.
# `chunking` is the auto-incrementing identifier of the configuration.
# NOTE(review): SQLite only enforces a foreign key whose parent columns are
# covered by a PRIMARY KEY or UNIQUE constraint; confirm that
# versions(source, version) satisfies this before enabling
# `PRAGMA foreign_keys`.
CHUNKING_TABLE = r"""CREATE TABLE IF NOT EXISTS chunkings (
    chunking INTEGER PRIMARY KEY AUTOINCREMENT,
    size INTEGER,
    overlap INTEGER,
    strategy TEXT,
    chunker TEXT,
    source INTEGER,
    version INTEGER,
    UNIQUE (size, overlap, strategy, chunker, source, version),
    FOREIGN KEY (source, version) REFERENCES versions (source, version)
)"""
# DDL for the `sections` table: one row per section of a source version,
# identified by (version, source, section). `title`, `url` and `content` are
# required; `parent` and `type` are optional.
# NOTE(review): `source` and `version` are declared as two independent
# single-column foreign keys into `versions`; SQLite only enforces a foreign
# key whose parent column(s) are PRIMARY KEY or UNIQUE -- confirm this is the
# intended constraint before enabling `PRAGMA foreign_keys`.
SECTION_TABLE = r"""CREATE TABLE IF NOT EXISTS sections (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    title TEXT NOT NULL,
    url TEXT NOT NULL,
    content TEXT NOT NULL,
    parent INTEGER,
    type TEXT,
    PRIMARY KEY (version, source, section),
    FOREIGN KEY (source) REFERENCES versions (source),
    FOREIGN KEY (version) REFERENCES versions (version)
)"""
# DDL for the `chunks` table: one row per chunk of a section under a given
# chunking, ordered by `sequence`. `embedding` is declared with the custom
# type name VECTOR.
# NOTE(review): SQLite only enforces a foreign key whose parent columns are
# covered by a PRIMARY KEY or UNIQUE constraint; confirm that
# chunkings(source, version, chunking) satisfies this before enabling
# `PRAGMA foreign_keys`.
CHUNK_TABLE = r"""CREATE TABLE IF NOT EXISTS chunks (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    chunking INTEGER,
    sequence INTEGER,
    content TEXT NOT NULL,
    n_tokens INTEGER,
    embedding VECTOR,
    PRIMARY KEY (source, version, section, chunking, sequence),
    FOREIGN KEY (source, version, section) REFERENCES sections (source, version, section),
    FOREIGN KEY (source, version, chunking) REFERENCES chunkings (source, version, chunking)
)"""
# DDL for the `latest_version` view: for every source that has at least one
# row in `versions`, its name, id and highest version number.
VERSION_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_version (
    name, source, version) AS
    SELECT sources.name, versions.source, max(versions.version)
    FROM sources INNER JOIN versions on sources.id = versions.source
    GROUP BY sources.id
"""
# DDL for the `latest_chunking` view: for every row of `latest_version`, the
# highest `chunking` id recorded in `chunkings` for that (source, version).
CHUNKING_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_chunking (
    name, source, version, chunking) AS
    SELECT name, source, version, max(chunking) FROM
    chunkings INNER JOIN latest_version USING (source, version)
    GROUP by source, version
"""
# DDL for the `documents` view: every chunk that belongs to the latest
# chunking of the latest version of its source, joined with the title and URL
# of its section. The view's `source` column carries the source *name*.
DOCUMENT_VIEW = r"""CREATE VIEW IF NOT EXISTS documents (
    source, title, url, content, n_tokens, embedding)
    AS SELECT latest_chunking.name, sections.title, sections.url,
    chunks.content, chunks.n_tokens, chunks.embedding
    FROM chunks INNER JOIN sections USING (source, version, section)
    INNER JOIN latest_chunking USING (source, version, chunking)
"""
# Schema statements in execution order: the five tables first, then the
# views that select from them.
INIT_STATEMENTS = [
    SOURCE_TABLE,
    VERSION_TABLE,
    CHUNKING_TABLE,
    SECTION_TABLE,
    CHUNK_TABLE,
    VERSION_VIEW,
    CHUNKING_VIEW,
    DOCUMENT_VIEW,
]
def initialize_db(connection: sqlite3.Connection) -> sqlite3.Connection:
    """Create the schema (tables and views) on *connection*.

    Every statement in ``INIT_STATEMENTS`` is executed in order. All of them
    use ``IF NOT EXISTS``, so calling this on an already-initialized database
    is harmless.

    Args:
        connection: An open SQLite connection.

    Returns:
        The same connection, after the changes have been committed.

    Raises:
        sqlite3.Error: If any statement fails. The connection is rolled back
            before the error is re-raised.
    """
    try:
        for statement in INIT_STATEMENTS:
            connection.execute(statement)
    except sqlite3.Error:
        # Undo whatever is still pending before surfacing the failure.
        connection.rollback()
        raise
    connection.commit()
    return connection
def adapt_vector(vector: np.ndarray) -> memoryview:
    """Serialize *vector* into a compressed blob suitable for SQLite.

    The values are cast to float32, dumped as raw bytes and zlib-compressed.
    Only the values survive: the original dtype and shape are not stored, so
    the reverse conversion always yields a flat float32 array.

    Args:
        vector: The array to store; any array-like of numbers is accepted.

    Returns:
        The compressed bytes wrapped in ``sqlite3.Binary`` (a ``memoryview``).
    """
    raw = np.asarray(vector, dtype=np.float32).tobytes()
    return sqlite3.Binary(zlib.compress(raw))
def convert_vector(buffer: bytes) -> np.ndarray:
    """Deserialize a compressed blob back into a float32 vector.

    Args:
        buffer: zlib-compressed raw float32 data.

    Returns:
        A one-dimensional float32 array viewing the decompressed bytes.

    Raises:
        zlib.error: If *buffer* is not valid zlib data.
    """
    raw = zlib.decompress(buffer)
    return np.frombuffer(raw, dtype=np.float32)
def cosine_similarity(a: bytes, b: bytes) -> float:
    """Return the cosine similarity of two stored vectors, rescaled to [0, 1].

    Both arguments are compressed blobs that are decoded with
    ``convert_vector``. The raw cosine in [-1, 1] is mapped linearly so that
    opposite vectors give 0.0, orthogonal vectors 0.5 and parallel vectors
    1.0 (up to floating-point rounding).

    Args:
        a: Compressed blob of the first vector.
        b: Compressed blob of the second vector.

    Returns:
        The rescaled similarity, or NaN when either vector has zero norm
        (all zeros or empty), because the cosine is undefined there.
    """
    left = convert_vector(a)
    right = convert_vector(b)
    left_norm = np.linalg.norm(left)
    right_norm = np.linalg.norm(right)
    if left_norm == 0 or right_norm == 0:
        # Make the undefined case explicit instead of dividing by zero.
        return float("nan")
    cosine = np.dot(left / left_norm, right / right_norm)
    return float(0.5 * cosine + 0.5)
def setup_db(connection: sqlite3.Connection):
    """Install the vector adapter/converter and the ``sim`` SQL function.

    The adapter and converter are registered on the ``sqlite3`` module, so
    they apply process-wide; the ``sim`` function is registered on
    *connection* only.

    NOTE(review): converters are only applied by connections opened with
    ``detect_types=sqlite3.PARSE_DECLTYPES`` (or ``PARSE_COLNAMES``) --
    confirm the caller opens *connection* that way.

    Args:
        connection: The connection on which to register ``sim(a, b)``.
    """
    # numpy arrays bound as parameters are stored as compressed blobs ...
    sqlite3.register_adapter(np.ndarray, adapt_vector)
    # ... and columns declared as "vector" are decoded back into arrays.
    sqlite3.register_converter("vector", convert_vector)
    # sim(a, b) operates directly on the stored blobs.
    connection.create_function("sim", 2, cosine_similarity, deterministic=True)