File size: 3,753 Bytes
fb83544
97aefb5
 
 
 
 
 
 
6aad21a
97aefb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb83544
97aefb5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import sqlite3
import zlib

import numpy as np

# Schema for the `sources` table: one row per source, identified by an
# auto-incrementing integer `id`, with a required and unique `name` plus
# optional `display_name` and `note` text columns.
SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL,
    display_name TEXT,
    note TEXT,
    UNIQUE(name)
)"""


# Schema for the `versions` table: one row per (version, source, parser)
# combination, with an optional free-text `note`. `source` refers to
# `sources.id`.
# The table constraints are comma-separated as standard SQL requires; SQLite
# happens to accept a missing comma there, but other tooling may not.
VERSION_TABLE = r"""CREATE TABLE IF NOT EXISTS versions (
    source INTEGER,
    version INTEGER,
    parser TEXT,
    note TEXT,
    PRIMARY KEY (version, source, parser),
    FOREIGN KEY (source) REFERENCES sources (id)
)"""


# Schema for the `chunkings` table: one row per chunking configuration
# (size, overlap, strategy, chunker) recorded for a given source/version,
# identified by the auto-incrementing `chunking` id. The UNIQUE constraint
# covers the full configuration together with source and version.
# NOTE(review): the foreign key targets versions (source, version), but the
# primary key of `versions` also includes `parser`, so that column pair is
# not declared unique on its own. SQLite requires parent key columns to be a
# PRIMARY KEY or UNIQUE set once foreign keys are enforced -- confirm whether
# enforcement (PRAGMA foreign_keys) is intended.
CHUNKING_TABLE = r"""CREATE TABLE IF NOT EXISTS chunkings (
    chunking INTEGER PRIMARY KEY AUTOINCREMENT,
    size INTEGER,
    overlap INTEGER,
    strategy TEXT,
    chunker TEXT,
    source INTEGER,
    version INTEGER,
    UNIQUE (size, overlap, strategy, chunker, source, version),
    FOREIGN KEY (source, version) REFERENCES versions (source, version)
)"""


# Schema for the `sections` table: one row per section, keyed by
# (version, source, section), holding a required title, url and text content
# plus optional `parent` and `type` columns.
# NOTE(review): `parent` presumably holds the `section` number of an
# enclosing section; no constraint declares that -- confirm with the writer
# of this table.
# NOTE(review): `source` and `version` reference `versions` through two
# separate single-column foreign keys, and neither column is unique on its
# own in `versions` -- confirm this is intended if foreign keys are enforced.
SECTION_TABLE = r"""CREATE TABLE IF NOT EXISTS sections (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    title TEXT NOT NULL,
    url TEXT NOT NULL,
    content TEXT NOT NULL,
    parent INTEGER,
    type TEXT,
    PRIMARY KEY (version, source, section),
    FOREIGN KEY (source) REFERENCES versions (source),
    FOREIGN KEY (version) REFERENCES versions (version)
)"""


# Schema for the `chunks` table: one row per chunk, keyed by
# (source, version, section, chunking, sequence), storing the chunk text, an
# optional token count and an optional `embedding`.
# The `embedding` column is declared with type VECTOR; `setup_db` in this
# module registers a converter under the name "vector" for it.
# NOTE(review): the second foreign key references
# chunkings (source, version, chunking), while `chunkings` declares only
# `chunking` as its primary key and a wider UNIQUE set -- confirm this is
# intended if foreign keys are enforced.
CHUNK_TABLE = r"""CREATE TABLE IF NOT EXISTS chunks (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    chunking INTEGER,
    sequence INTEGER,
    content TEXT NOT NULL,
    n_tokens INTEGER,
    embedding VECTOR,
    PRIMARY KEY (source, version, section, chunking, sequence),
    FOREIGN KEY (source, version, section) REFERENCES sections (source, version, section),
    FOREIGN KEY (source, version, chunking) REFERENCES chunkings (source, version, chunking)
)"""


# View `latest_version`: for every source that has at least one row in
# `versions` (inner join), exposes the source name, the source id and the
# highest version number recorded for it.
VERSION_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_version (
    name, source, version) AS
    SELECT sources.name, versions.source, max(versions.version)
    FROM sources INNER JOIN versions on sources.id = versions.source
    GROUP BY sources.id
"""

# View `latest_chunking`: restricts `chunkings` to each source's latest
# version (via `latest_version`) and, per (source, version), exposes the
# highest `chunking` id together with the source name.
CHUNKING_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_chunking (
    name, source, version, chunking) AS
    SELECT name, source, version, max(chunking) FROM
    chunkings INNER JOIN latest_version USING (source, version)
    GROUP by source, version
"""

# View `documents`: chunks joined to their section (for title and url) and
# inner-joined to `latest_chunking`, so only chunks belonging to the latest
# chunking of the latest version of each source are exposed. The `source`
# column of this view is the source *name*, not its numeric id.
DOCUMENT_VIEW = r"""CREATE VIEW IF NOT EXISTS documents (
    source, title, url, content, n_tokens, embedding)
    AS SELECT latest_chunking.name, sections.title, sections.url,
    chunks.content, chunks.n_tokens, chunks.embedding
    FROM chunks INNER JOIN sections USING (source, version, section)
    INNER JOIN latest_chunking USING (source, version, chunking)
"""


# DDL statements executed, in this order, by `initialize_db`. Tables are
# listed before the views that select from them.
INIT_STATEMENTS = [
    SOURCE_TABLE,
    VERSION_TABLE,
    CHUNKING_TABLE,
    SECTION_TABLE,
    CHUNK_TABLE,
    VERSION_VIEW,
    CHUNKING_VIEW,
    DOCUMENT_VIEW,
]


def initialize_db(connection: sqlite3.Connection):
    """Create every table and view listed in ``INIT_STATEMENTS``.

    The statements are executed in list order on *connection*. If any of
    them raises ``sqlite3.Error`` the connection is rolled back and the
    error propagates unchanged; otherwise the work is committed.

    Returns:
        The same *connection* object, to allow call chaining.
    """
    try:
        for ddl in INIT_STATEMENTS:
            connection.execute(ddl)
    except sqlite3.Error:
        connection.rollback()
        raise
    else:
        connection.commit()
    return connection


def adapt_vector(vector: np.ndarray) -> bytes:
    """Serialize *vector* into a zlib-compressed float32 blob for SQLite.

    The array is cast to ``float32`` and dumped with ``tobytes()``, so only
    the raw values are stored -- the original shape and dtype are not.

    Returns:
        The compressed payload wrapped in ``sqlite3.Binary`` (a
        ``memoryview`` on Python 3), ready to be bound as a BLOB parameter.
    """
    raw = vector.astype(np.float32).tobytes()
    packed = zlib.compress(raw)
    return sqlite3.Binary(packed)


def convert_vector(buffer: bytes) -> np.ndarray:
    """Decode a blob produced by ``adapt_vector`` back into an array.

    Args:
        buffer: zlib-compressed bytes holding raw ``float32`` values.

    Returns:
        A one-dimensional ``float32`` array. It is created with
        ``np.frombuffer`` over an immutable ``bytes`` object, so it is
        read-only; copy it before modifying it in place.
    """
    return np.frombuffer(zlib.decompress(buffer), dtype=np.float32)


def cosine_similarity(a: "bytes | None", b: "bytes | None") -> "float | None":
    """Return the cosine similarity of two stored vectors, rescaled to [0, 1].

    Both arguments are blobs in the format written by ``adapt_vector``. The
    raw cosine (range [-1, 1]) is mapped through ``0.5 * cos + 0.5`` so that
    opposite vectors score 0.0, orthogonal ones 0.5 and parallel ones 1.0.

    Args:
        a: compressed float32 vector, or ``None`` for an SQL NULL.
        b: compressed float32 vector, or ``None`` for an SQL NULL.

    Returns:
        The rescaled similarity as a Python ``float``; ``None`` when either
        argument is ``None``; NaN when either vector has zero norm (all
        zeros or empty), since the cosine is undefined there.

    Raises:
        zlib.error: if a blob is not valid zlib data.
        ValueError: if the decoded vectors differ in length.
    """
    # This function is registered as an SQL function and the embedding column
    # is nullable: propagate NULL instead of raising inside the query.
    if a is None or b is None:
        return None

    vec_a = convert_vector(a)
    vec_b = convert_vector(b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # Guard the division explicitly so a degenerate vector yields NaN without
    # numpy emitting a divide-by-zero RuntimeWarning.
    if norm_a == 0 or norm_b == 0:
        return float("nan")

    cosine = np.dot(vec_a / norm_a, vec_b / norm_b)
    return float(0.5 * cosine + 0.5)


def setup_db(connection: sqlite3.Connection) -> None:
    """Wire numpy vector support into sqlite3 and into *connection*.

    Registers ``adapt_vector`` as the adapter for ``np.ndarray`` parameters,
    ``convert_vector`` as the converter for the declared type "vector", and
    exposes ``cosine_similarity`` to SQL as the two-argument function
    ``sim`` on *connection*.

    NOTE(review): per the sqlite3 documentation, converters keyed on a
    declared column type only run on connections opened with
    ``detect_types=sqlite3.PARSE_DECLTYPES`` -- confirm callers do so.
    """
    # The adapter and converter registries are module-level in sqlite3, so
    # these two calls affect every connection in the process.
    sqlite3.register_adapter(np.ndarray, adapt_vector)
    sqlite3.register_converter("vector", convert_vector)
    # The SQL function, by contrast, is registered on this connection only.
    connection.create_function("sim", 2, cosine_similarity, deterministic=True)