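# Configuration constants for a retrieval pipeline over a local checkout of the
# Hugging Face Transformers documentation: document source, LanceDB vector
# store schema, chunking policy, embedding model, retrieval depth, and model
# context windows.

# Local directory containing the Markdown source documents.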
MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/"
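
# LanceDB storage location and the column names used in the vector table schema.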
LANCEDB_DIRECTORY = "lancedb"
LANCEDB_TABLE_NAME = "table"
VECTOR_COLUMN_NAME = "embedding"
TEXT_COLUMN_NAME = "text"
DOCUMENT_PATH_COLUMN_NAME = "document_path"

# How source documents are split into chunks: markdown-aware ("md") or plain text ("txt").
CHUNK_POLICY = "md"
# CHUNK_POLICY = "txt"

# Embedding model used to vectorize document chunks and queries.
EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# EMBED_NAME = "text-embedding-ada-002"

TOP_K_RANK = 50  # candidates retrieved from the vector store
TOP_K_RERANK = 5  # candidates kept after reranking

# Embedding vector dimensionality for each supported embedding model.
emb_sizes = {
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "thenlper/gte-large": 1024,
    "text-embedding-ada-002": 1536,
}

# Per-model distance threshold: retrieved chunks farther than this are discarded.
thresh_distances = {
    "sentence-transformers/all-MiniLM-L6-v2": 1.2,
    "text-embedding-ada-002": 0.5,
}

# Context window sizes in tokens, for both generator LLMs and embedding models.
context_lengths = {
    "mistralai/Mistral-7B-Instruct-v0.1": 4096,
    "tiiuae/falcon-180B-chat": 2048,
    "GeneZC/MiniChat-3B": 4096,
    "gpt-3.5-turbo": 4096,
    "gpt-4": 8192,
    "gpt-4-1106-preview": 128000,
    "gpt-3.5-turbo-1106": 16385,
    "sentence-transformers/all-MiniLM-L6-v2": 128,
    "thenlper/gte-large": 512,
    "text-embedding-ada-002": 1000,  # actual context length is 8191, but it's too much
}
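
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original configuration). It shows
# how these constants could drive a LanceDB-backed retrieval step. The helper
# names below are hypothetical, and `query_vector` is assumed to come from the
# EMBED_NAME model.
# ---------------------------------------------------------------------------


def _open_or_create_table(db):
    # Open the vector table, creating it with a fixed-size vector column sized
    # from emb_sizes[EMBED_NAME] if it does not exist yet.
    import pyarrow as pa

    if LANCEDB_TABLE_NAME in db.table_names():
        return db.open_table(LANCEDB_TABLE_NAME)
    schema = pa.schema(
        [
            pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMBED_NAME])),
            pa.field(TEXT_COLUMN_NAME, pa.string()),
            pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
        ]
    )
    return db.create_table(LANCEDB_TABLE_NAME, schema=schema)


def retrieve_chunks(query_vector):
    # Return (document_path, text) pairs for the nearest chunks, filtered by
    # the per-model distance threshold; a downstream reranker would then keep
    # the best TOP_K_RERANK of these.
    import lancedb  # imported lazily so this config module stays import-light

    db = lancedb.connect(LANCEDB_DIRECTORY)
    table = _open_or_create_table(db)
    hits = (
        table.search(query_vector, vector_column_name=VECTOR_COLUMN_NAME)
        .limit(TOP_K_RANK)
        .to_list()
    )
    max_distance = thresh_distances[EMBED_NAME]
    return [
        (hit[DOCUMENT_PATH_COLUMN_NAME], hit[TEXT_COLUMN_NAME])
        for hit in hits
        if hit["_distance"] <= max_distance
    ]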