AlexanderKazakov committed
Commit eeafaaa
1 Parent(s): d7fdb42

small improvement for chunking; openai embeddings

README initial.md CHANGED
```diff
@@ -13,7 +13,7 @@ Deliberately stripped down to leave some room for experimenting
 - TODOs:
 - Experiment with chunking, see how it affects the results. When deciding how to chunk it helps to think about what kind of chunks you'd like to see as context to your queries.
 - Deliverables: Demonstrate how retrieved documents differ with different chunking strategies and how it affects the output.
-- Try out different embedding models (EMB_MODEL_NAME). Good models to start with are **sentence-transformers/all-MiniLM-L6-v2** - lightweight, **thenlper/gte-large** - relatively heavy but more powerful.
+- Try out different embedding models (EMBED_NAME). Good models to start with are **sentence-transformers/all-MiniLM-L6-v2** - lightweight, **thenlper/gte-large** - relatively heavy but more powerful.
 - Deliverables: Demonstrate how retrieved documents differ with different embedding models and how they affect the output. Provide an estimate of how the time to embed the chunks and DB ingestion time differs (happening in **prep_scrips/lancedb_setup.py**).
 - Add a re-ranker (cross-encoder) to the pipeline. Start with sentence-transformers pages on cross-encoders [1](https://www.sbert.net/examples/applications/cross-encoder/README.html) [2](https://www.sbert.net/examples/applications/retrieve_rerank/README.html), then pick a [pretrained cross-encoder](https://www.sbert.net/docs/pretrained-models/ce-msmarco.html), e.g. **cross-encoder/ms-marco-MiniLM-L-12-v2**. Don't forget to increase the number of *retrieved* documents when using re-ranker. The number of documents used as context should stay the same.
 - Deliverables: Demonstrate how retrieved documents differ after adding a re-ranker and how it affects the output. Provide an estimate of how latency changes.
```
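The re-ranker TODO above is easy to prototype against this pipeline. A minimal sketch, assuming the cross-encoder named in the TODO and plain chunk strings for `documents`; over-retrieve candidates, then keep only as many documents as before for the context:

```python
# Sketch of the re-ranker TODO: over-retrieve, re-score with a cross-encoder,
# keep the same number of documents as context.
from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

def rerank(query, documents, top_k):
    # Score each (query, document) pair; a higher score means more relevant
    scores = reranker.predict([(query, doc) for doc in documents])
    ranked = sorted(zip(scores, documents), key=lambda p: p[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]
```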
gradio_app/app.py CHANGED
```diff
@@ -9,6 +9,7 @@ import logging
 from time import perf_counter
 
 import gradio as gr
+import markdown
 from jinja2 import Environment, FileSystemLoader
 
 from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
@@ -31,10 +32,10 @@ context_html_template = env.get_template('context_html_template.j2')
 # Examples
 examples = [
     'What is BERT?',
-    'Tell me about BERT deep learning model',
+    'Tell me about GPT',
+    'How to use accelerate in google colab?',
     'What is the capital of China?',
     'Why is the sky blue?',
-    'Who won the mens world cup in 2014?',
 ]
 
 
@@ -58,7 +59,7 @@ def bot(history, api_kind):
     # Retrieve documents relevant to query
    document_start = perf_counter()
 
-    query_vec = embedder.encode(query)
+    query_vec = embedder.embed(query)[0]
     documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME).limit(top_k_rank).to_list()
     thresh_dist = max(thresh_dist, min(d['_distance'] for d in documents))
     documents = [d for d in documents if d['_distance'] <= thresh_dist]
@@ -69,10 +70,11 @@ def bot(history, api_kind):
 
     while len(documents) != 0:
         context = context_template.render(documents=documents)
-        context_html = context_html_template.render(documents=documents)
+        documents_html = [markdown.markdown(d) for d in documents]
+        context_html = context_html_template.render(documents=documents_html)
         messages = construct_openai_messages(context, history)
-        num_tokens = num_tokens_from_messages(messages, OPENAI_LLM_NAME)
-        if num_tokens + 512 < context_lengths[OPENAI_LLM_NAME]:
+        num_tokens = num_tokens_from_messages(messages, LLM_NAME)
+        if num_tokens + 512 < context_lengths[LLM_NAME]:
             break
         documents.pop()
     else:
```
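The reworked loop in `bot` drops the last retrieved document until the prompt plus a 512-token answer budget fits `context_lengths[LLM_NAME]`. For intuition, a standalone sketch of the same check with tiktoken (assumed installed; per-message token overhead is ignored here, unlike the cookbook-style `num_tokens_from_messages`):

```python
# Standalone sketch of the token-budget check used in bot().
import tiktoken

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')

def fits_context(messages, context_length=4096, reserve=512):
    # Count content tokens only; reserve room for the streamed answer
    num_tokens = sum(len(enc.encode(m['content'])) for m in messages)
    return num_tokens + reserve < context_length
```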
gradio_app/backend/embedders.py ADDED
```diff
@@ -0,0 +1,43 @@
+import torch
+import openai
+from sentence_transformers import SentenceTransformer
+from abc import ABC, abstractmethod
+
+
+class Embedder(ABC):
+    @abstractmethod
+    def embed(self, texts):
+        pass
+
+
+class HfEmbedder(Embedder):
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+        self.model.eval()
+
+    @torch.no_grad()
+    def embed(self, texts):
+        encoded = self.model.encode(texts, normalize_embeddings=True)
+        return [list(vec) for vec in encoded]
+
+
+class OpenAIEmbedder(Embedder):
+    def __init__(self, model_name):
+        self.model_name = model_name
+
+    def embed(self, texts):
+        responses = openai.Embedding.create(input=texts, engine=self.model_name)
+        return [response['embedding'] for response in responses['data']]
+
+
+class EmbedderFactory:
+    @staticmethod
+    def get_embedder(type):
+        if type == "sentence-transformers/all-MiniLM-L6-v2":
+            return HfEmbedder(type)
+        elif type == "text-embedding-ada-002":
+            return OpenAIEmbedder(type)
+        else:
+            raise ValueError(f"Unsupported embedder type: {type}")
+
+
```
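A usage sketch for the new abstraction (the OpenAI branch additionally needs `openai.api_key` set, as prep_scripts/lancedb_setup.py does):

```python
# Usage sketch: both backends return plain lists of floats, one per input text.
from gradio_app.backend.embedders import EmbedderFactory

embedder = EmbedderFactory.get_embedder("sentence-transformers/all-MiniLM-L6-v2")
vectors = embedder.embed(["What is BERT?", "Why is the sky blue?"])
print(len(vectors), len(vectors[0]))  # 2 vectors, 384 dims for all-MiniLM-L6-v2
```

Note that `embed` expects a list: gradio_app/app.py calls `embedder.embed(query)[0]` with a bare string, which happens to work for the OpenAI backend (the Embedding API accepts a single string) but would break `HfEmbedder`, where `encode` on a string returns a single vector and the list comprehension then iterates over scalars.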
gradio_app/backend/query_llm.py CHANGED
```diff
@@ -2,19 +2,17 @@ import gradio as gr
 
 from typing import Any, Dict, Generator, List
 
-from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer
+# from huggingface_hub import InferenceClient
+# from transformers import AutoTokenizer
 from jinja2 import Environment, FileSystemLoader
 
 from settings import *
 from gradio_app.backend.ChatGptInteractor import *
 
 
-tokenizer = AutoTokenizer.from_pretrained(HF_LLM_NAME)
-
-HF_TOKEN = None
-
-hf_client = InferenceClient(HF_LLM_NAME, token=HF_TOKEN)
+# tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
+# HF_TOKEN = None
+# hf_client = InferenceClient(LLM_NAME, token=HF_TOKEN)
 
 
 def format_prompt(message: str, api_kind: str):
@@ -125,7 +123,7 @@ def construct_openai_messages(context, history):
 
 
 def generate_openai(messages):
-    cgi = ChatGptInteractor(model_name=OPENAI_LLM_NAME)
+    cgi = ChatGptInteractor(model_name=LLM_NAME)
     for part in cgi.chat_completion(messages, max_tokens=512, temperature=0, stream=True):
         yield cgi.get_stream_text(part)
 
@@ -162,7 +160,7 @@ def _generate_openai(prompt: str, history: str, temperature: float = 0.9, max_ne
 
     try:
         stream = openai.ChatCompletion.create(
-            model=OPENAI_LLM_NAME,
+            model=LLM_NAME,
            messages=formatted_prompt,
             **generate_kwargs,
             stream=True
```
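For reference, the `stream=True` calls above follow the pre-1.0 openai-python SDK: each streamed chunk carries an incremental delta, and `content` is absent from the role and finish chunks, which is presumably what `ChatGptInteractor.get_stream_text` handles. A minimal consumption sketch:

```python
# Sketch: consuming a streamed ChatCompletion with the pre-1.0 openai SDK.
import openai

stream = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is BERT?"}],
    max_tokens=512,
    temperature=0,
    stream=True,
)
for part in stream:
    # 'content' is missing in the first (role) and last (finish) chunks
    print(part["choices"][0]["delta"].get("content", ""), end="")
```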
gradio_app/backend/semantic_search.py CHANGED
```diff
@@ -1,14 +1,14 @@
 import logging
 import lancedb
-from sentence_transformers import SentenceTransformer
 
+from gradio_app.backend.embedders import EmbedderFactory
 from settings import *
 
 
 # Setting up the logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-embedder = SentenceTransformer(EMB_MODEL_NAME)
+embedder = EmbedderFactory.get_embedder(EMBED_NAME)
 
 db = lancedb.connect(LANCEDB_DIRECTORY)
 table = db.open_table(LANCEDB_TABLE_NAME)
```
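A quick smoke test of the rewired retrieval path, as a sketch, assuming the table was built by prep_scripts/lancedb_setup.py with the same EMBED_NAME:

```python
# Smoke-test sketch: embed one query, fetch its nearest chunks from LanceDB.
from gradio_app.backend.semantic_search import embedder, table
from settings import VECTOR_COLUMN_NAME, TEXT_COLUMN_NAME

query_vec = embedder.embed(["What is BERT?"])[0]
hits = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME).limit(3).to_list()
for hit in hits:
    print(round(hit['_distance'], 3), hit[TEXT_COLUMN_NAME][:80])
```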
gradio_app/templates/context_html_template.j2 CHANGED
```diff
@@ -11,85 +11,25 @@
         font-family: "Source Sans Pro";
     }
 
-    .instructions > * {
-        color: #111 !important;
-    }
-
-    details.doc-box * {
-        color: #111 !important;
-    }
-
-    .dark {
-        background: #111;
-        color: white;
-    }
-
     .doc-box {
         padding: 10px;
         margin-top: 10px;
-        background-color: #baecc2;
+        background-color: #374151;
         border-radius: 6px;
         color: #111 !important;
-        max-width: 700px;
-        box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
-    }
-
-    .doc-full {
-        margin: 10px 14px;
-        line-height: 1.6rem;
-    }
-
-    .instructions {
-        color: #111 !important;
-        background: #b7bdfd;
-        display: block;
-        border-radius: 6px;
-        padding: 6px 10px;
-        line-height: 1.6rem;
-        max-width: 700px;
-        box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
-    }
-
-    .query {
-        color: #111 !important;
-        background: #ffbcbc;
-        display: block;
-        border-radius: 6px;
-        padding: 6px 10px;
-        line-height: 1.6rem;
-        max-width: 700px;
         box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
     }
     </style>
 </head>
 <body>
-    <div class="prose svelte-1ybaih5" id="context_html">
+
     <h2>Context:</h2>
     {% for doc in documents %}
-    <details class="doc-box">
-        <summary>
-            <b>Doc {{ loop.index }}:</b> <span class="doc-short">{{ doc[:1000] }}...</span>
-        </summary>
-        <div class="doc-full">{{ doc }}</div>
-    </details>
+    <div class="doc-box">
+        {{ doc }}
+    </div>
     {% endfor %}
-    </div>
 
-    <script>
-        document.addEventListener("DOMContentLoaded", function() {
-            const detailsElements = document.querySelectorAll('.doc-box');
 
-            detailsElements.forEach(detail => {
-                detail.addEventListener('toggle', function() {
-                    const docShort = this.querySelector('.doc-short');
-                    if (this.open) {
-                        docShort.style.display = 'none';
-                    } else {
-                        docShort.style.display = 'inline';
-                    }
-                });
-            });
-        });
-    </script>
 </body>
 </html>
```
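The simplified template now expects `documents` to be pre-rendered HTML strings (app.py runs each chunk through `markdown.markdown` first). A render check outside the app, as a sketch mirroring how app.py builds its Jinja2 Environment; note that without `autoescape` the HTML passes through unescaped, which is what the template relies on:

```python
# Sketch: render the simplified context template with pre-converted HTML chunks.
import markdown
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('gradio_app/templates'))
template = env.get_template('context_html_template.j2')

docs_html = [markdown.markdown(chunk) for chunk in ['# Doc one\n\nsome text', 'a plain chunk']]
print(template.render(documents=docs_html)[:300])
```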
prep_scripts/lancedb_setup.py CHANGED
```diff
@@ -1,36 +1,29 @@
 import shutil
-import traceback
 
 import lancedb
-import torch
+import openai
 import pyarrow as pa
 import pandas as pd
 from pathlib import Path
 import tqdm
 import numpy as np
 
-from sentence_transformers import SentenceTransformer
-
+from gradio_app.backend.embedders import EmbedderFactory
 from markdown_to_text import *
 from settings import *
 
 
+with open('data/openaikey.txt') as f:
+    OPENAI_KEY = f.read().strip()
+openai.api_key = OPENAI_KEY
+
+
 shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
 db = lancedb.connect(LANCEDB_DIRECTORY)
 batch_size = 32
 
-model = SentenceTransformer(EMB_MODEL_NAME)
-model.eval()
-
-if torch.backends.mps.is_available():
-    device = "mps"
-elif torch.cuda.is_available():
-    device = "cuda"
-else:
-    device = "cpu"
-
 schema = pa.schema([
-    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMB_MODEL_NAME])),
+    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMBED_NAME])),
     pa.field(TEXT_COLUMN_NAME, pa.string()),
     pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
 ])
@@ -49,17 +42,18 @@ for file in files:
        print(f'Skipped {file_ext} extension: {file}')
         continue
 
-    doc_header = ' / '.join(split_path(file_path)) + ':\n\n'
     with open(file, encoding='utf-8') as f:
         f = f.read()
         f = remove_comments(f)
         f = split_markdown(f)
-        chunks.extend((doc_header + chunk, os.path.abspath(file)) for chunk in f)
+        chunks.extend((chunk, os.path.abspath(file)) for chunk in f)
 
 from matplotlib import pyplot as plt
 plt.hist([len(c) for c, d in chunks], bins=100)
 plt.show()
 
+embedder = EmbedderFactory.get_embedder(EMBED_NAME)
+
 for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
     texts, doc_paths = [], []
     for text, doc_path in chunks[i * batch_size:(i + 1) * batch_size]:
@@ -67,9 +61,7 @@ for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
         texts.append(text)
         doc_paths.append(doc_path)
 
-    encoded = model.encode(texts, normalize_embeddings=True, device=device)
-    encoded = [list(vec) for vec in encoded]
-
+    encoded = embedder.embed(texts)
     df = pd.DataFrame({
         VECTOR_COLUMN_NAME: encoded,
         TEXT_COLUMN_NAME: texts,
@@ -79,10 +71,4 @@ for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
     tbl.add(df)
 
 
-# '''
-# create ivf-pd index https://lancedb.github.io/lancedb/ann_indexes/
-# with the size of the transformer docs, index is not really needed
-# but we'll do it for demonstration purposes
-# '''
-# tbl.create_index(num_partitions=256, num_sub_vectors=96, vector_column_name=VECTOR_COLUMN_NAME)
 
```
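The commented-out index creation was dropped along with its comment block. Should the corpus outgrow brute-force search, it can be restored as in the removed lines (per the LanceDB ANN docs they linked; the partition and sub-vector counts are tuning knobs, and `num_sub_vectors` should divide the embedding dimension, e.g. 96 divides ada-002's 1536):

```python
# Optional IVF-PQ index, as in the comment removed by this commit:
# https://lancedb.github.io/lancedb/ann_indexes/
tbl.create_index(
    num_partitions=256,      # IVF clusters to split the vectors into
    num_sub_vectors=96,      # PQ sub-vectors; should divide the embedding dim
    vector_column_name=VECTOR_COLUMN_NAME,
)
```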
prep_scripts/markdown_to_text.py CHANGED
```diff
@@ -21,33 +21,26 @@ def remove_comments(md):
     return re.sub(r'<!--((.|\n)*)-->', '', md)
 
 
-header_pattern = re.compile(r'\n\s*\n(#{1,3})\s.*\n\s*\n')
+header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')
 
 
 def split_content(content):
+    text_chunk_size = context_lengths[EMBED_NAME] - 32
     _parts = content.split('\n\n')
     parts = []
     for p in _parts:
-        if len(p) < 2 * TEXT_CHUNK_SIZE:
+        if len(p) < text_chunk_size:
             parts.append(p)
         else:
             parts.extend(p.split('\n'))
 
     res = ['']
     for p in parts:
-        if len(res[-1]) + len(p) < TEXT_CHUNK_SIZE:
+        if len(res[-1]) + len(p) < text_chunk_size:
             res[-1] += p + '\n\n'
         else:
             res.append(p + '\n\n')
 
-    if (
-        len(res) >= 2 and
-        len(res[-1]) < TEXT_CHUNK_SIZE / 4 and
-        len(res[-2]) < TEXT_CHUNK_SIZE
-    ):
-        res[-2] += res[-1]
-        res.pop()
-
     return res
 
 
@@ -65,20 +58,30 @@ def split_markdown(md):
         chunk = ''
         for i in sorted(name_hierarchy):
            if len(name_hierarchy[i]) != 0:
-                chunk += name_hierarchy[i] + '\n\n'
+                j = i + 1
+                while j in name_hierarchy:
+                    if name_hierarchy[j].find(name_hierarchy[i]) != -1:
+                        break
+                    j += 1
+                else:
+                    chunk += f'{"#" * (i + 1)}{name_hierarchy[i]}\n\n'
 
         chunk += content
         chunk = chunk.strip()
         res.append(chunk)
 
-    md = f'\n\n{md}'  # to find a header at the top of a file
+    # to find a header at the top of a file
+    md = f'\n\n{md}'
     headers = list(header_pattern.finditer(md))
+    # only first header can be first-level
+    headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1]
+
     name_hierarchy = {i: '' for i in (1, 2, 3)}
     res = []
     for i in range(len(headers)):
         header = headers[i]
         level = len(header.group(1))
-        name = header.group().strip()
+        name = header.group(2).strip()
         name_hierarchy[level] = name
         if i == 0 and header.start() != 0:
             construct_chunks(md[:header.start()])
```
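Net effect of the chunking changes: the header regex now captures the title text separately (group 2), only the first matched header may be level-1, a parent header is skipped in the chunk prefix when a child header already repeats it, and the chunk size now follows the embedder's context length (note that `context_lengths` values are tokens while `split_content` counts characters, so the budget is conservative). A sketch to inspect the result, assuming it is run with prep_scripts and the repo root on sys.path so that `settings` resolves:

```python
# Sketch: inspect how split_markdown chunks a small document after this change.
from markdown_to_text import split_markdown

md = '''# Transformers

## BERT

BERT is a bidirectional encoder.

## GPT

GPT is an autoregressive decoder.
'''
for n, chunk in enumerate(split_markdown(md)):
    print(n, repr(chunk[:60]))
```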
settings.py CHANGED
```diff
@@ -1,22 +1,26 @@
 MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/"
-EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 LANCEDB_DIRECTORY = "data/lancedb"
 LANCEDB_TABLE_NAME = "table"
 VECTOR_COLUMN_NAME = "embedding"
 TEXT_COLUMN_NAME = "text"
 DOCUMENT_PATH_COLUMN_NAME = "document_path"
-HF_LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
-OPENAI_LLM_NAME = "gpt-3.5-turbo"
 
-""" in symbols, approximate, without headers """
-TEXT_CHUNK_SIZE = 1000
+# LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+LLM_NAME = "gpt-3.5-turbo"
+# EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+EMBED_NAME = "text-embedding-ada-002"
+
 
 emb_sizes = {
     "sentence-transformers/all-MiniLM-L6-v2": 384,
-    "thenlper/gte-large": 0
+    "thenlper/gte-large": 1024,
+    "text-embedding-ada-002": 1536,
 }
 
 context_lengths = {
     "mistralai/Mistral-7B-Instruct-v0.1": 4096,
     "gpt-3.5-turbo": 4096,
+    "sentence-transformers/all-MiniLM-L6-v2": 128,
+    "thenlper/gte-large": 512,
+    "text-embedding-ada-002": 8191,
 }
```
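EMBED_NAME now has to appear in both tables: `emb_sizes` fixes the LanceDB schema width and `context_lengths` drives the chunk budget in prep_scripts/markdown_to_text.py. A cheap guard, as a sketch, catches a typo before the DB is rebuilt:

```python
# Sketch: fail fast if the selected models are missing from the lookup tables.
from settings import EMBED_NAME, LLM_NAME, emb_sizes, context_lengths

assert EMBED_NAME in emb_sizes, f'no embedding size for {EMBED_NAME}'
assert EMBED_NAME in context_lengths, f'no context length for {EMBED_NAME}'
assert LLM_NAME in context_lengths, f'no context length for {LLM_NAME}'
```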