LOUIS SANNA committed on
Commit
5bccbe7
1 Parent(s): 15de6ea

feat(chunks): avoid small chunks

Browse files
Files changed (1) hide show
  1. anyqa/build_index.py +3 -0
anyqa/build_index.py CHANGED
@@ -10,6 +10,7 @@ from .config import get_sources
10
  from .embeddings import EMBEDDING_MODEL_NAME
11
  from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
12
 
 
13
 
14
  def load_data():
15
  print("Loading data...")
@@ -39,6 +40,8 @@ def parse_data():
39
  doc_chunks = text_splitter.split_documents(pages)
40
 
41
  for chunk in doc_chunks:
 
 
42
  chunk.metadata["name"] = source["name"]
43
  chunk.metadata["domain"] = source["domain"]
44
  url = source.get("url", None)
 
10
  from .embeddings import EMBEDDING_MODEL_NAME
11
  from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
12
 
13
+ MIN_CHUNK_SIZE = 100
14
 
15
  def load_data():
16
  print("Loading data...")
 
40
  doc_chunks = text_splitter.split_documents(pages)
41
 
42
  for chunk in doc_chunks:
43
+ if len(chunk.page_content) < MIN_CHUNK_SIZE:
44
+ continue
45
  chunk.metadata["name"] = source["name"]
46
  chunk.metadata["domain"] = source["domain"]
47
  url = source.get("url", None)