Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
•
5bccbe7
1
Parent(s):
15de6ea
feat(chunks): avoid small chunks
Browse files - anyqa/build_index.py +3 -0
anyqa/build_index.py
CHANGED
@@ -10,6 +10,7 @@ from .config import get_sources
|
|
10 |
from .embeddings import EMBEDDING_MODEL_NAME
|
11 |
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
12 |
|
|
|
13 |
|
14 |
def load_data():
|
15 |
print("Loading data...")
|
@@ -39,6 +40,8 @@ def parse_data():
|
|
39 |
doc_chunks = text_splitter.split_documents(pages)
|
40 |
|
41 |
for chunk in doc_chunks:
|
|
|
|
|
42 |
chunk.metadata["name"] = source["name"]
|
43 |
chunk.metadata["domain"] = source["domain"]
|
44 |
url = source.get("url", None)
|
|
|
10 |
from .embeddings import EMBEDDING_MODEL_NAME
|
11 |
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
12 |
|
13 |
+
MIN_CHUNK_SIZE = 100
|
14 |
|
15 |
def load_data():
|
16 |
print("Loading data...")
|
|
|
40 |
doc_chunks = text_splitter.split_documents(pages)
|
41 |
|
42 |
for chunk in doc_chunks:
|
43 |
+
if len(chunk.page_content) < MIN_CHUNK_SIZE:
|
44 |
+
continue
|
45 |
chunk.metadata["name"] = source["name"]
|
46 |
chunk.metadata["domain"] = source["domain"]
|
47 |
url = source.get("url", None)
|