tasal9 committed on
Commit
c6b6e51
·
verified ·
1 Parent(s): 7995949

Add indexer.py

Browse files
Files changed (1) hide show
  1. indexer.py +44 -44
indexer.py CHANGED
@@ -1,44 +1,44 @@
1
- """
2
- ZamAI Document Indexer
3
- This script helps add new documents to the embedding vector database.
4
- """
5
- import os
6
- import argparse
7
- from llama_index.readers.file import SimpleDirectoryReader
8
- from setup import setup_embedding_model
9
-
10
- def index_documents(corpus_path, db_path=None):
11
- """
12
- Index documents from the specified corpus path into the vector database.
13
-
14
- Args:
15
- corpus_path: Path to the directory containing documents to index
16
- db_path: Optional custom path for the ChromaDB database
17
- """
18
- if not os.path.exists(corpus_path):
19
- print(f"Error: Directory {corpus_path} does not exist.")
20
- return
21
-
22
- if not os.listdir(corpus_path):
23
- print(f"Error: No files found in {corpus_path}")
24
- return
25
-
26
- # Set up embedding model and components
27
- db_path = db_path or "./models/embeddings/chroma_db"
28
- embedding_components = setup_embedding_model(corpus_path=corpus_path, db_path=db_path)
29
-
30
- print(f"Successfully indexed documents from {corpus_path}")
31
- print(f"Vector database stored at {db_path}")
32
-
33
- # Return the components if needed for further processing
34
- return embedding_components
35
-
36
- if __name__ == "__main__":
37
- parser = argparse.ArgumentParser(description="Index documents for ZamAI embeddings")
38
- parser.add_argument("--corpus", type=str, default="data/text_corpus/",
39
- help="Path to the directory containing documents to index")
40
- parser.add_argument("--db", type=str, default=None,
41
- help="Path to store the ChromaDB database (optional)")
42
-
43
- args = parser.parse_args()
44
- index_documents(args.corpus, args.db)
 
1
+ """
2
+ ZamAI Document Indexer
3
+ This script helps add new documents to the embedding vector database.
4
+ """
5
+ import os
6
+ import argparse
7
+ from llama_index.readers.file import SimpleDirectoryReader
8
+ from setup import setup_embedding_model
9
+
def index_documents(corpus_path, db_path=None):
    """
    Index documents from the specified corpus path into the vector database.

    Args:
        corpus_path: Path to the directory containing documents to index.
        db_path: Optional custom path for the ChromaDB database. Any falsy
            value (None or an empty string) falls back to
            "./models/embeddings/chroma_db".

    Returns:
        The value returned by setup_embedding_model, or None when
        corpus_path is not an existing directory or contains no entries
        (an error message is printed in both of those cases).
    """
    # Check for a directory, not mere existence: a path that names a regular
    # file would pass an existence check and then make os.listdir raise
    # NotADirectoryError instead of reporting the problem.
    if not os.path.isdir(corpus_path):
        print(f"Error: Directory {corpus_path} does not exist.")
        return None

    # os.listdir counts every entry (files and subdirectories alike).
    if not os.listdir(corpus_path):
        print(f"Error: No files found in {corpus_path}")
        return None

    # Set up embedding model and components
    db_path = db_path or "./models/embeddings/chroma_db"
    embedding_components = setup_embedding_model(corpus_path=corpus_path, db_path=db_path)

    print(f"Successfully indexed documents from {corpus_path}")
    print(f"Vector database stored at {db_path}")

    # Return the components if needed for further processing
    return embedding_components
35
+
if __name__ == "__main__":
    # Command-line entry point: read the corpus directory and the optional
    # database location from the arguments, then run the indexer once.
    cli = argparse.ArgumentParser(description="Index documents for ZamAI embeddings")
    cli.add_argument(
        "--corpus",
        type=str,
        default="data/text_corpus/",
        help="Path to the directory containing documents to index",
    )
    cli.add_argument(
        "--db",
        type=str,
        default=None,
        help="Path to store the ChromaDB database (optional)",
    )

    options = cli.parse_args()
    index_documents(corpus_path=options.corpus, db_path=options.db)
+ index_documents(args.corpus, args.db)