captain-awesome
committed on
Commit • d9b4100
Parent(s): 96d88aa
Update app.py
app.py CHANGED
@@ -92,4 +92,33 @@ def load_model():
         # max_new_tokens=max_new_tokens, # type: ignore
         # temperature=temperature, # type: ignore
     )
-    return llm
+    return llm
+
+def create_vector_database(loaded_documents):
+    # DB_DIR: str = os.path.join(ABS_PATH, "db")
+    """
+    Creates a vector database using document loaders and embeddings.
+    This function loads data from PDF, markdown and text files in the 'data/' directory,
+    splits the loaded documents into chunks, transforms them into embeddings using HuggingFace,
+    and finally persists the embeddings into a Chroma vector database.
+    """
+    # Split loaded documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30, length_function=len)
+    chunked_documents = text_splitter.split_documents(loaded_documents)
+
+    embeddings = HuggingFaceBgeEmbeddings(
+        model_name="BAAI/bge-large-en"
+    )
+
+    persist_directory = 'db'
+    # Create and persist a Chroma vector database from the chunked documents
+    db = Chroma.from_documents(
+        documents=chunked_documents,
+        embedding=embeddings,
+        persist_directory=persist_directory
+        # persist_directory=DB_DIR,
+    )
+    db.persist()
+    # db = Chroma(persist_directory=persist_directory,
+    #             embedding_function=embedding)
+    return db
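
For context, a minimal usage sketch of the new create_vector_database helper (not part of this commit). It assumes documents are loaded with LangChain loaders before the call; the import path, loader classes, glob pattern, and query string below are illustrative assumptions rather than code taken from app.py.

# Hypothetical usage sketch, not from the commit. The import path assumes the
# langchain-community package split; the app's own imports may differ.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Assumption: markdown/text files live under data/, as the docstring suggests.
loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=TextLoader)
loaded_documents = loader.load()

# Chunk the documents, embed them with BAAI/bge-large-en, and persist a Chroma
# store under the 'db' directory, as defined in create_vector_database above.
db = create_vector_database(loaded_documents)

# The persisted store can then be queried for chunks similar to a question.
for doc in db.similarity_search("What does this app do?", k=3):
    print(doc.page_content[:200])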