captain-awesome
committed on
Commit • d9b4100
Parent(s): 96d88aa
Update app.py
app.py CHANGED
@@ -92,4 +92,33 @@ def load_model():
         # max_new_tokens=max_new_tokens, # type: ignore
         # temperature=temperature, # type: ignore
     )
-    return llm
+    return llm
+
+def create_vector_database(loaded_documents):
+    # DB_DIR: str = os.path.join(ABS_PATH, "db")
+    """
+    Creates a vector database using document loaders and embeddings.
+    This function loads data from PDF, markdown and text files in the 'data/' directory,
+    splits the loaded documents into chunks, transforms them into embeddings using HuggingFace,
+    and finally persists the embeddings into a Chroma vector database.
+    """
+    # Split loaded documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30, length_function=len)
+    chunked_documents = text_splitter.split_documents(loaded_documents)
+
+    embeddings = HuggingFaceBgeEmbeddings(
+        model_name="BAAI/bge-large-en"
+    )
+
+    persist_directory = 'db'
+    # Create and persist a Chroma vector database from the chunked documents
+    db = Chroma.from_documents(
+        documents=chunked_documents,
+        embedding=embeddings,
+        persist_directory=persist_directory
+        # persist_directory=DB_DIR,
+    )
+    db.persist()
+    # db = Chroma(persist_directory=persist_directory,
+    #             embedding_function=embedding)
+    return db
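
For context, a minimal usage sketch of the new create_vector_database helper (not part of this commit). It assumes documents are loaded with LangChain loaders before the call; the import path, loader classes, glob pattern, and query string below are illustrative assumptions rather than code taken from app.py.

# Hypothetical usage sketch, not from the commit. The import path assumes the
# langchain-community package split; the app's own imports may differ.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Assumption: markdown/text files live under data/, as the docstring suggests.
loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=TextLoader)
loaded_documents = loader.load()

# Chunk the documents, embed them with BAAI/bge-large-en, and persist a Chroma
# store under the 'db' directory, as defined in create_vector_database above.
db = create_vector_database(loaded_documents)

# The persisted store can then be queried for chunks similar to a question.
for doc in db.similarity_search("What does this app do?", k=3):
    print(doc.page_content[:200])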