import chromadb
import os
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import pickle
import nest_asyncio
# Directory that holds the persistent Chroma database.
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

# SECURITY: API keys do not belong in source control. Each key is read from
# the environment first; the literal below is only a backward-compatible
# fallback and should be rotated and then removed from this file.
llamaparse_api_key = os.environ.get(
    "LLAMA_CLOUD_API_KEY",
    "llx-qXMliHH4UOphFaahO8HEqR5wOj1U6T7oxqC4DoLiik7UvKkJ",
)
groq_api_key = os.environ.get(
    "GROQ_API_KEY",
    "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",
)

# Pickle cache for parser output; read and written by load_or_parse_data().
parsed_data_file = r"parsed_data.pkl"
# NOTE(review): output_md and loki are not used in the visible part of the
# file; presumably an output markdown path and an input data directory.
output_md = r"output.md"
loki = r"data"
def load_or_parse_data(loc):
    """Return the parsed documents for *loc*, using an on-disk pickle cache.

    If the cache file named by ``parsed_data_file`` exists, its unpickled
    contents are returned and the parser is not called. Otherwise *loc* is
    parsed with LlamaParse (markdown output), the result is pickled to the
    cache file, and the result is returned.

    NOTE(review): the cache is a single fixed file and is not keyed by *loc*,
    so a cache written for one document is returned for every later call,
    whatever *loc* is. Delete the cache file to force a re-parse.

    Args:
        loc: Passed unchanged to ``LlamaParse.load_data``; presumably a file
            path (or list of paths) - confirm against callers.

    Returns:
        The value returned by ``LlamaParse.load_data``, either freshly
        computed or as restored from the cache.
    """
    data_file = parsed_data_file

    # Cache hit: return straight away so the (slow, billed) parser call and
    # the cache rewrite below are skipped.
    if os.path.exists(data_file):
        # SECURITY: unpickling can execute arbitrary code. Only load cache
        # files that this program wrote itself.
        with open(data_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: parse the document and remember the result for next time.
    parsing_instruction = (
        "The provided document is an user guide or a manual.\n"
        "It contains many images and tables.\n"
        "Try to be precise while answering the questions"
    )
    parser = LlamaParse(
        api_key=llamaparse_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
    )
    llama_parse_documents = parser.load_data(loc)

    with open(data_file, "wb") as f:
        pickle.dump(llama_parse_documents, f)

    return llama_parse_documents
def create_vector_database(loc):
    """Build the Chroma text collection from the PDF at *loc*.

    The PDF is loaded with ``PyMuPDFLoader``, the text of every loaded
    document is split on newlines into overlapping chunks (1000 characters,
    200 overlap), and the chunks are written to the ``"text_collection"``
    collection of the module-level persistent Chroma ``client``. No embedding
    function is configured here, so the collection's own one is used.

    Args:
        loc: Path of the PDF file; passed to ``PyMuPDFLoader`` as
            ``file_path``.

    Returns:
        The ``"text_collection"`` Chroma collection.
    """
    loader = PyMuPDFLoader(file_path=loc)
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")
    print("Vector DB started!")

    # The splitter does not depend on the document, so build it once.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    document_contents = []
    ids = []
    for i, doc in enumerate(docs):
        print(f"Metadata for document {i+1}: {doc.metadata}")
        # 'page_number' is kept for compatibility; PyMuPDFLoader is documented
        # to record the page index under 'page', so fall back to that before
        # giving up on a real page number.
        fallback_page = doc.metadata.get('page', f'unknown_{i+1}')
        page_num = doc.metadata.get('page_number', fallback_page)
        for chunk_idx, chunk in enumerate(text_splitter.split_text(doc.page_content)):
            document_contents.append(chunk)
            ids.append(f"page_{page_num}_chunk_{chunk_idx}")

    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"

    text_collection = client.get_or_create_collection(name="text_collection")
    # Chroma rejects empty batches, so only write when there is something to
    # store. upsert (rather than add) keeps re-runs on the same PDF from
    # colliding with ids that are already in the persistent collection.
    if document_contents:
        text_collection.upsert(documents=document_contents, ids=ids)
    print('Vector DB created successfully!')
    return text_collection