import os
import pickle

import chromadb
import nest_asyncio
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# LlamaParse runs an asyncio event loop internally; nest_asyncio lets it work
# in environments that already have a running loop (e.g. notebooks).
nest_asyncio.apply()

# Persistent ChromaDB client; collections are stored on disk under this path.
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

# Read API keys from the environment rather than hard-coding them in the script.
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

# Local paths used by the parsing pipeline.
parsed_data_file = r"parsed_data.pkl"
output_md = r"output.md"
loki = r"data"


def load_or_parse_data(loc):
    """Parse the document at `loc` with LlamaParse, caching the result on disk."""
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Reuse the cached parse to avoid re-calling the LlamaParse API.
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        parsingInstructiontest10k = """The provided document is a user guide or a manual.
It contains many images and tables.
Try to be precise while answering the questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsingInstructiontest10k,
        )
        llama_parse_documents = parser.load_data(loc)

        # Cache the parsed documents for subsequent runs.
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        parsed_data = llama_parse_documents

    return parsed_data
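

# Minimal usage sketch (not part of the original pipeline): write the parsed
# markdown to the output_md path defined above. The helper name is hypothetical,
# and it assumes LlamaParse returns Document objects exposing a `.text` attribute
# (the llama_index convention).
def write_parsed_markdown(loc, out_path=output_md):
    documents = load_or_parse_data(loc)
    with open(out_path, "w", encoding="utf-8") as f:
        for document in documents:
            f.write(document.text + "\n")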


def create_vector_database(loc):
    """
    Creates a vector database from a PDF document.

    This function loads the PDF at `loc` with PyMuPDFLoader, splits each page into
    overlapping text chunks, and persists the chunks into a Chroma collection, which
    embeds them with its default embedding function.
    """
    data = loc
    loader = PyMuPDFLoader(file_path=data)
    docs = loader.load()

    print(f"Number of documents: {len(docs)}")
    print("Vector DB started!")

    document_contents = []
    ids = []

    # One splitter is enough for every page.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    for i, doc in enumerate(docs):
        print(f"Metadata for document {i+1}: {doc.metadata}")

        # PyMuPDFLoader stores the page index under the "page" metadata key.
        page_num = doc.metadata.get("page", f"unknown_{i+1}")
        page_content = doc.page_content

        doc_chunks = text_splitter.split_text(page_content)

        for chunk_idx, chunk in enumerate(doc_chunks):
            document_contents.append(chunk)
            # Build a unique id from the page number, document index, and chunk index.
            ids.append(f"page_{page_num}_chunk_{i+1}_{chunk_idx+1}")

    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"

    # Create (or reuse) the text collection and add the chunks; Chroma computes
    # embeddings with its default embedding function when none are supplied.
    text_collection = client.get_or_create_collection(name="text_collection")

    text_collection.add(
        documents=document_contents,
        ids=ids
    )

    print('Vector DB created successfully!')
    return text_collection
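

# Usage sketch (illustrative; the PDF name and query are hypothetical): build the
# collection from a file under the data directory, then run a quick similarity query
# against it with Chroma's query API.
if __name__ == "__main__":
    collection = create_vector_database(os.path.join(loki, "user_guide.pdf"))
    results = collection.query(query_texts=["How do I reset the device?"], n_results=3)
    for doc_id, text in zip(results["ids"][0], results["documents"][0]):
        print(doc_id, text[:80])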