# Titan/utils/ingest_text.py
# Ingest Text: parse PDFs, chunk them, and store the chunks in a Chroma collection
import os
import pickle

import chromadb
import nest_asyncio
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Allow nested event loops (LlamaParse makes async calls inside scripts/notebooks)
nest_asyncio.apply()

# Persistent Chroma client backing the multimodal vector store
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

# API keys (hardcoded here; loading them from environment variables is safer)
llamaparse_api_key = "llx-qXMliHH4UOphFaahO8HEqR5wOj1U6T7oxqC4DoLiik7UvKkJ"
groq_api_key = "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"

# File locations used during ingestion
parsed_data_file = r"parsed_data.pkl"
output_md = r"output.md"
loki = r"data"
# Load cached parsed data if available, otherwise parse the document with LlamaParse
def load_or_parse_data(loc):
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Load the previously parsed data from the pickle file
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructiontest10k = """The provided document is a user guide or a manual.
It contains many images and tables.
Try to be precise while answering the questions."""
        parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k)  # type: ignore
        llama_parse_documents = parser.load_data(loc)

        # Cache the parsed data so the next run can skip parsing
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        parsed_data = llama_parse_documents

    return parsed_data
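
# Usage sketch ("data/manual.pdf" is a hypothetical path, not a file shipped with this
# repo). The first call parses the document through LlamaParse and caches the result in
# parsed_data.pkl; later calls reuse the cache. LlamaParse returns llama_index Document
# objects, which expose the parsed markdown via .text.
#
#   documents = load_or_parse_data("data/manual.pdf")
#   print(f"Parsed {len(documents)} document(s)")
#   print(documents[0].text[:200])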
# Create the text vector database
def create_vector_database(loc):
    """
    Creates a text vector database from the PDF at `loc`.

    The PDF is loaded page by page with PyMuPDFLoader, each page is split into
    overlapping chunks, and the chunks are persisted into a Chroma collection
    (embedded with Chroma's default embedding function).
    """
    # Load the PDF; PyMuPDFLoader returns one Document per page
    loader = PyMuPDFLoader(file_path=loc)
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")
    print("Vector DB started!")
    # Collect chunk contents and matching unique IDs
    document_contents = []
    ids = []

    # One splitter reused for every page
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    # Generate unique IDs for each chunk, with the PDF page number first
    for i, doc in enumerate(docs):
        # Print metadata to understand its structure
        print(f"Metadata for document {i+1}: {doc.metadata}")

        # PyMuPDFLoader normally stores the page index under 'page'; fall back to
        # 'page_number' or a positional value so IDs stay unique either way
        page_num = doc.metadata.get('page', doc.metadata.get('page_number', f'unknown_{i+1}'))

        # Split the page content into overlapping chunks
        doc_chunks = text_splitter.split_text(doc.page_content)

        # Add chunk contents and corresponding page-based IDs
        for chunk_idx, chunk in enumerate(doc_chunks):
            document_contents.append(chunk)
            ids.append(f"page_{page_num}_chunk_{i+1}_{chunk_idx+1}")

    # Ensure the number of IDs matches the number of chunk contents
    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"
    # Create or get the text collection and add the chunks; Chroma embeds them
    # with its default embedding function
    text_collection = client.get_or_create_collection(name="text_collection")
    text_collection.add(
        documents=document_contents,  # All the chunk-level content
        ids=ids,                      # Matching IDs for each chunk
    )

    print('Vector DB created successfully!')
    return text_collection
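
# Usage sketch ("data/manual.pdf" is a hypothetical input path). After ingestion the
# persistent Chroma collection can be searched by text with the standard query API;
# Chroma embeds the query with the same default embedding function it used for the chunks.
#
#   collection = create_vector_database("data/manual.pdf")
#   results = collection.query(query_texts=["How do I reset the device?"], n_results=3)
#   for chunk_id, chunk in zip(results["ids"][0], results["documents"][0]):
#       print(chunk_id, chunk[:120])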