# Titan/utils/ingest_text.py
# Ingest Text: parse PDFs, chunk them, and store the chunks in a Chroma collection
import os
import pickle

import chromadb
import nest_asyncio
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Allow nested event loops (LlamaParse makes async calls inside scripts/notebooks)
nest_asyncio.apply()

# Persistent Chroma client backing the multimodal vector store
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

# API keys (hardcoded here; loading them from environment variables is safer)
llamaparse_api_key = "llx-qXMliHH4UOphFaahO8HEqR5wOj1U6T7oxqC4DoLiik7UvKkJ"
groq_api_key = "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"

# File locations used during ingestion
parsed_data_file = r"parsed_data.pkl"
output_md = r"output.md"
loki = r"data"
# Load cached parsed data if available, otherwise parse the document with LlamaParse
def load_or_parse_data(loc):
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Load the previously parsed data from the pickle file
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructiontest10k = """The provided document is a user guide or a manual.
It contains many images and tables.
Try to be precise while answering the questions."""
        parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k)  # type: ignore
        llama_parse_documents = parser.load_data(loc)

        # Cache the parsed data so the next run can skip parsing
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        parsed_data = llama_parse_documents

    return parsed_data
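
# Usage sketch ("data/manual.pdf" is a hypothetical path, not a file shipped with this
# repo). The first call parses the document through LlamaParse and caches the result in
# parsed_data.pkl; later calls reuse the cache. LlamaParse returns llama_index Document
# objects, which expose the parsed markdown via .text.
#
#   documents = load_or_parse_data("data/manual.pdf")
#   print(f"Parsed {len(documents)} document(s)")
#   print(documents[0].text[:200])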
# Create the text vector database
def create_vector_database(loc):
    """
    Creates a text vector database from the PDF at `loc`.

    The PDF is loaded page by page with PyMuPDFLoader, each page is split into
    overlapping chunks, and the chunks are persisted into a Chroma collection
    (embedded with Chroma's default embedding function).
    """
    # Load the PDF; PyMuPDFLoader returns one Document per page
    loader = PyMuPDFLoader(file_path=loc)
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")
    print("Vector DB started!")
    # Collect chunk contents and matching unique IDs
    document_contents = []
    ids = []

    # One splitter reused for every page
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    # Generate unique IDs for each chunk, with the PDF page number first
    for i, doc in enumerate(docs):
        # Print metadata to understand its structure
        print(f"Metadata for document {i+1}: {doc.metadata}")

        # PyMuPDFLoader normally stores the page index under 'page'; fall back to
        # 'page_number' or a positional value so IDs stay unique either way
        page_num = doc.metadata.get('page', doc.metadata.get('page_number', f'unknown_{i+1}'))

        # Split the page content into overlapping chunks
        doc_chunks = text_splitter.split_text(doc.page_content)

        # Add chunk contents and corresponding page-based IDs
        for chunk_idx, chunk in enumerate(doc_chunks):
            document_contents.append(chunk)
            ids.append(f"page_{page_num}_chunk_{i+1}_{chunk_idx+1}")

    # Ensure the number of IDs matches the number of chunk contents
    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"
    # Create or get the text collection and add the chunks; Chroma embeds them
    # with its default embedding function
    text_collection = client.get_or_create_collection(name="text_collection")
    text_collection.add(
        documents=document_contents,  # All the chunk-level content
        ids=ids,                      # Matching IDs for each chunk
    )

    print('Vector DB created successfully!')
    return text_collection
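
# Usage sketch ("data/manual.pdf" is a hypothetical input path). After ingestion the
# persistent Chroma collection can be searched by text with the standard query API;
# Chroma embeds the query with the same default embedding function it used for the chunks.
#
#   collection = create_vector_database("data/manual.pdf")
#   results = collection.query(query_texts=["How do I reset the device?"], n_results=3)
#   for chunk_id, chunk in zip(results["ids"][0], results["documents"][0]):
#       print(chunk_id, chunk[:120])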