Spaces:

thejagstudio
/

narayangpt

Sleeping

App Files Files Community

narayangpt / databaseCreator.py

thejagstudio

Upload 13 files

ba5136e verified over 1 year ago

raw

history blame contribute delete

5.09 kB

	import argparse
	import os
	import shutil
	from langchain_community.document_loaders import PyPDFDirectoryLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain.schema.document import Document
	from langchain_community.vectorstores import Chroma
	from langchain_community.embeddings.bedrock import BedrockEmbeddings
	import json
	import requests
	from chromadb import Documents, EmbeddingFunction, Embeddings

	# Directory passed to Chroma as its persist_directory; removed by clear_database().
	CHROMA_PATH = "chroma"
	# Directory passed to PyPDFDirectoryLoader as the location of the source documents.
	DATA_PATH = "pdfs"


	class MyEmbeddingFunction(EmbeddingFunction):
	    """Embedding function that calls DeepInfra's hosted BAAI/bge-large-en-v1.5 model.

	    Exposes ``embed_documents`` so an instance can be passed as the
	    ``embedding_function`` of the Chroma vector store built in this file.
	    """

	    def embed_documents(self, input: Documents) -> Embeddings:
	        """Return the embeddings the inference endpoint produces for ``input``.

	        The request is attempted up to five times. A failed attempt is
	        reported on stdout and retried; if every attempt fails an error is
	        raised instead of silently returning ``None``.

	        Args:
	            input: Texts to embed; sent as the ``inputs`` field of the JSON
	                body. (The name shadows the builtin but is kept so existing
	                keyword callers continue to work.)

	        Returns:
	            The value of the ``embeddings`` key of the JSON response
	            (presumably one vector per input text -- confirm against the API).

	        Raises:
	            RuntimeError: If all attempts fail; the last underlying error is
	                chained as ``__cause__``.
	        """
	        max_attempts = 5
	        # Without a timeout a stalled connection would block the run forever.
	        timeout_seconds = 120

	        # The request is identical on every attempt, so build it once.
	        url = "https://api.deepinfra.com/v1/inference/BAAI/bge-large-en-v1.5"
	        payload = json.dumps({
	            "inputs": input
	        })
	        headers = {
	            'Accept': 'application/json, text/plain, /',
	            'Accept-Language': 'en-US,en;q=0.9,gu;q=0.8,ru;q=0.7,hi;q=0.6',
	            'Connection': 'keep-alive',
	            'Content-Type': 'application/json',
	            'Origin': 'https://deepinfra.com',
	            'Referer': 'https://deepinfra.com/',
	            'Sec-Fetch-Dest': 'empty',
	            'Sec-Fetch-Mode': 'cors',
	            'Sec-Fetch-Site': 'same-site',
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
	            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
	            'sec-ch-ua-mobile': '?0',
	            'sec-ch-ua-platform': '"Windows"'
	        }

	        last_error = None
	        for attempt in range(1, max_attempts + 1):
	            try:
	                response = requests.post(
	                    url, headers=headers, data=payload, timeout=timeout_seconds
	                )
	                # Surface HTTP errors (rate limits, 5xx) as retryable failures.
	                response.raise_for_status()
	                return response.json()["embeddings"]
	            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
	                # Network/HTTP failure, non-JSON body, or unexpected JSON shape.
	                last_error = exc
	                print(
	                    f"⚠️ Embedding request failed "
	                    f"(attempt {attempt}/{max_attempts}): {exc!r}"
	                )

	        raise RuntimeError(
	            f"Embedding request failed after {max_attempts} attempts"
	        ) from last_error


	def main():
	    """Command-line entry point: optionally reset the store, then ingest the documents."""
	    cli = argparse.ArgumentParser()
	    cli.add_argument("--reset", action="store_true", help="Reset the database.")
	    options = cli.parse_args()

	    # Wipe the existing database first when the --reset flag was given.
	    if options.reset:
	        print("✨ Clearing Database")
	        clear_database()

	    # Create (or update) the data store: load, split, then index.
	    add_to_chroma(split_documents(load_documents()))


	def load_documents():
	    """Run a PyPDFDirectoryLoader over DATA_PATH and return the documents it loads."""
	    print("📚 Loading Documents")
	    return PyPDFDirectoryLoader(DATA_PATH).load()


	def split_documents(documents: list[Document]):
	    """Split ``documents`` with a RecursiveCharacterTextSplitter and return the chunks.

	    The splitter is configured with a chunk size of 4000 and an overlap of
	    100, both measured with ``len``.
	    """
	    print("🔪 Splitting Documents")
	    splitter_options = {
	        "chunk_size": 4000,
	        "chunk_overlap": 100,
	        "length_function": len,
	        "is_separator_regex": True,
	    }
	    splitter = RecursiveCharacterTextSplitter(**splitter_options)
	    return splitter.split_documents(documents)


	def add_to_chroma(chunks: list[Document], batch_size: int = 100):
	    """Add the chunks that are not yet stored to the Chroma database at CHROMA_PATH.

	    Each chunk is given an ID by ``calculate_chunk_ids``; chunks whose ID is
	    already present in the store are skipped. New chunks are written in
	    batches. A batch that fails is reported and skipped so the remaining
	    batches are still attempted (best-effort ingestion).

	    Args:
	        chunks: Document chunks to index. Their ``metadata["id"]`` is set as
	            a side effect of ID calculation.
	        batch_size: Maximum number of chunks sent per ``add_documents`` call.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be at least 1")

	    print("🔗 Adding to Chroma")
	    # Load the existing database.
	    custom_embeddings = MyEmbeddingFunction()
	    db = Chroma(
	        persist_directory=CHROMA_PATH, embedding_function=custom_embeddings
	    )

	    # Calculate Page IDs.
	    chunks_with_ids = calculate_chunk_ids(chunks)

	    # Collect the IDs already stored so existing chunks are not re-added.
	    existing_items = db.get(include=[])  # IDs are always included by default
	    existing_ids = set(existing_items["ids"])
	    print(f"Number of existing documents in DB: {len(existing_ids)}")

	    # Only add documents that don't exist in the DB.
	    new_chunks = [
	        chunk for chunk in chunks_with_ids
	        if chunk.metadata["id"] not in existing_ids
	    ]
	    if not new_chunks:
	        print("✅ No new documents to add")
	        return

	    print(f"👉 Adding new documents: {len(new_chunks)}")
	    added = 0
	    failed = 0
	    for start in range(0, len(new_chunks), batch_size):
	        batch = new_chunks[start:start + batch_size]
	        batch_ids = [chunk.metadata["id"] for chunk in batch]
	        try:
	            db.add_documents(batch, ids=batch_ids)
	            db.persist()
	        except Exception as exc:
	            # Keep going with later batches, but never hide the failure.
	            failed += len(batch)
	            print(
	                f"⚠️ Failed to add documents "
	                f"{start}-{start + len(batch) - 1}: {exc!r}"
	            )
	            continue
	        # Report the real running total; the last batch may be short.
	        added += len(batch)
	        print(f"Added {added} documents")

	    if failed:
	        print(f"⚠️ {failed} documents could not be added")


	def calculate_chunk_ids(chunks):
	    """Tag every chunk with an ID of the form ``"<source>:<page>:<index>"``.

	    ``source`` and ``page`` are read from the chunk's metadata. ``index``
	    counts consecutive chunks that share the same source and page, starting
	    at 0 and restarting whenever the source/page pair changes. The ID is
	    written to ``chunk.metadata["id"]`` and the same ``chunks`` object is
	    returned.
	    """
	    previous_page = None
	    index_on_page = 0

	    for chunk in chunks:
	        meta = chunk.metadata
	        page_key = f"{meta.get('source')}:{meta.get('page')}"

	        # Consecutive chunks from one page get increasing indices.
	        index_on_page = index_on_page + 1 if page_key == previous_page else 0
	        previous_page = page_key

	        # Store the ID in the chunk's metadata.
	        meta["id"] = f"{page_key}:{index_on_page}"

	    return chunks


	def clear_database():
	    """Delete the CHROMA_PATH directory tree if it exists; otherwise do nothing."""
	    if not os.path.exists(CHROMA_PATH):
	        return
	    shutil.rmtree(CHROMA_PATH)


	# Run the ingestion only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()