# Hugging Face Spaces scrape residue removed; this file is a LangChain
# PDF -> PGVector ingestion script.
import os

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_text_splitters import CharacterTextSplitter
# Loading the embedding model once at module import time. With no
# model_name argument, HuggingFaceEmbeddings uses its library default
# sentence-transformers model. NOTE(review): this downloads/loads the
# model on import, not only when vectorize_documents() runs.
embeddings = HuggingFaceEmbeddings()
# Define a function to perform vectorization
def vectorize_documents():
    """Load PDFs from the ``Data`` directory, split them into chunks,
    embed the chunks, and store the vectors in a PGVector collection.

    Reads the PostgreSQL connection string from the
    ``PG_CONNECTION_STRING`` environment variable when set, falling back
    to the original local-development default. Prints a status message
    on success or failure and returns ``None`` in all cases.
    """
    try:
        # Load every PDF in the Data directory via Unstructured.
        loader = DirectoryLoader(
            path="Data",
            glob="./*.pdf",
            loader_cls=UnstructuredFileLoader,
        )
        documents = loader.load()
        if not documents:
            print("No documents found in the specified directory.")
            return

        # Split the documents into overlapping chunks so each embedding
        # covers a bounded span of text (overlap preserves context
        # across chunk boundaries).
        text_splitter = CharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=500,
        )
        text_chunks = text_splitter.split_documents(documents)

        # PostgreSQL / PGVector connection details. Prefer the
        # environment so credentials are not baked into source; the
        # fallback keeps the original local default working unchanged.
        connection_string = os.environ.get(
            "PG_CONNECTION_STRING",
            "postgresql+psycopg2://postgres:krishna23@localhost:5432/vector_db",
        )
        collection_name = "whatsapp_chatbot"

        # Embed each chunk with the module-level model and persist the
        # vectors into the named PGVector collection.
        PGVector.from_documents(
            embedding=embeddings,
            documents=text_chunks,
            collection_name=collection_name,
            connection=connection_string,
        )
        print("Documents vectorized successfully and stored in PGVector.")
    except Exception as e:
        # Best-effort script entry point: report the failure instead of
        # crashing, matching the original behavior.
        print(f"An error occurred: {e}")
# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()