# Hugging Face Spaces scrape residue removed; this file is a LangChain
# PDF -> PGVector ingestion script.
import os

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_text_splitters import CharacterTextSplitter
# Loading the embedding model once at module import time. With no
# model_name argument, HuggingFaceEmbeddings uses its library default
# sentence-transformers model. NOTE(review): this downloads/loads the
# model on import, not only when vectorize_documents() runs.
embeddings = HuggingFaceEmbeddings()
# Define a function to perform vectorization
def vectorize_documents():
    """Load PDFs from the ``Data`` directory, split them into chunks,
    embed the chunks, and store the vectors in a PGVector collection.

    Reads the PostgreSQL connection string from the
    ``PG_CONNECTION_STRING`` environment variable when set, falling back
    to the original local-development default. Prints a status message
    on success or failure and returns ``None`` in all cases.
    """
    try:
        # Load every PDF in the Data directory via Unstructured.
        loader = DirectoryLoader(
            path="Data",
            glob="./*.pdf",
            loader_cls=UnstructuredFileLoader,
        )
        documents = loader.load()
        if not documents:
            print("No documents found in the specified directory.")
            return

        # Split the documents into overlapping chunks so each embedding
        # covers a bounded span of text (overlap preserves context
        # across chunk boundaries).
        text_splitter = CharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=500,
        )
        text_chunks = text_splitter.split_documents(documents)

        # PostgreSQL / PGVector connection details. Prefer the
        # environment so credentials are not baked into source; the
        # fallback keeps the original local default working unchanged.
        connection_string = os.environ.get(
            "PG_CONNECTION_STRING",
            "postgresql+psycopg2://postgres:krishna23@localhost:5432/vector_db",
        )
        collection_name = "whatsapp_chatbot"

        # Embed each chunk with the module-level model and persist the
        # vectors into the named PGVector collection.
        PGVector.from_documents(
            embedding=embeddings,
            documents=text_chunks,
            collection_name=collection_name,
            connection=connection_string,
        )
        print("Documents vectorized successfully and stored in PGVector.")
    except Exception as e:
        # Best-effort script entry point: report the failure instead of
        # crashing, matching the original behavior.
        print(f"An error occurred: {e}")
# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()