Spaces:
Sleeping
Sleeping
Upload ingest (1).py
Browse files- ingest (1).py +34 -0
ingest (1).py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Importing Dependencies
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
4 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
+
from langchain.vectorstores import FAISS
|
6 |
+
|
7 |
+
# Directory containing the source PDF files to embed.
DATASET = "dataset/"

# Directory where the FAISS vector index is persisted.
FAISS_INDEX = "vectorstore/"
13 |
+
# Create Vector Store and Index
def embed_all(dataset_dir=DATASET, index_path=FAISS_INDEX):
    """
    Embed all PDF files in a directory and persist a FAISS vector index.

    Args:
        dataset_dir: Directory scanned for ``*.pdf`` files.
            Defaults to the module-level ``DATASET`` path.
        index_path: Directory where the FAISS index is saved.
            Defaults to the module-level ``FAISS_INDEX`` path.

    Raises:
        ValueError: If no PDF documents are found in ``dataset_dir``,
            instead of letting ``FAISS.from_documents`` fail cryptically
            on an empty list.
    """
    # Load every PDF in the dataset directory.
    loader = DirectoryLoader(dataset_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        raise ValueError(f"No PDF documents found in {dataset_dir!r}")
    # Split documents into overlapping chunks so each embedding stays
    # within the model's input limits while preserving context across
    # chunk boundaries (200-character overlap).
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    # Embed the chunks (default HuggingFace sentence-transformer model)
    # and build the FAISS vector store from them.
    embeddings = HuggingFaceEmbeddings()
    vector_store = FAISS.from_documents(chunks, embeddings)
    # Persist the index to disk so it can be reloaded without re-embedding.
    vector_store.save_local(index_path)
|
32 |
+
|
33 |
+
# Script entry point: build and persist the vector store.
if __name__ == "__main__":
    embed_all()
|