# FAISS vector-store helpers: embed LangChain document chunks and retrieve
# similar context for a query.
from pymilvus import MilvusClient | |
# from langchain_milvus import Milvus | |
from langchain_openai import OpenAIEmbeddings | |
from langchain.docstore.document import Document | |
from tqdm import tqdm | |
from dotenv import load_dotenv | |
from typing import List | |
import os | |
# Load environment variables from a local .env file (expects OPENAI_API_KEY).
load_dotenv()

# SECURITY: never hard-code API keys in source. The previous revision embedded
# a live "sk-proj-..." key here — that key is compromised and must be revoked.
# Read the key from the environment instead (populated by load_dotenv above).
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
from typing import List | |
# ============= Faiss Setup ============= #
from typing import List | |
from langchain_core.documents import Document | |
import faiss | |
from langchain_community.docstore.in_memory import InMemoryDocstore | |
from langchain_community.vectorstores import FAISS | |
from uuid import uuid4 | |
from tqdm import tqdm | |
def add_to_vector_store(docs_chunks: List[Document], batch_size: int = 64,
                        vector_store_path: str = "my_faiss_index"):
    """
    Embed document chunks and persist them in a FAISS vector store.

    Loads an existing index from *vector_store_path* if present; otherwise
    creates a new flat L2 index whose dimensionality is probed from a single
    sample embedding.

    Args:
        docs_chunks (List[Document]): LangChain Document objects to insert.
        batch_size (int): Number of documents embedded per insert call.
        vector_store_path (str): Directory where the FAISS index is saved.

    Returns:
        dict: {"status": "success", "vector_store": FAISS,
               "num_documents": int}.
    """
    print(f">> Starting embedding for {len(docs_chunks)} documents...\n")
    if os.path.exists(vector_store_path):
        print(">> Loading the index <<")
        # allow_dangerous_deserialization unpickles the locally-saved index;
        # safe only because this file is produced by us, never downloaded.
        vector_store = FAISS.load_local(vector_store_path, embeddings,
                                        allow_dangerous_deserialization=True)
    else:
        print(">> Creating the index <<")
        # Probe the embedding model once to learn the vector dimensionality.
        dimension = len(embeddings.embed_query("hello world"))
        index = faiss.IndexFlatL2(dimension)
        # Initialize an empty in-memory vector store around the new index.
        vector_store = FAISS(
            embedding_function=embeddings,
            index=index,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
    # Generate a unique ID per document so repeated inserts never collide.
    uuids = [str(uuid4()) for _ in docs_chunks]
    print(f"\n📦 Preparing to insert {len(docs_chunks)} documents into FAISS...\n")
    # Insert in batches to bound the size of each embedding request.
    for i in tqdm(range(0, len(docs_chunks), batch_size),
                  desc="🚀 Embedding & Inserting", unit="batch"):
        batch_docs = docs_chunks[i:i + batch_size]
        batch_ids = uuids[i:i + batch_size]
        vector_store.add_documents(documents=batch_docs, ids=batch_ids)
    # Persist the index to disk once all batches are in.
    vector_store.save_local(vector_store_path)
    print("✅ Data insertion successful!\n")
    return {
        "status": "success",
        "vector_store": vector_store,
        "num_documents": len(docs_chunks)
    }
def GetContext(query: str, k: int = 2, index_path: str = "my_faiss_index"):
    """
    Retrieve the *k* most similar stored document chunks for *query*.

    Args:
        query (str): Natural-language search string.
        k (int): Number of nearest chunks to return (default 2, preserving
            the original behavior).
        index_path (str): Directory holding the saved FAISS index (default
            matches add_to_vector_store's default save location).

    Returns:
        dict: {"Context": list of matching Document objects}.
    """
    # allow_dangerous_deserialization is acceptable here because the index
    # file is written locally by add_to_vector_store, not fetched remotely.
    vector_store = FAISS.load_local(index_path, embeddings,
                                    allow_dangerous_deserialization=True)
    results = vector_store.similarity_search(query, k=k)
    return {"Context": results}
# No CLI behavior yet; the module is meant to be imported for its helpers.
if __name__ == "__main__":
    pass