# import hashlib
import uuid
from typing import Any, Iterable, List, Optional

import numpy as np

# The star import also provides Document, Embeddings, AddableMixin and
# dependable_faiss_import, which are used below; uuid, numpy and the typing
# names are imported explicitly above in case the star import does not
# re-export them.
from langchain.vectorstores.faiss import *
from langchain.vectorstores.faiss import FAISS as OriginalFAISS

from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore
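

# NOTE: this subclass is deliberately also named `FAISS`. Name mangling turns
# `__add` and `__from` into `_FAISS__add` / `_FAISS__from`, so sharing the
# class name lets these overrides intercept even the base class's internal
# `self.__add(...)` and `cls.__from(...)` calls.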
class FAISS(OriginalFAISS):
    def __add(
        self,
        texts: Iterable[str],
        embeddings: Iterable[List[float]],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        # Add to the index, the index_to_id mapping, and the docstore.
        starting_len = len(self.index_to_docstore_id)
        self.index.add(np.array(embeddings, dtype=np.float32))
        # Get list of index, id, and docs.
        full_info = [
            (starting_len + i, str(uuid.uuid4()), doc)
            for i, doc in enumerate(documents)
        ]
        # Add information to docstore and index.
        self.docstore.add({_id: doc for _, _id, doc in full_info})
        index_to_id = {index: _id for index, _id, _ in full_info}
        self.index_to_docstore_id.update(index_to_id)
        return [_id for _, _id, _ in full_info]
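
    # The positions assigned in `__add` line up one-to-one with the keys added
    # to `index_to_docstore_id`; similarity search relies on that mapping to
    # turn FAISS hits back into Documents.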
    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FAISS":
        faiss = dependable_faiss_import()
        # Exact L2 index; the dimensionality is taken from the first embedding.
        index = faiss.IndexFlatL2(len(embeddings[0]))
        index.add(np.array(embeddings, dtype=np.float32))
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
        # # TODO: switch to using the hash as the ID, and work out where it
        # # would go so that the chunk is not loaded into the dataset
        # index_to_id_2 = dict()
        # for i in range(len(documents)):
        #     h = hashlib.new('sha256')
        #     text_ = documents[i].page_content
        #     h.update(text_.encode())
        #     index_to_id_2[i] = str(h.hexdigest())
        # #
        docstore = InMemoryDocstore(
            {index_to_id[i]: doc for i, doc in enumerate(documents)}
        )
        return cls(embedding.embed_query, index, docstore, index_to_id)
    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FAISS":
"""Construct FAISS wrapper from raw documents. | |
This is a user friendly interface that: | |
1. Embeds documents. | |
2. Creates an in memory docstore | |
3. Initializes the FAISS database | |
This is intended to be a quick way to get started. | |
Example: | |
.. code-block:: python | |
from langchain import FAISS | |
from langchain.embeddings import OpenAIEmbeddings | |
embeddings = OpenAIEmbeddings() | |
faiss = FAISS.from_texts(texts, embeddings) | |
""" | |
        # embeddings = embedding.embed_documents(texts)
        print(f"len(texts): {len(texts)}")  # TODO: remove (debug)
        # Embed one text per call rather than one batched call, presumably to
        # keep each embedding request small.
        embeddings = [embedding.embed_documents([text])[0] for text in texts]
        print(f"len(embeddings): {len(embeddings)}")  # TODO: remove (debug)
        return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
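

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: exercises the
    # customized class end-to-end. `_DemoEmbeddings` is a made-up toy embedder
    # so the example runs without an API key; swap in a real Embeddings
    # implementation (e.g. OpenAIEmbeddings) for actual use.
    class _DemoEmbeddings(Embeddings):
        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            # Toy 2-d "embedding": text length and vowel count.
            return [
                [float(len(t)), float(sum(c in "aeiou" for c in t))]
                for t in texts
            ]

        def embed_query(self, text: str) -> List[float]:
            return self.embed_documents([text])[0]

    store = FAISS.from_texts(
        ["first chunk of text", "second chunk"],
        _DemoEmbeddings(),
        metadatas=[{"source": "a"}, {"source": "b"}],
    )
    print(store.similarity_search("first chunk", k=1))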