# Hugging Face Spaces page header (Space status: Sleeping)
from typing import List, Type | |
from langchain.docstore.document import Document | |
from langchain.embeddings import HuggingFaceBgeEmbeddings | |
from langchain.embeddings.base import Embeddings | |
from langchain.vectorstores import VectorStore | |
from langchain.vectorstores.faiss import FAISS | |
from .debug import FakeEmbeddings, FakeVectorStore | |
from .parsing import File | |
class FolderIndex:
    """Index for a collection of files (a folder).

    Pairs a list of files with the vector store built from their documents.
    """

    def __init__(self, files: List[File], index: VectorStore):
        """Store the files together with an already-built vector index.

        Args:
            files: The files associated with this index.
            index: The vector store to keep on the instance.
        """
        self.name: str = "default"
        self.files = files
        self.index: VectorStore = index

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Combine all the documents in a list of files into a single list.

        Each document's ``metadata`` is updated in place with the name and id
        of the file it came from (keys ``"file_name"`` and ``"file_id"``).

        Args:
            files: Files whose ``docs`` should be flattened.

        Returns:
            The same document objects, in file order then document order.
        """
        all_texts = []
        for file in files:
            for doc in file.docs:
                # Tag each document with its origin before indexing.
                doc.metadata["file_name"] = file.name
                doc.metadata["file_id"] = file.id
                all_texts.append(doc)
        return all_texts

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Create an index from files.

        Args:
            files: Files whose documents are indexed.
            embeddings: Embedding object handed to the vector store.
            vector_store: Vector store class; its ``from_documents`` is called
                with ``documents=`` and ``embedding=`` keyword arguments.

        Returns:
            A new ``FolderIndex`` holding ``files`` and the built index.
        """
        all_docs = cls._combine_files(files)
        index = vector_store.from_documents(
            documents=all_docs,
            embedding=embeddings,
        )
        return cls(files=files, index=index)
def _select_device() -> str:
    """Return ``"mps"`` when PyTorch reports that backend usable, else ``"cpu"``."""
    try:
        import torch
    except ImportError:
        return "cpu"
    # Older PyTorch builds have no ``torch.backends.mps`` attribute at all.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


def embed_files(
    files: List[File], embedding: str, vector_store: str, **kwargs
) -> FolderIndex:
    """Embed a collection of files and store them in a ``FolderIndex``.

    The documents are embedded with the ``BAAI/bge-small-en`` model through
    ``HuggingFaceBgeEmbeddings`` and indexed with ``FAISS``.

    Args:
        files: Files to embed and index.
        embedding: Accepted for interface compatibility; currently unused.
        vector_store: Accepted for interface compatibility; currently unused.
        **kwargs: Accepted for interface compatibility; currently unused.

    Returns:
        A ``FolderIndex`` built from ``files``.
    """
    model_name = "BAAI/bge-small-en"
    # Use Apple's MPS backend only where it exists; a hard-coded "mps" fails
    # on every other host.
    model_kwargs = {"device": _select_device()}
    encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return FolderIndex.from_files(
        files=files, embeddings=embeddings, vector_store=FAISS
    )