import logging

log = logging.getLogger('filter methods')
logging.basicConfig(level=logging.INFO)


def _metadata_matches(metadata, filter_dict):
    """Return True when *metadata* holds every key of *filter_dict* with an equal value."""
    for key, value in filter_dict.items():
        # A missing key is a non-match rather than an error, so heterogeneous
        # document collections can be filtered without raising KeyError.
        if key not in metadata or metadata[key] != value:
            return False
    return True


def filter_docs_by_meta(docs, filter_dict):
    """
    Filter documents by multiple metadata parameters.

    A document is kept only if, for every ``key: value`` pair in
    ``filter_dict``, ``doc.metadata[key]`` exists and equals ``value``.
    Documents that lack one of the keys are dropped. An empty
    ``filter_dict`` keeps every document.

    Parameters:
        docs : List[langchain.schema.Document]
            Documents to filter; only their ``metadata`` attribute is read.
        filter_dict : Dict[str, Any]
            Metadata key/value pairs that must all match.

    Returns:
        List of the documents that match, in their original order.

    Examples:
        docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, page_content='text1'),
                langchain.schema.Document(metadata={'a': 1, 'b': 3}, page_content='text2')]

        filter_docs_by_meta(docs, {'a': 1})
        # -> both documents (each has a == 1)

        filter_docs_by_meta(docs, {'a': 1, 'b': 2})
        # -> only the first document
    """
    return [doc for doc in docs if _metadata_matches(doc.metadata, filter_dict)]


def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50):
    """
    Expand a similarity search until enough documents pass the metadata filter.

    The store is queried for the top ``init_k`` documents, which are then
    filtered with ``filter_docs_by_meta``. While fewer than ``target_k``
    documents remain, ``k`` is enlarged by ``step`` and the search is repeated.
    Expansion stops early when the store returns fewer documents than were
    requested (it has nothing more to offer) or when ``step`` is not positive
    (the search cannot grow), so the call always terminates.

    ----------
    Parameters
    vector_store : langchain.vectorstores.FAISS
        The FAISS vector store (any object exposing
        ``similarity_search(query, k=...)`` works).
    query : str
        The query to search for.
    filter_dict : Dict[str, Any]
        The parameters to filter for.
    target_k : int
        The minimum number of documents desired after filtering.
    init_k : int
        The top-k documents to extract for the initial search.
    step : int
        The size of the step when enlarging the search.

    Returns:
        List of Documents matching the filter, for post-processing. It holds
        at least ``target_k`` documents unless the search could not be
        expanded any further, in which case it holds every match found.
    """
    k = init_k
    retrieved = vector_store.similarity_search(query, k=k)
    context = filter_docs_by_meta(retrieved, filter_dict)

    while len(context) < target_k:
        # Fewer hits than requested means the store is exhausted; a
        # non-positive step means k would never grow. Either way, searching
        # again cannot surface new documents, so stop instead of spinning.
        if len(retrieved) < k or step <= 0:
            log.warning(
                'Search cannot be expanded further; only %d of %d requested documents found',
                len(context),
                target_k,
            )
            break

        log.info('Context contains %d documents', len(context))
        k += step
        log.info('Expanding search with k=%d', k)
        retrieved = vector_store.similarity_search(query, k=k)
        context = filter_docs_by_meta(retrieved, filter_dict)

    log.info('Done. Context contains %d Documents matching the filtering criteria', len(context))
    return context