Spaces:
Runtime error
Runtime error
import logging | |
import gradio as gr | |
import numpy as np | |
# Module-level logger used by the filtering/search helpers in this file.
log = logging.getLogger('filter methods')
# Configure the root logger at INFO level (an import-time side effect) so the
# log.info() progress messages emitted during search expansion are shown.
logging.basicConfig(level=logging.INFO)
def filter_docs_by_meta(docs, filter_dict):
    """
    Filter documents by multiple metadata parameters.

    A document is kept only when, for every key in ``filter_dict``, its
    ``metadata`` contains that key with a value equal to the integer form of
    the requested value. A document whose metadata lacks a filtered key is
    treated as non-matching rather than raising ``KeyError``.

    Parameters:
        docs : List[langchain.schema.Document]
            Objects exposing a ``metadata`` mapping.
        filter_dict : Dict[str, Any]
            Maps a metadata key to the required value. Every value is
            converted with ``int()`` before it is compared.

    Returns: List of filtered documents, in their original order.

    Raises:
        ValueError, TypeError: if a filter value cannot be converted to int.

    Examples:
        docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1'),
                langchain.schema.Document(metadata={'a': 1, 'b': 3}, text='text2')]

        filter_dict = {'a': 1}
        filter_docs_by_meta(docs, filter_dict)
        [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1'),
         langchain.schema.Document(metadata={'a': 1, 'b': 3}, text='text2')]

        filter_dict = {'a': 1, 'b': 2}
        filter_docs_by_meta(docs, filter_dict)
        [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')]
    """
    # Convert the requested values once instead of once per document and key.
    wanted = {key: int(value) for key, value in filter_dict.items()}

    return [
        doc
        for doc in docs
        # The membership test makes a missing key a mismatch, not a KeyError.
        if all(
            key in doc.metadata and doc.metadata[key] == target
            for key, target in wanted.items()
        )
    ]
def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50, max_k=50000):
    """
    Expand search with filter until reaching at least a pre-determined number of documents.

    The store is queried for the top ``init_k`` documents and the results are
    filtered by metadata. While fewer than ``target_k`` documents survive the
    filter, the query is repeated with ``k`` enlarged by ``step`` each time,
    for as long as ``k`` stays below ``max_k``.

    ----------
    Parameters
    vector_store : langchain.vectorstores.FAISS
        The FAISS vector store.
    query : str
        The query to search for.
    filter_dict : Dict[str, Any]
        The parameters to filter for.
    target_k : int
        The minimum number of documents desired after filtering.
    init_k : int
        The top-k documents to extract for the initial search.
    step : int
        The size of the step when enlarging the search.
    max_k : int
        Exclusive upper bound on the k used when enlarging the search. The
        default of 50000 is meant to exceed the number of actual documents.

    Returns: List of filtered Documents for post-processing. It holds at
        least target_k documents when the search succeeds; otherwise it holds
        whatever the last attempted search yielded, which may be fewer.
    """
    context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
    if len(context) >= target_k:
        log.info('Initial search contains %d documents. Expansion not required. ', len(context))
        return context

    # k=init_k was already searched above, so the expansion starts one step
    # beyond it instead of repeating the same query.
    for top_k_docs in range(init_k + step, max_k, step):
        log.info('Context contains %d documents', len(context))
        log.info('Expanding search with k=%d', top_k_docs)
        context = filter_docs_by_meta(vector_store.similarity_search(query, k=top_k_docs), filter_dict)
        if len(context) >= target_k:
            log.info('Success. Context contains %d documents matching the filtering criteria', len(context))
            return context

    log.info('Failed to reach target number of documents,'
             ' context contains %d documents matching the filtering criteria', len(context))
    return context