# MonPol / filterminutes.py
# Source: Hugging Face Space file snapshot (commit 42a39da,
# "Some frontend cosmetics and refactoring.", 3.12 kB).
import logging
import gradio as gr
import numpy as np
# Module-level logger shared by the filtering helpers below.
log = logging.getLogger('filter methods')
# Configure the root logger at import time so log.info output is visible
# when this module is used inside the app.
logging.basicConfig(level=logging.INFO)
def filter_docs_by_meta(docs, filter_dict):
    """
    Filter documents whose metadata matches every key/value pair in filter_dict.

    Parameters
    ----------
    docs : List[langchain.schema.Document]
        Documents to filter.
    filter_dict : Dict[str, Any]
        Metadata keys and the values they must equal. Values that arrive as
        strings (e.g. from UI widgets) are coerced to int when possible, so
        '1994' still matches metadata stored as 1994; non-numeric values are
        compared as-is instead of raising ValueError.

    Returns
    -------
    List of documents whose metadata matches all filter criteria. Documents
    missing a filtered key simply do not match (no KeyError).

    Examples
    --------
    docs = [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1'),
            langchain.schema.Document(metadata={'a': 1, 'b': 3}, text='text2')]
    filter_docs_by_meta(docs, {'a': 1})
    -> both documents
    filter_docs_by_meta(docs, {'a': 1, 'b': 2})
    -> [langchain.schema.Document(metadata={'a': 1, 'b': 2}, text='text1')]
    """
    def _coerce(value):
        # Preserve the original behaviour of comparing against int(value)
        # (UI components often deliver numbers as strings), but fall back to
        # the raw value instead of crashing on non-numeric filters.
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    targets = {key: _coerce(value) for key, value in filter_dict.items()}
    # .get() returns None for absent keys, so such documents fail the match
    # rather than raising KeyError.
    return [
        doc for doc in docs
        if all(doc.metadata.get(key) == value for key, value in targets.items())
    ]
def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50):
    """
    Expand search with filter until reaching at least a pre-determined number of documents.

    Parameters
    ----------
    vector_store : langchain.vectorstores.FAISS
        The FAISS vector store.
    query : str
        The query to search for.
    filter_dict : Dict[str, Any]
        The metadata parameters to filter for.
    target_k : int
        The minimum number of documents desired after filtering.
    init_k : int
        The top-k documents to extract for the initial search.
    step : int
        The size of the step when enlarging the search.

    Returns
    -------
    List of at least target_k documents matching filter_dict when possible;
    otherwise as many matches as the widest search (k < MAX_K) produced.
    """
    context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
    if len(context) >= target_k:
        # Lazy %-style args avoid building the message when INFO is disabled.
        log.info('Initial search contains %d documents. Expansion not required.', len(context))
        return context
    # Upper bound on k; assumed to exceed the number of documents in the store.
    MAX_K = 50000
    # Start one step beyond init_k: re-querying at k=init_k would only repeat
    # the search we already know falls short of target_k. Plain range()
    # suffices here; no need for np.arange plus an int() cast.
    for top_k_docs in range(init_k + step, MAX_K, step):
        log.info('Context contains %d documents', len(context))
        log.info('Expanding search with k=%d', top_k_docs)
        context = filter_docs_by_meta(vector_store.similarity_search(query, k=top_k_docs), filter_dict)
        if len(context) >= target_k:
            log.info('Success. Context contains %d documents matching the filtering criteria', len(context))
            return context
    log.info('Failed to reach target number of documents,'
             ' context contains %d documents matching the filtering criteria', len(context))
    return context