import streamlit as st
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_pinecone import PineconeVectorStore


def load_bge_embeddings() -> HuggingFaceBgeEmbeddings:
    """Build the BGE embedding function used for queries.

    The model is ``BAAI/bge-small-en-v1.5``, run on CPU, with embedding
    normalization enabled and the fixed query instruction passed below.

    Returns:
        A configured ``HuggingFaceBgeEmbeddings`` instance.
    """
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
        query_instruction="Represent this question for searching relevant passages: ",
    )


def load_pinecone_vectorstore() -> PineconeVectorStore:
    """Create a Pinecone-backed vector store using the BGE embeddings.

    The API key and index name are read from ``st.secrets`` under the keys
    ``"pinecone_api_key"`` and ``"pinecone_index_name"``. Document text is
    read from the ``"text"`` key and the cosine distance strategy is used.

    Returns:
        A configured ``PineconeVectorStore`` instance.
    """
    return PineconeVectorStore(
        embedding=load_bge_embeddings(),
        text_key="text",
        distance_strategy=DistanceStrategy.COSINE,
        pinecone_api_key=st.secrets["pinecone_api_key"],
        index_name=st.secrets["pinecone_index_name"],
    )


def get_vectorstore_filter(ret_config: dict) -> dict:
    """Translate retrieval config values into a metadata filter dict.

    Args:
        ret_config: Mapping that must contain the keys
            ``"filter_legis_id"``, ``"filter_bioguide_id"``,
            ``"filter_congress_nums"`` and ``"filter_sponsor_parties"``.

    Returns:
        A dict with optional exact-match entries for ``"legis_id"`` and
        ``"sponsor_bioguide_id"`` (included only when the corresponding
        config value is set), plus ``"$in"`` clauses for ``"congress_num"``
        and ``"sponsor_party"``.

    Raises:
        KeyError: If any of the required keys is missing from ``ret_config``.
    """
    vs_filter: dict = {}

    # Optional exact-match filters: an empty string or None means "not set".
    # None is skipped as well so that an unset value is never emitted as a
    # filter value.
    legis_id = ret_config["filter_legis_id"]
    if legis_id not in ("", None):
        vs_filter["legis_id"] = legis_id

    bioguide_id = ret_config["filter_bioguide_id"]
    if bioguide_id not in ("", None):
        vs_filter["sponsor_bioguide_id"] = bioguide_id

    # Membership filters are always present, even when the lists are empty.
    # NOTE(review): an empty "$in" list presumably matches nothing (or may be
    # rejected by the backend) - confirm this is the intended behavior.
    vs_filter["congress_num"] = {"$in": ret_config["filter_congress_nums"]}
    vs_filter["sponsor_party"] = {"$in": ret_config["filter_sponsor_parties"]}
    return vs_filter