gabrielaltay committed
Commit 76cbdff · 1 Parent(s): 793d0f2

more files

Files changed (8):

1. app.py +27 -231
2. doc_format_mod.py +102 -0
3. guide_mod.py +22 -0
4. retriever_tools.py +0 -79
5. sidebar_mod.py +20 -0
6. usage.py → usage_mod.py +18 -18
7. utils_mod.py +47 -0
8. vectorstore_mod.py +46 -0
app.py CHANGED
@@ -10,15 +10,17 @@ from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnableParallel
 from langchain_core.runnables import RunnablePassthrough
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-from langchain_community.vectorstores.utils import DistanceStrategy
 from langchain_openai import ChatOpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_together import ChatTogether
-from langchain_pinecone import PineconeVectorStore
 import streamlit as st

-import usage
+import utils_mod
+import doc_format_mod
+import guide_mod
+import sidebar_mod
+import usage_mod
+import vectorstore_mod


 st.set_page_config(layout="wide", page_title="LegisQA")
@@ -32,16 +34,7 @@ SS = st.session_state
 SEED = 292764
 CONGRESS_NUMBERS = [113, 114, 115, 116, 117, 118]
 SPONSOR_PARTIES = ["D", "R", "L", "I"]
-CONGRESS_GOV_TYPE_MAP = {
-    "hconres": "house-concurrent-resolution",
-    "hjres": "house-joint-resolution",
-    "hr": "house-bill",
-    "hres": "house-resolution",
-    "s": "senate-bill",
-    "sconres": "senate-concurrent-resolution",
-    "sjres": "senate-joint-resolution",
-    "sres": "senate-resolution",
-}
+
 OPENAI_CHAT_MODELS = {
     "gpt-4o-mini": {"cost": {"pmi": 0.15, "pmo": 0.60}},
     "gpt-4o": {"cost": {"pmi": 5.00, "pmo": 15.0}},
@@ -68,190 +61,6 @@ PROVIDER_MODELS = {
 }


-def get_sponsor_url(bioguide_id: str) -> str:
-    return f"https://bioguide.congress.gov/search/bio/{bioguide_id}"
-
-
-def get_congress_gov_url(congress_num: int, legis_type: str, legis_num: int) -> str:
-    lt = CONGRESS_GOV_TYPE_MAP[legis_type]
-    return f"https://www.congress.gov/bill/{int(congress_num)}th-congress/{lt}/{int(legis_num)}"
-
-
-def load_bge_embeddings():
-    model_name = "BAAI/bge-small-en-v1.5"
-    model_kwargs = {"device": "cpu"}
-    encode_kwargs = {"normalize_embeddings": True}
-    emb_fn = HuggingFaceBgeEmbeddings(
-        model_name=model_name,
-        model_kwargs=model_kwargs,
-        encode_kwargs=encode_kwargs,
-        query_instruction="Represent this question for searching relevant passages: ",
-    )
-    return emb_fn
-
-
-def load_pinecone_vectorstore():
-    emb_fn = load_bge_embeddings()
-    vectorstore = PineconeVectorStore(
-        embedding=emb_fn,
-        text_key="text",
-        distance_strategy=DistanceStrategy.COSINE,
-        pinecone_api_key=st.secrets["pinecone_api_key"],
-        index_name=st.secrets["pinecone_index_name"],
-    )
-    return vectorstore
-
-
-def render_outreach_links():
-    nomic_base_url = "https://atlas.nomic.ai/data/gabrielhyperdemocracy"
-    nomic_map_name = "us-congressional-legislation-s1024o256nomic-1"
-    nomic_url = f"{nomic_base_url}/{nomic_map_name}/map"
-    hf_url = "https://huggingface.co/hyperdemocracy"
-    pc_url = "https://www.pinecone.io/blog/serverless"
-    together_url = "https://www.together.ai/"
-    st.subheader(":brain: About [hyperdemocracy](https://hyperdemocracy.us)")
-    st.subheader(f":world_map: Visualize [nomic atlas]({nomic_url})")
-    st.subheader(f":hugging_face: Raw [huggingface datasets]({hf_url})")
-    st.subheader(f":evergreen_tree: Index [pinecone serverless]({pc_url})")
-    st.subheader(f":pancakes: Inference [together.ai]({together_url})")
-
-
-def render_sidebar():
-
-    with st.container(border=True):
-        render_outreach_links()
-
-
-def group_docs(docs) -> list[tuple[str, list[Document]]]:
-    doc_grps = defaultdict(list)
-
-    # create legis_id groups
-    for doc in docs:
-        doc_grps[doc.metadata["legis_id"]].append(doc)
-
-    # sort docs in each group by start index
-    for legis_id in doc_grps.keys():
-        doc_grps[legis_id] = sorted(
-            doc_grps[legis_id],
-            key=lambda x: x.metadata["start_index"],
-        )
-
-    # sort groups by number of docs
-    doc_grps = sorted(
-        tuple(doc_grps.items()),
-        key=lambda x: -len(x[1]),
-    )
-
-    return doc_grps
-
-
-def format_docs(docs: list[Document]) -> str:
-    """JSON grouped"""
-
-    doc_grps = group_docs(docs)
-    out = []
-    for legis_id, doc_grp in doc_grps:
-        dd = {
-            "legis_id": doc_grp[0].metadata["legis_id"],
-            "title": doc_grp[0].metadata["title"],
-            "introduced_date": doc_grp[0].metadata["introduced_date"],
-            "sponsor": doc_grp[0].metadata["sponsor_full_name"],
-            "snippets": [doc.page_content for doc in doc_grp],
-        }
-        out.append(dd)
-    return json.dumps(out, indent=4)
-
-
-def escape_markdown(text: str) -> str:
-    MD_SPECIAL_CHARS = r"\`*_{}[]()#+-.!$"
-    for char in MD_SPECIAL_CHARS:
-        text = text.replace(char, "\\" + char)
-    return text
-
-
-def get_vectorstore_filter(ret_config: dict) -> dict:
-    vs_filter = {}
-    if ret_config["filter_legis_id"] != "":
-        vs_filter["legis_id"] = ret_config["filter_legis_id"]
-    if ret_config["filter_bioguide_id"] != "":
-        vs_filter["sponsor_bioguide_id"] = ret_config["filter_bioguide_id"]
-    vs_filter = {
-        **vs_filter,
-        "congress_num": {"$in": ret_config["filter_congress_nums"]},
-    }
-    vs_filter = {
-        **vs_filter,
-        "sponsor_party": {"$in": ret_config["filter_sponsor_parties"]},
-    }
-    return vs_filter
-
-
-def render_doc_grp(legis_id: str, doc_grp: list[Document]):
-    first_doc = doc_grp[0]
-
-    congress_gov_url = get_congress_gov_url(
-        first_doc.metadata["congress_num"],
-        first_doc.metadata["legis_type"],
-        first_doc.metadata["legis_num"],
-    )
-    congress_gov_link = f"[congress.gov]({congress_gov_url})"
-
-    ref = "{} chunks from {}\n\n{}\n\n{}\n\n[{} ({}) ]({})".format(
-        len(doc_grp),
-        first_doc.metadata["legis_id"],
-        first_doc.metadata["title"],
-        congress_gov_link,
-        first_doc.metadata["sponsor_full_name"],
-        first_doc.metadata["sponsor_bioguide_id"],
-        get_sponsor_url(first_doc.metadata["sponsor_bioguide_id"]),
-    )
-    doc_contents = [
-        "[start_index={}] ".format(int(doc.metadata["start_index"])) + doc.page_content
-        for doc in doc_grp
-    ]
-    with st.expander(ref):
-        st.write(escape_markdown("\n\n...\n\n".join(doc_contents)))
-
-
-def legis_id_to_link(legis_id: str) -> str:
-    congress_num, legis_type, legis_num = legis_id.split("-")
-    return get_congress_gov_url(congress_num, legis_type, legis_num)
-
-
-def legis_id_match_to_link(matchobj):
-    mstring = matchobj.string[matchobj.start() : matchobj.end()]
-    url = legis_id_to_link(mstring)
-    link = f"[{mstring}]({url})"
-    return link
-
-
-def replace_legis_ids_with_urls(text):
-    pattern = "11[345678]-[a-z]+-\d{1,5}"
-    rtext = re.sub(pattern, legis_id_match_to_link, text)
-    return rtext
-
-
-def render_guide():
-
-    st.write(
-        """
-When you send a query to LegisQA, it will attempt to retrieve relevant content from the past six congresses ([113th-118th](https://en.wikipedia.org/wiki/List_of_United_States_Congresses)) covering 2013 to the present, pass it to a [large language model (LLM)](https://en.wikipedia.org/wiki/Large_language_model), and generate a response. This technique is known as Retrieval Augmented Generation (RAG). You can read [an academic paper](https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html) or [a high level summary](https://research.ibm.com/blog/retrieval-augmented-generation-RAG) to get more details. Once the response is generated, the retrieved content will be available for inspection with links to the bills and sponsors.
-
-
-## Disclaimer
-
-This is a research project. The RAG technique helps to ground the LLM response by providing context from a trusted source, but it does not guarantee a high quality response. We encourage you to play around, find questions that work and find questions that fail. There is a small monthly budget dedicated to the OpenAI endpoints. Once that is used up each month, queries will no longer work.
-
-
-## Config
-
-Use the `Generative Config` to change LLM parameters.
-Use the `Retrieval Config` to change the number of chunks retrieved from our congress corpus and to apply various filters to the content before it is retrieved (e.g. filter to a specific set of congresses). Use the `Prompt Config` to try out different document formatting and prompting strategies.
-
-"""
-    )
-
-
 def render_example_queries():

     with st.expander("Example Queries"):
@@ -413,7 +222,7 @@ def get_llm(gen_config: dict):


 def create_rag_chain(llm, retriever):
-    QUERY_RAG_TEMPLATE = """You are an expert legislative analyst. Use the following excerpts from US congressional legislation to respond to the user's query. The excerpts are formatted as a JSON list. Each JSON object has "legis_id", "title", "introduced_date", "sponsor", and "snippets" keys. If a snippet is useful in writing part of your response, then cite the "legis_id", "title", "introduced_date", and "sponsor" in the response. If you don't know how to respond, just tell the user.
+    QUERY_RAG_TEMPLATE = """You are an expert legislative analyst. Use the following excerpts from US congressional legislation to respond to the user's query. The excerpts are formatted as a JSON list. Each JSON object has "legis_id", "title", "introduced_date", "sponsor", and "snippets" keys. If a snippet is useful in writing part of your response, then cite the "legis_id", "title", "introduced_date", and "sponsor" in the response. When citing legis_id, use the same format as the excerpts (e.g. "116-hr-125"). If you don't know how to respond, just tell the user.

 ---

@@ -438,7 +247,7 @@ Query: {query}"""
             "query": RunnablePassthrough(),
         }
     )
-        .assign(context=lambda x: format_docs(x["docs"]))
+        .assign(context=lambda x: doc_format_mod.format_docs(x["docs"]))
         .assign(aimessage=prompt | llm)
     )

@@ -446,9 +255,9 @@ Query: {query}"""


 def process_query(gen_config: dict, ret_config: dict, query: str):
-    vectorstore = load_pinecone_vectorstore()
+    vectorstore = vectorstore_mod.load_pinecone_vectorstore()
     llm = get_llm(gen_config)
-    vs_filter = get_vectorstore_filter(ret_config)
+    vs_filter = vectorstore_mod.get_vectorstore_filter(ret_config)
     retriever = vectorstore.as_retriever(
         search_kwargs={"k": ret_config["n_ret_docs"], "filter": vs_filter},
     )
@@ -457,44 +266,31 @@ def process_query(gen_config: dict, ret_config: dict, query: str):
     return response


-def display_retrieved_chunks(docs: list[Document], tag: str|None=None):
-    with st.container(border=True):
-        doc_grps = group_docs(docs)
-        if tag is None:
-            st.write(
-                "Retrieved Chunks\n\nleft click to expand, right click to follow links"
-            )
-        else:
-            st.write(
-                f"Retrieved Chunks ({tag})\n\nleft click to expand, right click to follow links"
-            )
-        for legis_id, doc_grp in doc_grps:
-            render_doc_grp(legis_id, doc_grp)
-
-
-def display_response(
-    response,
+def render_response(
+    response: dict,
     model_info: dict,
     provider: str,
     should_escape_markdown: bool,
     should_add_legis_urls: bool,
-    tag: str|None=None
+    tag: str | None = None,
 ):
-    out_display = response["aimessage"].content
+    response_text = response["aimessage"].content
     if should_escape_markdown:
-        out_display = escape_markdown(out_display)
+        response_text = utils_mod.escape_markdown(response_text)
     if should_add_legis_urls:
-        out_display = replace_legis_ids_with_urls(out_display)
+        response_text = utils_mod.replace_legis_ids_with_urls(response_text)

     with st.container(border=True):
         if tag is None:
             st.write("Response")
         else:
             st.write(f"Response ({tag})")
-        st.info(out_display)
+        st.info(response_text)

-    usage.display_api_usage(response, model_info, provider, tag=tag)
-    display_retrieved_chunks(response["docs"], tag=tag)
+    usage_mod.display_api_usage(
+        response["aimessage"].response_metadata, model_info, provider, tag=tag
+    )
+    doc_format_mod.render_retrieved_chunks(response["docs"], tag=tag)


 def render_query_rag_tab():
@@ -527,7 +323,7 @@ def render_query_rag_tab():

     if response := SS.get(rkey):
         model_info = PROVIDER_MODELS[gen_config["provider"]][gen_config["model_name"]]
-        display_response(
+        render_response(
             response,
             model_info,
             gen_config["provider"],
@@ -595,13 +391,13 @@ def render_query_rag_sbs_tab():
         model_info = PROVIDER_MODELS[gen_configs[post_key_prefix]["provider"]][
             gen_configs[post_key_prefix]["model_name"]
         ]
-        display_response(
+        render_response(
            response,
            model_info,
            gen_configs[post_key_prefix]["provider"],
            gen_configs[post_key_prefix]["should_escape_markdown"],
            gen_configs[post_key_prefix]["should_add_legis_urls"],
-           tag = grp_names[post_key_prefix],
+           tag=grp_names[post_key_prefix],
        )


@@ -611,7 +407,7 @@ def main():
     st.header("Query Congressional Bills")

     with st.sidebar:
-        render_sidebar()
+        sidebar_mod.render_sidebar()

     query_rag_tab, query_rag_sbs_tab, guide_tab = st.tabs(
         [
@@ -628,7 +424,7 @@
         render_query_rag_sbs_tab()

     with guide_tab:
-        render_guide()
+        guide_mod.render_guide()


 if __name__ == "__main__":
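
The refactor leaves the LCEL chain shape built by `create_rag_chain` intact: a `RunnableParallel` that fans out to the retriever and the raw query, followed by two `.assign` steps. A minimal sketch of that data flow, using `RunnableLambda` stand-ins for the real retriever and LLM (the stand-ins and sample query are assumptions, not app code):

```python
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough

# stand-ins (assumptions): the app uses a Pinecone retriever and a chat model
fake_retriever = RunnableLambda(lambda query: [f"doc matching {query!r}"])
fake_llm = RunnableLambda(lambda x: f"answer grounded in: {x['context']}")

chain = (
    RunnableParallel({"docs": fake_retriever, "query": RunnablePassthrough()})
    # same shape as create_rag_chain: add "context", then "aimessage"
    .assign(context=lambda x: "\n".join(x["docs"]))
    .assign(aimessage=fake_llm)
)

out = chain.invoke("renewable energy tax credits")
# out carries "docs", "query", "context", and "aimessage"; render_response
# reads out["aimessage"] (an AIMessage in the real app) and out["docs"]
print(out["aimessage"])
```

Keeping the full dict around (rather than piping only the prompt into the LLM) is what lets `render_response` show both the answer and the retrieved chunks from a single invocation result.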
doc_format_mod.py ADDED
@@ -0,0 +1,102 @@
+from collections import defaultdict
+import json
+
+from langchain.schema import Document
+import streamlit as st
+
+import utils_mod
+
+
+def group_docs(docs) -> list[tuple[str, list[Document]]]:
+    """Group and sort docs.
+
+    docs are grouped by legis_id
+    inside a legis_id group, the docs are sorted by start_index
+    overall the legis_id groups are sorted by number of docs (desc)
+
+    doc_grps = [
+        (legis_id, start_index sorted docs),  # group with the most docs
+        (legis_id, start_index sorted docs),
+        ...
+        (legis_id, start_index sorted docs),  # group with the least docs
+    ]
+    """
+    doc_grps = defaultdict(list)
+
+    # create legis_id groups
+    for doc in docs:
+        doc_grps[doc.metadata["legis_id"]].append(doc)
+
+    # sort docs in each group by start index
+    for legis_id in doc_grps.keys():
+        doc_grps[legis_id] = sorted(
+            doc_grps[legis_id],
+            key=lambda x: x.metadata["start_index"],
+        )
+
+    # sort groups by number of docs
+    doc_grps = sorted(
+        tuple(doc_grps.items()),
+        key=lambda x: -len(x[1]),
+    )
+
+    return doc_grps
+
+
+def format_docs(docs: list[Document]) -> str:
+    """JSON grouped"""
+
+    doc_grps = group_docs(docs)
+    out = []
+    for legis_id, doc_grp in doc_grps:
+        dd = {
+            "legis_id": doc_grp[0].metadata["legis_id"],
+            "title": doc_grp[0].metadata["title"],
+            "introduced_date": doc_grp[0].metadata["introduced_date"],
+            "sponsor": doc_grp[0].metadata["sponsor_full_name"],
+            "snippets": [doc.page_content for doc in doc_grp],
+        }
+        out.append(dd)
+    return json.dumps(out, indent=4)
+
+
+def render_doc_grp(legis_id: str, doc_grp: list[Document]):
+    first_doc = doc_grp[0]
+
+    congress_gov_url = utils_mod.get_congress_gov_url(
+        first_doc.metadata["congress_num"],
+        first_doc.metadata["legis_type"],
+        first_doc.metadata["legis_num"],
+    )
+    congress_gov_link = f"[congress.gov]({congress_gov_url})"
+
+    ref = "{} chunks from {}\n\n{}\n\n{}\n\n[{} ({}) ]({})".format(
+        len(doc_grp),
+        first_doc.metadata["legis_id"],
+        first_doc.metadata["title"],
+        congress_gov_link,
+        first_doc.metadata["sponsor_full_name"],
+        first_doc.metadata["sponsor_bioguide_id"],
+        utils_mod.get_sponsor_url(first_doc.metadata["sponsor_bioguide_id"]),
+    )
+    doc_contents = [
+        "[start_index={}] ".format(int(doc.metadata["start_index"])) + doc.page_content
+        for doc in doc_grp
+    ]
+    with st.expander(ref):
+        st.write(utils_mod.escape_markdown("\n\n...\n\n".join(doc_contents)))
+
+
+def render_retrieved_chunks(docs: list[Document], tag: str | None = None):
+    with st.container(border=True):
+        doc_grps = group_docs(docs)
+        if tag is None:
+            st.write(
+                "Retrieved Chunks\n\nleft click to expand, right click to follow links"
+            )
+        else:
+            st.write(
+                f"Retrieved Chunks ({tag})\n\nleft click to expand, right click to follow links"
+            )
+        for legis_id, doc_grp in doc_grps:
+            render_doc_grp(legis_id, doc_grp)
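
`group_docs` promises a specific ordering: the largest `legis_id` group first, and chunks within a group sorted by `start_index`. A small check of that behavior with toy `Document` metadata (ids and indexes are assumptions):

```python
from langchain.schema import Document
from doc_format_mod import group_docs

# toy chunks (metadata values are assumptions)
docs = [
    Document(page_content="b", metadata={"legis_id": "118-hr-1", "start_index": 500}),
    Document(page_content="a", metadata={"legis_id": "118-hr-1", "start_index": 0}),
    Document(page_content="c", metadata={"legis_id": "117-s-42", "start_index": 100}),
]

doc_grps = group_docs(docs)
assert doc_grps[0][0] == "118-hr-1"  # biggest group first
assert [d.metadata["start_index"] for d in doc_grps[0][1]] == [0, 500]  # chunk order
```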
guide_mod.py ADDED
@@ -0,0 +1,22 @@
+import streamlit as st
+
+
+def render_guide():
+
+    st.write(
+        """
+When you send a query to LegisQA, it will attempt to retrieve relevant content from the past six congresses ([113th-118th](https://en.wikipedia.org/wiki/List_of_United_States_Congresses)) covering 2013 to the present, pass it to a [large language model (LLM)](https://en.wikipedia.org/wiki/Large_language_model), and generate a response. This technique is known as Retrieval Augmented Generation (RAG). You can read [an academic paper](https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html) or [a high level summary](https://research.ibm.com/blog/retrieval-augmented-generation-RAG) to get more details. Once the response is generated, the retrieved content will be available for inspection with links to the bills and sponsors.
+
+
+## Disclaimer
+
+This is a research project. The RAG technique helps to ground the LLM response by providing context from a trusted source, but it does not guarantee a high quality response. We encourage you to play around, find questions that work and find questions that fail. There is a small monthly budget dedicated to the OpenAI endpoints. Once that is used up each month, queries will no longer work.
+
+
+## Config
+
+Use the `Generative Config` to change LLM parameters.
+Use the `Retrieval Config` to change the number of chunks retrieved from our congress corpus and to apply various filters to the content before it is retrieved (e.g. filter to a specific set of congresses). Use the `Prompt Config` to try out different document formatting and prompting strategies.
+
+"""
+    )
retriever_tools.py DELETED
@@ -1,79 +0,0 @@
-"""
-modified from https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/tools/retriever.py
-"""
-
-from functools import partial
-from typing import Callable
-from typing import Iterable
-from typing import Optional
-
-from langchain.schema import Document
-from langchain.tools import Tool
-from langchain_core.callbacks.manager import Callbacks
-from langchain_core.pydantic_v1 import BaseModel
-from langchain_core.pydantic_v1 import Field
-from langchain_core.retrievers import BaseRetriever
-
-
-class RetrieverInput(BaseModel):
-    """Input to the retriever."""
-    query: str = Field(description="query to look up in retriever")
-
-
-def _get_relevant_documents(
-    query: str,
-    retriever: BaseRetriever,
-    format_docs: Callable[[Iterable[Document]], str],
-    callbacks: Callbacks = None,
-) -> str:
-    docs = retriever.get_relevant_documents(query, callbacks=callbacks)
-    return format_docs(docs)
-
-
-async def _aget_relevant_documents(
-    query: str,
-    retriever: BaseRetriever,
-    format_docs: Callable[[Iterable[Document]], str],
-    callbacks: Callbacks = None,
-) -> str:
-    docs = await retriever.aget_relevant_documents(query, callbacks=callbacks)
-    return format_docs(docs)
-
-
-def get_retriever_tool(
-    retriever: BaseRetriever,
-    name: str,
-    description: str,
-    format_docs: Callable[[Iterable[Document]], str],
-) -> Tool:
-
-    """Create a tool to do retrieval of documents.
-
-    Args:
-        retriever: The retriever to use for the retrieval
-        name: The name for the tool. This will be passed to the language model,
-            so should be unique and somewhat descriptive.
-        description: The description for the tool. This will be passed to the language
-            model, so should be descriptive.
-        format_docs: A function to turn an iterable of docs into a string.
-
-    Returns:
-        Tool class to pass to an agent
-    """
-    func = partial(
-        _get_relevant_documents,
-        retriever=retriever,
-        format_docs=format_docs,
-    )
-    afunc = partial(
-        _aget_relevant_documents,
-        retriever=retriever,
-        format_docs=format_docs,
-    )
-    return Tool(
-        name=name,
-        description=description,
-        func=func,
-        coroutine=afunc,
-        args_schema=RetrieverInput,
-    )
sidebar_mod.py ADDED
@@ -0,0 +1,20 @@
+import streamlit as st
+
+
+def render_outreach_links():
+    nomic_base_url = "https://atlas.nomic.ai/data/gabrielhyperdemocracy"
+    nomic_map_name = "us-congressional-legislation-s1024o256nomic-1"
+    nomic_url = f"{nomic_base_url}/{nomic_map_name}/map"
+    hf_url = "https://huggingface.co/hyperdemocracy"
+    pc_url = "https://www.pinecone.io/blog/serverless"
+    together_url = "https://www.together.ai/"
+    st.subheader(":brain: About [hyperdemocracy](https://hyperdemocracy.us)")
+    st.subheader(f":world_map: Visualize [nomic atlas]({nomic_url})")
+    st.subheader(f":hugging_face: Raw [huggingface datasets]({hf_url})")
+    st.subheader(f":evergreen_tree: Index [pinecone serverless]({pc_url})")
+    st.subheader(f":pancakes: Inference [together.ai]({together_url})")
+
+
+def render_sidebar():
+    with st.container(border=True):
+        render_outreach_links()
usage.py → usage_mod.py RENAMED
@@ -1,9 +1,9 @@
 import streamlit as st


-def get_openai_token_usage(metadata: dict, model_info: dict):
-    input_tokens = metadata["token_usage"]["prompt_tokens"]
-    output_tokens = metadata["token_usage"]["completion_tokens"]
+def get_openai_token_usage(response_metadata: dict, model_info: dict):
+    input_tokens = response_metadata["token_usage"]["prompt_tokens"]
+    output_tokens = response_metadata["token_usage"]["completion_tokens"]
     cost = (
         input_tokens * 1e-6 * model_info["cost"]["pmi"]
         + output_tokens * 1e-6 * model_info["cost"]["pmo"]
@@ -15,9 +15,9 @@ def get_openai_token_usage(metadata: dict, model_info: dict):
     }


-def get_anthropic_token_usage(metadata: dict, model_info: dict):
-    input_tokens = metadata["usage"]["input_tokens"]
-    output_tokens = metadata["usage"]["output_tokens"]
+def get_anthropic_token_usage(response_metadata: dict, model_info: dict):
+    input_tokens = response_metadata["usage"]["input_tokens"]
+    output_tokens = response_metadata["usage"]["output_tokens"]
     cost = (
         input_tokens * 1e-6 * model_info["cost"]["pmi"]
         + output_tokens * 1e-6 * model_info["cost"]["pmo"]
@@ -29,9 +29,9 @@ def get_anthropic_token_usage(metadata: dict, model_info: dict):
     }


-def get_together_token_usage(metadata: dict, model_info: dict):
-    input_tokens = metadata["token_usage"]["prompt_tokens"]
-    output_tokens = metadata["token_usage"]["completion_tokens"]
+def get_together_token_usage(response_metadata: dict, model_info: dict):
+    input_tokens = response_metadata["token_usage"]["prompt_tokens"]
+    output_tokens = response_metadata["token_usage"]["completion_tokens"]
     cost = (
         input_tokens * 1e-6 * model_info["cost"]["pmi"]
         + output_tokens * 1e-6 * model_info["cost"]["pmo"]
@@ -43,27 +43,27 @@ def get_together_token_usage(metadata: dict, model_info: dict):
     }


-def get_token_usage(metadata: dict, model_info: dict, provider: str):
+def get_token_usage(response_metadata: dict, model_info: dict, provider: str):
     match provider:
         case "OpenAI":
-            return get_openai_token_usage(metadata, model_info)
+            return get_openai_token_usage(response_metadata, model_info)
         case "Anthropic":
-            return get_anthropic_token_usage(metadata, model_info)
+            return get_anthropic_token_usage(response_metadata, model_info)
         case "Together":
-            return get_together_token_usage(metadata, model_info)
+            return get_together_token_usage(response_metadata, model_info)
         case _:
             raise ValueError()


-def display_api_usage(response, model_info, provider: str, tag: str|None=None):
+def display_api_usage(
+    response_metadata: dict, model_info: dict, provider: str, tag: str | None = None
+):
     with st.container(border=True):
         if tag is None:
             st.write("API Usage")
         else:
             st.write(f"API Usage ({tag})")
-        token_usage = get_token_usage(
-            response["aimessage"].response_metadata, model_info, provider
-        )
+        token_usage = get_token_usage(response_metadata, model_info, provider)
         col1, col2, col3 = st.columns(3)
         with col1:
             st.metric("Input Tokens", token_usage["input_tokens"])
@@ -72,4 +72,4 @@ def display_api_usage(response, model_info, provider: str, tag: str|None=None):
     with col3:
         st.metric("Cost", f"${token_usage['cost']:.4f}")
     with st.expander("Response Metadata"):
-        st.warning(response["aimessage"].response_metadata)
+        st.warning(response_metadata)
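
The `pmi`/`pmo` rates in `model_info` are dollars per million input/output tokens, hence the `1e-6` scaling in each `get_*_token_usage` helper. A worked example using the gpt-4o-mini rates defined in `app.py` (the token counts are made up):

```python
# gpt-4o-mini rates from OPENAI_CHAT_MODELS in app.py
model_info = {"cost": {"pmi": 0.15, "pmo": 0.60}}  # $ per million input / output tokens
input_tokens, output_tokens = 12_000, 800  # assumed counts

cost = (
    input_tokens * 1e-6 * model_info["cost"]["pmi"]
    + output_tokens * 1e-6 * model_info["cost"]["pmo"]
)
print(f"${cost:.4f}")  # $0.0023 = $0.0018 input + $0.0005 output
```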
utils_mod.py ADDED
@@ -0,0 +1,47 @@
+import re
+
+
+CONGRESS_GOV_TYPE_MAP = {
+    "hconres": "house-concurrent-resolution",
+    "hjres": "house-joint-resolution",
+    "hr": "house-bill",
+    "hres": "house-resolution",
+    "s": "senate-bill",
+    "sconres": "senate-concurrent-resolution",
+    "sjres": "senate-joint-resolution",
+    "sres": "senate-resolution",
+}
+
+
+def escape_markdown(text: str) -> str:
+    MD_SPECIAL_CHARS = r"\`*_{}[]()#+-.!$"
+    for char in MD_SPECIAL_CHARS:
+        text = text.replace(char, "\\" + char)
+    return text
+
+
+def get_sponsor_url(bioguide_id: str) -> str:
+    return f"https://bioguide.congress.gov/search/bio/{bioguide_id}"
+
+
+def get_congress_gov_url(congress_num: int, legis_type: str, legis_num: int) -> str:
+    lt = CONGRESS_GOV_TYPE_MAP[legis_type]
+    return f"https://www.congress.gov/bill/{int(congress_num)}th-congress/{lt}/{int(legis_num)}"
+
+
+def legis_id_to_link(legis_id: str) -> str:
+    congress_num, legis_type, legis_num = legis_id.split("-")
+    return get_congress_gov_url(congress_num, legis_type, legis_num)
+
+
+def legis_id_match_to_link(matchobj):
+    mstring = matchobj.string[matchobj.start() : matchobj.end()]
+    url = legis_id_to_link(mstring)
+    link = f"[{mstring}]({url})"
+    return link
+
+
+def replace_legis_ids_with_urls(text: str) -> str:
+    pattern = "11[345678]-[a-z]+-\d{1,5}"
+    rtext = re.sub(pattern, legis_id_match_to_link, text)
+    return rtext
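
Note that `replace_legis_ids_with_urls` only linkifies ids for the 113th-118th congresses. A minimal sketch of the round trip (the sample ids are assumptions):

```python
from utils_mod import replace_legis_ids_with_urls

text = "See 118-hr-1234 and 119-hr-1 for details."
print(replace_legis_ids_with_urls(text))
# See [118-hr-1234](https://www.congress.gov/bill/118th-congress/house-bill/1234)
# and 119-hr-1 for details.  (119-hr-1 falls outside 11[345678] and is left alone)
```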
vectorstore_mod.py ADDED
@@ -0,0 +1,46 @@
+import streamlit as st
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_pinecone import PineconeVectorStore
+from langchain_community.vectorstores.utils import DistanceStrategy
+
+
+def load_bge_embeddings():
+    model_name = "BAAI/bge-small-en-v1.5"
+    model_kwargs = {"device": "cpu"}
+    encode_kwargs = {"normalize_embeddings": True}
+    emb_fn = HuggingFaceBgeEmbeddings(
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs,
+        query_instruction="Represent this question for searching relevant passages: ",
+    )
+    return emb_fn
+
+
+def load_pinecone_vectorstore():
+    emb_fn = load_bge_embeddings()
+    vectorstore = PineconeVectorStore(
+        embedding=emb_fn,
+        text_key="text",
+        distance_strategy=DistanceStrategy.COSINE,
+        pinecone_api_key=st.secrets["pinecone_api_key"],
+        index_name=st.secrets["pinecone_index_name"],
+    )
+    return vectorstore
+
+
+def get_vectorstore_filter(ret_config: dict) -> dict:
+    vs_filter = {}
+    if ret_config["filter_legis_id"] != "":
+        vs_filter["legis_id"] = ret_config["filter_legis_id"]
+    if ret_config["filter_bioguide_id"] != "":
+        vs_filter["sponsor_bioguide_id"] = ret_config["filter_bioguide_id"]
+    vs_filter = {
+        **vs_filter,
+        "congress_num": {"$in": ret_config["filter_congress_nums"]},
+    }
+    vs_filter = {
+        **vs_filter,
+        "sponsor_party": {"$in": ret_config["filter_sponsor_parties"]},
+    }
+    return vs_filter
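
`get_vectorstore_filter` always applies the congress and party `$in` clauses (Pinecone metadata-filter syntax) and adds the id filters only when they are non-empty strings. A sketch of the resulting filter dict for a hypothetical `ret_config` (all values are assumptions):

```python
from vectorstore_mod import get_vectorstore_filter

ret_config = {
    "filter_legis_id": "",            # empty -> omitted from the filter
    "filter_bioguide_id": "B001135",  # non-empty -> exact-match clause
    "filter_congress_nums": [117, 118],
    "filter_sponsor_parties": ["D", "R"],
}

print(get_vectorstore_filter(ret_config))
# {'sponsor_bioguide_id': 'B001135',
#  'congress_num': {'$in': [117, 118]},
#  'sponsor_party': {'$in': ['D', 'R']}}
```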