gabrielaltay committed on
Commit
723ae91
1 Parent(s): 2029299
Files changed (1) hide show
  1. app.py +162 -98
app.py CHANGED
@@ -1,11 +1,13 @@
1
  from collections import defaultdict
2
  import json
 
3
 
4
  from langchain_core.documents import Document
5
  from langchain_core.prompts import PromptTemplate
6
  from langchain_core.runnables import RunnableParallel
7
  from langchain_core.runnables import RunnablePassthrough
8
  from langchain_core.output_parsers import StrOutputParser
 
9
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
10
  from langchain_community.vectorstores.utils import DistanceStrategy
11
  from langchain_openai import ChatOpenAI
@@ -19,6 +21,7 @@ SS = st.session_state
19
 
20
  SEED = 292764
21
  CONGRESS_NUMBERS = [113, 114, 115, 116, 117, 118]
 
22
  CONGRESS_GOV_TYPE_MAP = {
23
  "hconres": "house-concurrent-resolution",
24
  "hjres": "house-joint-resolution",
@@ -29,7 +32,6 @@ CONGRESS_GOV_TYPE_MAP = {
29
  "sjres": "senate-joint-resolution",
30
  "sres": "senate-resolution",
31
  }
32
-
33
  OPENAI_CHAT_MODELS = [
34
  "gpt-3.5-turbo-0125",
35
  "gpt-4-0125-preview",
@@ -115,6 +117,7 @@ def write_outreach_links():
115
  st.subheader(f":hugging_face: Raw [huggingface datasets]({hf_url})")
116
  st.subheader(f":evergreen_tree: Index [pinecone serverless]({pc_url})")
117
 
 
118
  def group_docs(docs) -> list[tuple[str, list[Document]]]:
119
  doc_grps = defaultdict(list)
120
 
@@ -219,15 +222,96 @@ def escape_markdown(text):
219
  return text
220
 
221
 
222
- st.title(":classical_building: LegisQA :classical_building:")
223
- st.header("Explore Congressional Legislation")
224
- st.write(
225
- """When you send a query to LegisQA, it will attempt to retrieve relevant content from the past six congresses ([113th-118th](https://en.wikipedia.org/wiki/List_of_United_States_Congresses)) covering 2013 to the present, pass it to a [large language model (LLM)](https://en.wikipedia.org/wiki/Large_language_model), and generate a response. This technique is known as Retrieval Augmented Generation (RAG). You can read [an academic paper](https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html) or [a high level summary](https://research.ibm.com/blog/retrieval-augmented-generation-RAG) to get more details. Once the response is generated, the retrieved content will be available for inspection with links to the bills and sponsors.
226
- This technique helps to ground the LLM response by providing context from a trusted source, but it does not guarantee a high quality response. We encourage you to play around. Try different models. Find questions that work and find questions that fail.""")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
- st.header("Example Queries")
 
 
229
 
230
- st.write("""
231
  ```
232
  What are the themes around artificial intelligence?
233
  ```
@@ -239,8 +323,15 @@ Write a well cited 3 paragraph essay on food insecurity.
239
  ```
240
  Create a table summarizing the major climate change ideas with columns legis_id, title, idea.
241
  ```
242
- """
243
- )
 
 
 
 
 
 
 
244
 
245
 
246
  with st.sidebar:
@@ -249,6 +340,7 @@ with st.sidebar:
249
  write_outreach_links()
250
 
251
  st.checkbox("escape markdown in answer", key="response_escape_markdown")
 
252
 
253
  with st.expander("Generative Config"):
254
  st.selectbox(label="model name", options=OPENAI_CHAT_MODELS, key="model_name")
@@ -261,20 +353,24 @@ with st.sidebar:
261
  st.slider(
262
  "Number of chunks to retrieve",
263
  min_value=1,
264
- max_value=40,
265
- value=10,
266
  key="n_ret_docs",
267
  )
268
  st.text_input("Bill ID (e.g. 118-s-2293)", key="filter_legis_id")
269
  st.text_input("Bioguide ID (e.g. R000595)", key="filter_bioguide_id")
270
- # st.text_input("Congress (e.g. 118)", key="filter_congress_num")
271
  st.multiselect(
272
  "Congress Numbers",
273
  CONGRESS_NUMBERS,
274
  default=CONGRESS_NUMBERS,
275
  key="filter_congress_nums",
276
  )
277
-
 
 
 
 
 
278
 
279
  with st.expander("Prompt Config"):
280
  st.selectbox(
@@ -297,97 +393,65 @@ llm = ChatOpenAI(
297
  openai_api_key=st.secrets["openai_api_key"],
298
  model_kwargs={"top_p": SS["top_p"], "seed": SEED},
299
  )
300
-
301
  vectorstore = load_pinecone_vectorstore()
302
  format_docs = DOC_FORMATTERS[SS["prompt_version"]]
 
303
 
304
- with st.form("my_form"):
305
- st.text_area("Enter query:", key="query")
306
- query_submitted = st.form_submit_button("Submit")
307
 
308
-
309
- def get_vectorstore_filter():
310
- vs_filter = {}
311
- if SS["filter_legis_id"] != "":
312
- vs_filter["legis_id"] = SS["filter_legis_id"]
313
- if SS["filter_bioguide_id"] != "":
314
- vs_filter["sponsor_bioguide_id"] = SS["filter_bioguide_id"]
315
- # if SS["filter_congress_num"] != "":
316
- # vs_filter["congress_num"] = int(SS["filter_congress_num"])
317
- vs_filter = {"congress_num": {"$in": SS["filter_congress_nums"]}}
318
- return vs_filter
319
 
320
 
321
- if query_submitted:
322
 
323
- vs_filter = get_vectorstore_filter()
324
- with st.sidebar:
325
- with st.expander("Debug vs_filter"):
326
- st.write(vs_filter)
327
- retriever = vectorstore.as_retriever(
328
- search_kwargs={"k": SS["n_ret_docs"], "filter": vs_filter},
329
- )
330
- prompt = PromptTemplate.from_template(SS["prompt_template"])
331
- rag_chain_from_docs = (
332
- RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
333
- | prompt
334
- | llm
335
- | StrOutputParser()
336
- )
337
- rag_chain_with_source = RunnableParallel(
338
- {"context": retriever, "question": RunnablePassthrough()}
339
- ).assign(answer=rag_chain_from_docs)
340
- out = rag_chain_with_source.invoke(SS["query"])
341
- SS["out"] = out
342
 
 
 
 
343
 
344
- def write_doc_grp(legis_id: str, doc_grp: list[Document]):
345
- first_doc = doc_grp[0]
346
-
347
- congress_gov_url = get_congress_gov_url(
348
- first_doc.metadata["congress_num"],
349
- first_doc.metadata["legis_type"],
350
- first_doc.metadata["legis_num"],
351
- )
352
- congress_gov_link = f"[congress.gov]({congress_gov_url})"
353
-
354
- gov_track_url = get_govtrack_url(
355
- first_doc.metadata["congress_num"],
356
- first_doc.metadata["legis_type"],
357
- first_doc.metadata["legis_num"],
358
- )
359
- gov_track_link = f"[govtrack.us]({gov_track_url})"
360
-
361
- ref = "{} chunks from {}\n\n{}\n\n{} | {}\n\n[{} ({}) ]({})".format(
362
- len(doc_grp),
363
- first_doc.metadata["legis_id"],
364
- first_doc.metadata["title"],
365
- congress_gov_link,
366
- gov_track_link,
367
- first_doc.metadata["sponsor_full_name"],
368
- first_doc.metadata["sponsor_bioguide_id"],
369
- get_sponsor_url(first_doc.metadata["sponsor_bioguide_id"]),
370
- )
371
- doc_contents = [
372
- "[start_index={}] ".format(int(doc.metadata["start_index"])) + doc.page_content
373
- for doc in doc_grp
374
- ]
375
- with st.expander(ref):
376
- st.write(escape_markdown("\n\n...\n\n".join(doc_contents)))
377
-
378
-
379
- out = SS.get("out")
380
- if out:
381
-
382
- if SS["response_escape_markdown"]:
383
- st.info(escape_markdown(out["answer"]))
384
- else:
385
- st.info(out["answer"])
386
-
387
- doc_grps = group_docs(out["context"])
388
- for legis_id, doc_grp in doc_grps:
389
- write_doc_grp(legis_id, doc_grp)
390
-
391
- with st.expander("Debug doc format"):
392
- st.text_area("formatted docs", value=format_docs(out["context"]), height=600)
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from collections import defaultdict
2
  import json
3
+ import re
4
 
5
  from langchain_core.documents import Document
6
  from langchain_core.prompts import PromptTemplate
7
  from langchain_core.runnables import RunnableParallel
8
  from langchain_core.runnables import RunnablePassthrough
9
  from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_community.callbacks import get_openai_callback
11
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
12
  from langchain_community.vectorstores.utils import DistanceStrategy
13
  from langchain_openai import ChatOpenAI
 
21
 
22
  SEED = 292764
23
  CONGRESS_NUMBERS = [113, 114, 115, 116, 117, 118]
24
+ SPONSOR_PARTIES = ["D", "R", "L", "I"]
25
  CONGRESS_GOV_TYPE_MAP = {
26
  "hconres": "house-concurrent-resolution",
27
  "hjres": "house-joint-resolution",
 
32
  "sjres": "senate-joint-resolution",
33
  "sres": "senate-resolution",
34
  }
 
35
  OPENAI_CHAT_MODELS = [
36
  "gpt-3.5-turbo-0125",
37
  "gpt-4-0125-preview",
 
117
  st.subheader(f":hugging_face: Raw [huggingface datasets]({hf_url})")
118
  st.subheader(f":evergreen_tree: Index [pinecone serverless]({pc_url})")
119
 
120
+
121
  def group_docs(docs) -> list[tuple[str, list[Document]]]:
122
  doc_grps = defaultdict(list)
123
 
 
222
  return text
223
 
224
 
225
def get_vectorstore_filter():
    """Assemble the Pinecone metadata filter from sidebar session state.

    The congress-number and sponsor-party ``$in`` clauses are always
    present; the bill-id and bioguide-id exact matches are added only
    when the corresponding text inputs are non-empty.
    """
    vs_filter = {
        "congress_num": {"$in": SS["filter_congress_nums"]},
        "sponsor_party": {"$in": SS["filter_sponsor_parties"]},
    }
    # st.text_input always yields a str, so truthiness == (value != "")
    if SS["filter_legis_id"]:
        vs_filter["legis_id"] = SS["filter_legis_id"]
    if SS["filter_bioguide_id"]:
        vs_filter["sponsor_bioguide_id"] = SS["filter_bioguide_id"]
    return vs_filter
234
+
235
+
236
def write_doc_grp(legis_id: str, doc_grp: list[Document]):
    """Render one bill's group of retrieved chunks as a streamlit expander.

    The expander label shows the chunk count, bill id, title, a
    congress.gov link, and a sponsor link; the body shows the chunk
    texts separated by ellipses.

    Args:
        legis_id: bill identifier shared by the group (the same value is
            read from the first doc's metadata).
        doc_grp: retrieved chunks belonging to the same bill.
    """
    first_doc = doc_grp[0]

    congress_gov_url = get_congress_gov_url(
        first_doc.metadata["congress_num"],
        first_doc.metadata["legis_type"],
        first_doc.metadata["legis_num"],
    )
    congress_gov_link = f"[congress.gov]({congress_gov_url})"

    # NOTE: the govtrack.us link is no longer part of the label format
    # string, so the previously dead gov_track_url/gov_track_link
    # computation has been removed.
    ref = "{} chunks from {}\n\n{}\n\n{}\n\n[{} ({}) ]({})".format(
        len(doc_grp),
        first_doc.metadata["legis_id"],
        first_doc.metadata["title"],
        congress_gov_link,
        first_doc.metadata["sponsor_full_name"],
        first_doc.metadata["sponsor_bioguide_id"],
        get_sponsor_url(first_doc.metadata["sponsor_bioguide_id"]),
    )
    doc_contents = [
        "[start_index={}] ".format(int(doc.metadata["start_index"])) + doc.page_content
        for doc in doc_grp
    ]
    with st.expander(ref):
        st.write(escape_markdown("\n\n...\n\n".join(doc_contents)))
268
+
269
+
270
def legis_id_to_link(legis_id: str) -> str:
    """Convert a ``<congress>-<type>-<num>`` bill id into a congress.gov URL."""
    cnum, ltype, lnum = legis_id.split("-")
    return get_congress_gov_url(cnum, ltype, lnum)
273
+
274
+
275
def legis_id_match_to_link(matchobj):
    """``re.sub`` callback: wrap a matched legis_id in a markdown link.

    Args:
        matchobj: an ``re.Match`` whose full match is a legis_id string.

    Returns:
        A markdown link ``[legis_id](congress.gov url)``.
    """
    # Match.group(0) is the idiomatic way to get the full matched text,
    # instead of slicing matchobj.string[matchobj.start():matchobj.end()].
    mstring = matchobj.group(0)
    url = legis_id_to_link(mstring)
    return f"[{mstring}]({url})"
280
+
281
+
282
def replace_legis_ids_with_urls(text):
    """Linkify bill ids (congresses 113-118) found in *text*.

    Matches patterns like ``118-s-2293`` and replaces each with a
    markdown link to its congress.gov page.

    Args:
        text: markdown/plain text, typically the LLM answer.

    Returns:
        The text with every matched legis_id replaced by a markdown link.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+, error in the future).
    pattern = r"11[345678]-[a-z]+-\d{1,5}"
    return re.sub(pattern, legis_id_match_to_link, text)
286
+
287
+
288
def write_guide():
    """Render the 'guide' tab: RAG overview, disclaimer, and sidebar help."""
    # Markdown kept flush-left inside the literal so headers render
    # correctly — NOTE(review): exact original string indentation is not
    # recoverable from the diff view; confirm against the deployed app.
    guide_md = """
When you send a query to LegisQA, it will attempt to retrieve relevant content from the past six congresses ([113th-118th](https://en.wikipedia.org/wiki/List_of_United_States_Congresses)) covering 2013 to the present, pass it to a [large language model (LLM)](https://en.wikipedia.org/wiki/Large_language_model), and generate a response. This technique is known as Retrieval Augmented Generation (RAG). You can read [an academic paper](https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html) or [a high level summary](https://research.ibm.com/blog/retrieval-augmented-generation-RAG) to get more details. Once the response is generated, the retrieved content will be available for inspection with links to the bills and sponsors.


## Disclaimer

This is a research project. The RAG technique helps to ground the LLM response by providing context from a trusted source, but it does not guarantee a high quality response. We encourage you to play around, find questions that work and find questions that fail. There is a small monthly budget dedicated to the OpenAI endpoints. Once that is used up each month, queries will no longer work.


## Sidebar Config

Use the `Generative Config` to change LLM parameters.
Use the `Retrieval Config` to change the number of chunks retrieved from our congress corpus and to apply various filters to the content before it is retrieved (e.g. filter to a specific set of congresses). Use the `Prompt Config` to try out different document formatting and prompting strategies.

"""
    st.write(guide_md)
307
+
308
+
309
+ def write_example_queries():
310
 
311
+ with st.expander("Example Queries"):
312
+ st.write(
313
+ """
314
 
 
315
  ```
316
  What are the themes around artificial intelligence?
317
  ```
 
323
  ```
324
  Create a table summarizing the major climate change ideas with columns legis_id, title, idea.
325
  ```
326
+
327
+ """
328
+ )
329
+
330
+
331
+ ##################
332
+
333
+
334
+ st.title(":classical_building: LegisQA :classical_building:")
335
 
336
 
337
  with st.sidebar:
 
340
  write_outreach_links()
341
 
342
  st.checkbox("escape markdown in answer", key="response_escape_markdown")
343
+ st.checkbox("add legis urls in answer", value=True, key="response_add_legis_urls")
344
 
345
  with st.expander("Generative Config"):
346
  st.selectbox(label="model name", options=OPENAI_CHAT_MODELS, key="model_name")
 
353
  st.slider(
354
  "Number of chunks to retrieve",
355
  min_value=1,
356
+ max_value=32,
357
+ value=8,
358
  key="n_ret_docs",
359
  )
360
  st.text_input("Bill ID (e.g. 118-s-2293)", key="filter_legis_id")
361
  st.text_input("Bioguide ID (e.g. R000595)", key="filter_bioguide_id")
 
362
  st.multiselect(
363
  "Congress Numbers",
364
  CONGRESS_NUMBERS,
365
  default=CONGRESS_NUMBERS,
366
  key="filter_congress_nums",
367
  )
368
+ st.multiselect(
369
+ "Sponsor Party",
370
+ SPONSOR_PARTIES,
371
+ default=SPONSOR_PARTIES,
372
+ key="filter_sponsor_parties",
373
+ )
374
 
375
  with st.expander("Prompt Config"):
376
  st.selectbox(
 
393
  openai_api_key=st.secrets["openai_api_key"],
394
  model_kwargs={"top_p": SS["top_p"], "seed": SEED},
395
  )
 
396
  vectorstore = load_pinecone_vectorstore()
397
  format_docs = DOC_FORMATTERS[SS["prompt_version"]]
398
+ vs_filter = get_vectorstore_filter()
399
 
400
+ query_tab, guide_tab = st.tabs(["query", "guide"])
 
 
401
 
402
+ with guide_tab:
403
+ write_guide()
 
 
 
 
 
 
 
 
 
404
 
405
 
406
+ with query_tab:
407
 
408
+ write_example_queries()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
+ with st.form("my_form"):
411
+ st.text_area("Enter query:", key="query")
412
+ query_submitted = st.form_submit_button("Submit")
413
 
414
+ if query_submitted:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
+ retriever = vectorstore.as_retriever(
417
+ search_kwargs={"k": SS["n_ret_docs"], "filter": vs_filter},
418
+ )
419
+ prompt = PromptTemplate.from_template(SS["prompt_template"])
420
+ rag_chain_from_docs = (
421
+ RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
422
+ | prompt
423
+ | llm
424
+ | StrOutputParser()
425
+ )
426
+ rag_chain_with_source = RunnableParallel(
427
+ {"context": retriever, "question": RunnablePassthrough()}
428
+ ).assign(answer=rag_chain_from_docs)
429
+
430
+ with get_openai_callback() as cb:
431
+ SS["out"] = rag_chain_with_source.invoke(SS["query"])
432
+ SS["cb"] = cb
433
+
434
+ if "out" in SS:
435
+
436
+ out_display = SS["out"]["answer"]
437
+ if SS["response_escape_markdown"]:
438
+ out_display = escape_markdown(out_display)
439
+ if SS["response_add_legis_urls"]:
440
+ out_display = replace_legis_ids_with_urls(out_display)
441
+ with st.container(border=True):
442
+ st.write("Response")
443
+ st.info(out_display)
444
+ with st.container(border=True):
445
+ st.write("API Usage")
446
+ st.warning(SS["cb"])
447
+
448
+ with st.container(border=True):
449
+ doc_grps = group_docs(SS["out"]["context"])
450
+ st.write(
451
+ "Retrieved Chunks (note that you may need to 'right click' on links in the expanders to follow them)"
452
+ )
453
+ for legis_id, doc_grp in doc_grps:
454
+ write_doc_grp(legis_id, doc_grp)
455
+
456
+ # with st.expander("Debug doc format"):
457
+ # st.text_area("formatted docs", value=format_docs(SS["out"]["context"]), height=600)