Spaces:
Sleeping
Sleeping
from haystack import Document | |
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore | |
from haystack.nodes import BM25Retriever | |
from haystack.pipelines import DocumentSearchPipeline | |
import pandas as pd | |
import panel as pn | |
import param | |
pn.extension('tabulator') | |
pn.extension(sizing_mode="scale_both") | |
import hvplot.pandas | |
# load data | |
infile = "/Users/carolanderson/Dropbox/repos/miscellany/webapps/Agency Inventory AI Usage - Sheet1.tsv" | |
df = pd.read_csv(infile, sep="\t", lineterminator='\n') | |
# rearrange column order | |
col_list = ['Agency', 'Name of Inventory Item', | |
'Primary Type of AI', | |
'Purpose of AI', 'Length of Usage', | |
'Does it directly impact the public?', | |
'Vendor System', | |
'Description of Inventory Item', | |
'Other Notes\r'] | |
df = df[col_list] | |
# remove trailing \r from 'Other Notes' header | |
df = df.rename(columns = {'Other Notes\r' : 'Other Notes'}) | |
# remove trailing spaces from agency names (caused duplicate instance of "DOC") | |
df['Agency'] = df['Agency'].apply(lambda x : x.rstrip()) | |
# columns not useful for filtering | |
no_filter_cols = ['Name of Inventory Item', 'Description of Inventory Item', "Other Notes"] | |
# columns to be used for filtering | |
filter_cols = [c for c in df.columns.unique() if c not in no_filter_cols] | |
# column selector for main plot | |
plot_column_selector = pn.widgets.Select(options=filter_cols, name="Plot category: ") | |
# agency selector for main plot | |
plot_agency_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df["Agency"].unique()), | |
value=["Select all"], | |
name="Optional - filter by agency") | |
# selectors below are all for interactive dataframe | |
agency_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df["Agency"].unique()), | |
value=["Select all"], | |
name="Agency") | |
type_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df['Primary Type of AI'].unique()), | |
value=["Select all"], | |
name='Primary Type of AI') | |
purpose_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df["Purpose of AI"].unique()), | |
value=["Select all"], | |
name="Purpose of AI") | |
length_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df['Length of Usage'].unique()), | |
value=["Select all"], | |
name="Length of Usage") | |
impact_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df['Does it directly impact the public?'].unique()), | |
value=["Select all"], | |
name='Does it directly impact the public?') | |
vendor_selector = pn.widgets.MultiSelect(options=["Select all"] + list(df['Vendor System'].unique()), | |
value=["Select all"], | |
name='Vendor System') | |
row_filters = [agency_selector, type_selector, purpose_selector, length_selector, impact_selector, | |
vendor_selector] | |
def custom_plot(table, column_selector, agency_selector): | |
if "Select all" not in agency_selector: | |
table = table[table['Agency'].isin(agency_selector)] | |
table = table[column_selector].value_counts().sort_values(ascending=True) | |
return table.hvplot.barh(width=600, height=400, color="#336BCC") | |
def custom_table_filter(table, | |
agency_selector, | |
type_selector, | |
purpose_selector, | |
length_selector, | |
impact_selector, | |
vendor_selector): | |
""" | |
This repetitive approach was the only way I could get things working with a | |
'Select all' menu option. | |
""" | |
if "Select all" not in agency_selector: | |
table = table[table["Agency"].isin(agency_selector)] | |
if "Select all" not in type_selector: | |
table = table[table['Primary Type of AI'].isin(type_selector)] | |
if "Select all" not in purpose_selector: | |
table = table[table["Purpose of AI"].isin(purpose_selector)] | |
if "Select all" not in length_selector: | |
table = table[table['Length of Usage'].isin(length_selector)] | |
if "Select all" not in impact_selector: | |
table = table[table['Does it directly impact the public?'].isin(impact_selector)] | |
if "Select all" not in vendor_selector: | |
table = table[table['Vendor System'].isin(vendor_selector)] | |
return table | |
custom_table = pn.widgets.Tabulator(df, pagination="local", page_size=350, layout="fit_data", | |
width=800, height=550) | |
custom_table.add_filter(pn.bind(custom_table_filter, | |
agency_selector=agency_selector, | |
type_selector=type_selector, | |
purpose_selector=purpose_selector, | |
length_selector=length_selector, | |
impact_selector=impact_selector, | |
vendor_selector=vendor_selector)) | |
interactive_plot = pn.bind(custom_plot, table=df, column_selector=plot_column_selector, | |
agency_selector=plot_agency_selector) | |
overview_stacked = pn.Column( | |
pn.pane.Markdown(""" | |
Plot shows the total count of entries, aggregated by various categories. | |
Change the category with the dropdown menu. | |
The total number of records in the database is 337, but some fields have missing values. | |
In particular, 'Vendor System' and 'Primary Type of AI' were not always filled out."""), | |
pn.Column(pn.Row(plot_column_selector, | |
plot_agency_selector), | |
pn.Row(interactive_plot, width=500)) | |
) | |
overview_card = pn.Card(overview_stacked, header="# Overview of the data") | |
filename, button = custom_table.download_menu( | |
text_kwargs={'name': 'Enter filename ending in .csv or .json', 'value': 'default.csv'}, | |
button_kwargs={'name': 'Download table'} | |
) | |
download_card = pn.Card(pn.pane.Markdown(""" | |
Download current table in .csv or .json format. | |
File format will be automatically selected based on the file extension. | |
"""), | |
filename, button, header="### Download") | |
table_card = pn.Card( | |
pn.Row( | |
pn.Column( | |
pn.pane.Markdown(""" | |
### Filter with the menus below | |
"""),pn.WidgetBox(*row_filters), | |
styles=dict(background='#DDE6FF') | |
), pn.Column(pn.pane.Markdown(""" | |
### Scroll horizontally and vertically to see all data | |
"""), custom_table)), | |
download_card, | |
header="# Explore the data" | |
) | |
# stacked bar plot of impact by agency (static plot) | |
impact_counts = df.groupby('Agency')['Does it directly impact the public?'].value_counts() | |
impact_counts = impact_counts.sort_index(level="Agency", ascending=False) | |
impact_count_df = pd.DataFrame(impact_counts).rename(columns={'Does it directly impact the public?' : "Count"}) | |
impact_plot = impact_count_df.hvplot.barh(stacked=True, width=500, height=400, color=[ "#019C6D", "#336BCC", "#F41903",], legend="bottom_right") | |
impact_card = pn.Card( | |
pn.Column( | |
pn.pane.Markdown(""" | |
Number of systems with no, indirect, or direct impact on the public. | |
These judgements were made by Anna Blue and are unique to her report."""), | |
impact_plot), header="# Impact on the public, by agency") | |
# keyword search | |
class TableIndices(param.Parameterized): | |
row_indices = param.List() | |
col_indices = param.List() | |
def __call__(self): | |
return (self.row_indices, self.col_indices) | |
def run_search(text, pipeline): | |
if text == "": | |
return None | |
res = pipeline.run(query=text, params={"Retriever": {"top_k": 10}}) | |
relevant_results = [r for r in res['documents'] if r.score > 0.5] | |
result_rows = [doc.meta['index'] for doc in relevant_results] | |
result_cols = [doc.meta['column_header'] for doc in relevant_results] | |
table_indices = TableIndices(row_indices=result_rows, col_indices=result_cols) | |
return table_indices | |
def produce_table(df, table_indices): | |
if not table_indices: | |
return None | |
result_df = df.iloc[table_indices.row_indices, :] | |
result_df = result_df.drop_duplicates() | |
color_df = result_df.copy() | |
color_df.loc[:,:] = '' | |
for row, col in zip(table_indices.row_indices, table_indices.col_indices): | |
color_df.loc[row, col] = 'background-color: yellow' | |
result_tab = pn.widgets.Tabulator(result_df,pagination="local", page_size=350, layout="fit_data", | |
width=800, height=300) | |
# cell coloration is working, but does not update properly unless empty search is run in between; | |
# otherwise it re-uses the most recent color scheme; maybe related to https://github.com/holoviz/panel/issues/3363 | |
# result_tab.style.apply(lambda x: color_df, axis=None) | |
# giving up for now | |
return result_tab | |
def make_search_pane(result_tab): | |
if not result_tab: | |
return None | |
filename_2, button_2 = result_tab.download_menu( | |
text_kwargs={'name': 'Enter filename ending in .csv or .json', 'value': 'default.csv'}, | |
button_kwargs={'name': 'Download search results'}) | |
search_download_card = pn.Card(pn.pane.Markdown(""" | |
Download search results in .csv or .json format. | |
File format will be automatically selected based on the file extension."""), | |
filename_2, button_2, header="### Download") | |
search_result = pn.Column(pn.pane.Markdown(""" | |
### Scroll horizontally and vertically (if needed) to see everything. | |
"""), result_tab, search_download_card) | |
return search_result | |
# which columns to search | |
col_list = ['Name of Inventory Item', | |
'Primary Type of AI', | |
'Purpose of AI', | |
'Description of Inventory Item', | |
'Other Notes'] | |
# create document store, where each string from any of the relevant columns is a doc | |
# save the row index as metadata | |
docs = [] | |
indices = list(df.index.values) | |
for col in col_list: | |
values = df[col].tolist() | |
assert len(indices) == len(values) | |
for i, val in zip(indices, values): | |
dictionary = {'content' : val, | |
'meta' : {"index": i, "column_header" : col} | |
} | |
docs.append(Document.from_dict(dictionary)) | |
document_store = InMemoryDocumentStore(use_bm25=True) | |
document_store.write_documents(docs) | |
retriever = BM25Retriever(document_store=document_store) | |
pipeline = DocumentSearchPipeline(retriever) | |
text_input = pn.widgets.TextInput(name='Search', placeholder='Enter text here...') | |
result_indices = pn.bind(run_search, text=text_input, pipeline=pipeline) | |
result_table = pn.bind(produce_table, df=df, table_indices=result_indices) | |
result_pane = pn.bind(make_search_pane, result_tab=result_table) | |
search_card = pn.Card( | |
pn.Column( | |
pn.Row( | |
text_input, | |
pn.pane.Markdown(""" | |
This will search text in the following columns: | |
* Name of Inventory Item | |
* Primary Type of AI | |
* Purpose of AI | |
* Description of Inventory Item | |
* Other Notes | |
This is a keyword search based on the BM25 algorithm as implemented in the Haystack python library. | |
""")), | |
pn.Row(result_pane), | |
), | |
header="# Search the text" | |
) | |
main_text = """ | |
The data visualized here come from a report by Anna Blue, a Social Impact Fellow | |
at the Responsible AI Institute. The report was released in May 2023. Some agencies have | |
released updated inventories since then, which are not reflected here. | |
Anna's report consolidated data released by individual government agencies in compliance with | |
Executive Order 13960, which requires federal agencies to produce an annual inventory of their AI usage. | |
See her [blog post](https://www.responsible.ai/post/federal-government-ai-use-cases) for additional details, | |
including links to the original data sources. | |
""" | |
template = pn.template.FastListTemplate( | |
title='U.S. Government Use of AI', | |
main=[pn.pane.Markdown(main_text), | |
pn.Row(overview_card,impact_card), | |
pn.Row(table_card), | |
pn.Row(search_card)], | |
accent_base_color="#FFDAC2", | |
header_background="#0037A2") | |
template.servable() | |