Spaces:

nazneen
/

datapoints-explorer

Runtime error

App Files Files Community

nazneen committed on Sep 17, 2022

Commit

fcd4a61

1 Parent(s): 1b6a73c

datapoints explorer app

Browse files

Files changed (4) hide show

README.md +4 -5
app.py +94 -0
assets/data/datapoints_embeddings.parquet +3 -0
requirements.txt +12 -0

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
 title: Datapoints Explorer
-emoji: ⚡
-colorFrom: green
-colorTo: indigo
 sdk: streamlit
-sdk_version: 1.10.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Datapoints Explorer
+emoji: 	📐
+colorFrom: red
+colorTo: purple
 sdk: streamlit
+sdk_version: 1.9.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+## LIBRARIES ###
+## Data
+import pandas as pd
+# Format floats in pandas displays with a leading "$", thousands separators and two decimals (e.g. $1,234.50).
+pd.options.display.float_format = '${:,.2f}'.format
+# Analysis
+# App & Visualization
+import streamlit as st
+from bokeh.models import CustomJS, ColumnDataSource, TextInput, DataTable, TableColumn
+from bokeh.plotting import figure
+from bokeh.transform import factor_cmap
+from bokeh.palettes import Category20c_20
+from bokeh.layouts import column, row
+# utils
+def datasets_explorer_viz(df):
+    s = ColumnDataSource(df)
+    TOOLTIPS= [("dataset_id", "@dataset_id"), ("text", "@text")]
+    color = factor_cmap('dataset_id', palette=Category20c_20, factors=df['dataset_id'].unique())
+    p = figure(plot_width=1000, plot_height=800, tools="hover,wheel_zoom,pan,box_select", tooltips=TOOLTIPS, toolbar_location="above")
+    p.scatter('x', 'y', size=5, source=s, alpha=0.8,marker='circle',fill_color = color, line_color=color, legend_field = 'dataset_id')
+    p.legend.location = "bottom_right"
+    p.legend.click_policy="mute"
+    p.legend.label_text_font_size="8pt"
+    table_source = ColumnDataSource(data=dict())
+    selection_source = ColumnDataSource(data=dict())
+    columns = [
+        # TableColumn(field="x", title="X data"),
+        # TableColumn(field="y", title="Y data"),
+        TableColumn(field="dataset_id", title="Dataset ID"),
+        TableColumn(field="text", title="Text"),
+    ]
+    data_table = DataTable(source=table_source, columns=columns, width=800)
+    p.circle('x', 'y',source=selection_source, size=5, color= 'red')
+    s.selected.js_on_change('indices', CustomJS(args=dict(umap_source=s, table_source=table_source), code="""
+            const inds = cb_obj.indices;
+            const tableData = table_source.data;
+            const umapData = umap_source.data;
+            tableData['text'] = []
+            tableData['dataset_id'] = []
+            for (let i = 0; i < inds.length; i++) {
+                tableData['text'].push(umapData['text'][inds[i]])
+                tableData['dataset_id'].push(umapData['dataset_id'][inds[i]])
+            }
+            table_source.data = tableData;
+            table_source.change.emit();
+    """
+    ))
+    text_input = TextInput(value="", title="Search")
+    text_input.js_on_change('value', CustomJS(args=dict(plot_source=s, selection_source=selection_source), code="""
+        const plot_data = plot_source.data;
+        const selectData = selection_source.data
+        const value = cb_obj.value
+        selectData['x'] = []
+        selectData['y'] = []
+        selectData['dataset_id'] = []
+        selectData['text'] = []
+        for (var i = 0; i < plot_data['dataset_id'].length; i++) {
+            if (plot_data['dataset_id'][i].includes(value) || plot_data['text'][i].includes(value)) {
+                selectData['x'].push(plot_data['x'][i])
+                selectData['y'].push(plot_data['y'][i])
+                selectData['dataset_id'].push(plot_data['dataset_id'][i])
+                selectData['text'].push(plot_data['text'][i])
+            }
+        }
+        selection_source.change.emit()
+    """))
+    st.bokeh_chart(row(column(text_input,p), data_table))
+if __name__ == "__main__":
+    ### STREAMLIT APP CONGFIG ###
+    st.set_page_config(layout="wide", page_title="Datapoints Explorer")
+    st.title('Interactive Datapoints Explorer for Text Classification')
+    #lcol, rcol = st.columns([2, 2])
+    # ******* loading the mode and the data
+    ### LOAD DATA AND SESSION VARIABLES ###
+    with st.expander("How to interact with the plot:"):
+        st.markdown("* Each point in the plot represents an example from the HF hub text classification datasets.")
+        st.markdown("* The datapoints are emebdded using sentence embeddings of their `text` field.")
+        st.markdown("* You can either search for a datapoint or drag and select to peek into the cluster content.")
+        st.markdown("* If the term you are searching for matches `dataset_id` or `text` it will be highlighted in *red*. The selected points will be summarized as a dataframe on the right.")
+    datasets_df = pd.read_parquet('./assets/data/datapoints_embeddings.parquet')
+    st.warning("Hugging Face 🤗  Datapoints Explorer for Text Classification")
+    datasets_explorer_viz(datasets_df)

assets/data/datapoints_embeddings.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7285649e64341de3b1bc7261f093be42ea7119fe1e079d7ae849115f834b9fd9
+size 1667528

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+bokeh==2.4.1
+Jinja2==3.1.2
+PyYAML==6.0
+numpy==1.22.4
+packaging==21.3
+Pillow==9.1.1
+tornado==6.1
+typing_extensions==4.2.0
+MarkupSafe==2.1.1
+pyparsing==3.0.9
+pandas==1.4.2
+streamlit==1.2.0