nazneen committed on
Commit
fcd4a61
•
1 Parent(s): 1b6a73c

datapoints explorer app

Browse files
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
  title: Datapoints Explorer
3
- emoji: ⚡
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: streamlit
7
- sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Datapoints Explorer
3
+ emoji: πŸ“
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: streamlit
7
+ sdk_version: 1.9.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## LIBRARIES ###
2
+ ## Data
3
+ import pandas as pd
4
+ pd.options.display.float_format = '${:,.2f}'.format
5
+
6
+ # Analysis
7
+
8
+ # App & Visualization
9
+ import streamlit as st
10
+ from bokeh.models import CustomJS, ColumnDataSource, TextInput, DataTable, TableColumn
11
+ from bokeh.plotting import figure
12
+ from bokeh.transform import factor_cmap
13
+ from bokeh.palettes import Category20c_20
14
+ from bokeh.layouts import column, row
15
+
16
+ # utils
17
+
18
def datasets_explorer_viz(df):
    """Render the interactive datapoints plot, search box and selection table.

    Builds a Bokeh scatter plot from ``df``, colours the points by
    ``dataset_id``, and wires two browser-side (CustomJS) callbacks:

    * selecting points on the plot copies their ``text`` and ``dataset_id``
      into a data table shown to the right of the plot;
    * typing in the search box overlays, in red, every point whose
      ``dataset_id`` or ``text`` contains the search term (case-sensitive).

    The finished layout is handed to ``st.bokeh_chart``.

    Args:
        df: DataFrame with the columns ``x``, ``y``, ``dataset_id`` and
            ``text`` (all four are referenced by the glyphs, tooltips and
            callbacks below).

    Returns:
        None. The chart is written to the Streamlit page as a side effect.
    """
    s = ColumnDataSource(df)
    TOOLTIPS = [("dataset_id", "@dataset_id"), ("text", "@text")]
    color = factor_cmap(
        'dataset_id',
        palette=Category20c_20,
        factors=df['dataset_id'].unique(),
    )

    p = figure(
        plot_width=1000,
        plot_height=800,
        tools="hover,wheel_zoom,pan,box_select",
        tooltips=TOOLTIPS,
        toolbar_location="above",
    )
    p.scatter(
        'x', 'y',
        size=5,
        source=s,
        alpha=0.8,
        marker='circle',
        fill_color=color,
        line_color=color,
        legend_field='dataset_id',
    )
    p.legend.location = "bottom_right"
    p.legend.click_policy = "mute"
    p.legend.label_text_font_size = "8pt"

    # Seed both sources with the columns their consumers reference, so the
    # table columns and the red overlay glyph never point at column names
    # that do not exist yet (the callbacks only fill them in later).
    table_source = ColumnDataSource(data=dict(dataset_id=[], text=[]))
    selection_source = ColumnDataSource(
        data=dict(x=[], y=[], dataset_id=[], text=[])
    )

    columns = [
        TableColumn(field="dataset_id", title="Dataset ID"),
        TableColumn(field="text", title="Text"),
    ]
    data_table = DataTable(source=table_source, columns=columns, width=800)

    # Overlay glyph for search hits; it stays empty until a search is run.
    p.circle('x', 'y', source=selection_source, size=5, color='red')

    # Plot selection -> table: copy the selected rows into the table source.
    s.selected.js_on_change('indices', CustomJS(args=dict(umap_source=s, table_source=table_source), code="""
        const inds = cb_obj.indices;
        const tableData = table_source.data;
        const umapData = umap_source.data;

        tableData['text'] = [];
        tableData['dataset_id'] = [];

        for (let i = 0; i < inds.length; i++) {
            tableData['text'].push(umapData['text'][inds[i]]);
            tableData['dataset_id'].push(umapData['dataset_id'][inds[i]]);
        }
        table_source.data = tableData;
        table_source.change.emit();
        """
    ))

    text_input = TextInput(value="", title="Search")

    # Search box -> red overlay: collect every point matching the query.
    text_input.js_on_change('value', CustomJS(args=dict(plot_source=s, selection_source=selection_source), code="""
        const plot_data = plot_source.data;
        const selectData = selection_source.data;
        const value = cb_obj.value;

        selectData['x'] = [];
        selectData['y'] = [];
        selectData['dataset_id'] = [];
        selectData['text'] = [];

        // Every string includes the empty string, so an empty query would
        // mark ALL points as hits. Skip the scan instead, which simply
        // clears the previous highlights.
        if (value.length > 0) {
            for (let i = 0; i < plot_data['dataset_id'].length; i++) {
                if (plot_data['dataset_id'][i].includes(value) || plot_data['text'][i].includes(value)) {
                    selectData['x'].push(plot_data['x'][i]);
                    selectData['y'].push(plot_data['y'][i]);
                    selectData['dataset_id'].push(plot_data['dataset_id'][i]);
                    selectData['text'].push(plot_data['text'][i]);
                }
            }
        }
        selection_source.change.emit();
        """))

    # Search box stacked above the plot, selection table to the right.
    st.bokeh_chart(row(column(text_input, p), data_table))
77
+
78
+
79
if __name__ == "__main__":
    ### STREAMLIT APP CONFIG ###
    st.set_page_config(layout="wide", page_title="Datapoints Explorer")
    st.title('Interactive Datapoints Explorer for Text Classification')

    ### USAGE HELP ###
    # NOTE(review): the original indentation was lost; the four help bullets
    # are assumed to be the expander's body and everything after it to sit
    # outside the expander -- confirm against the real app.py.
    with st.expander("How to interact with the plot:"):
        st.markdown("* Each point in the plot represents an example from the HF hub text classification datasets.")
        st.markdown("* The datapoints are embedded using sentence embeddings of their `text` field.")
        st.markdown("* You can either search for a datapoint or drag and select to peek into the cluster content.")
        st.markdown("* If the term you are searching for matches `dataset_id` or `text` it will be highlighted in *red*. The selected points will be summarized as a dataframe on the right.")

    ### LOAD DATA AND RENDER ###
    # The parquet file is read relative to the current working directory.
    datasets_df = pd.read_parquet('./assets/data/datapoints_embeddings.parquet')
    st.warning("Hugging Face 🤗 Datapoints Explorer for Text Classification")
    datasets_explorer_viz(datasets_df)
assets/data/datapoints_embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7285649e64341de3b1bc7261f093be42ea7119fe1e079d7ae849115f834b9fd9
3
+ size 1667528
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bokeh==2.4.1
2
+ Jinja2==3.1.2
3
+ PyYAML==6.0
4
+ numpy==1.22.4
5
+ packaging==21.3
6
+ Pillow==9.1.1
7
+ tornado==6.1
8
+ typing_extensions==4.2.0
9
+ MarkupSafe==2.1.1
10
+ pyparsing==3.0.9
11
+ pandas==1.4.2
12
+ streamlit==1.2.0