nazneen committed on
Commit
3d8ee64
·
1 Parent(s): 58f2ab9

adding parquets

Browse files
app.py CHANGED
@@ -109,7 +109,7 @@ def quant_panel(embedding_df):
109
  st.markdown("* Each **point** is an input example.")
110
  st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
111
  st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
112
- st.altair_chart(data_comparison(down_samp(embedding_df)))
113
 
114
 
115
  def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
@@ -156,15 +156,11 @@ def get_data(spotlight, emb):
156
 
157
  @st.cache(ttl=600)
158
  def clustering(data,num_clusters):
159
-
160
  X = np.array(data['embedding'].tolist())
161
-
162
  kclusterer = KMeansClusterer(
163
  num_clusters, distance=nltk.cluster.util.cosine_distance,
164
  repeats=25,avoid_empty_clusters=True)
165
-
166
  assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
167
-
168
  data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
169
  data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
170
 
@@ -222,22 +218,18 @@ if __name__ == "__main__":
222
  # ******* loading the mode and the data
223
  dataset = st.sidebar.selectbox(
224
  "Dataset",
225
- ["amazon_polarity", "squad", "movielens", "waterbirds"],
226
- index=0
227
  )
228
 
229
- tokenizer = AutoTokenizer.from_pretrained(
230
- "distilbert-base-uncased-finetuned-sst-2-english")
231
-
232
  model = st.sidebar.selectbox(
233
  "Model",
234
  ["distilbert-base-uncased-finetuned-sst-2-english",
235
- "distilbert-base-uncased-finetuned-sst-2-english"],
236
- index=0
237
  )
238
 
239
  loss_quantile = st.sidebar.slider(
240
- "Loss Quantile", min_value=0.0, max_value=1.0,step=0.01,value=0.95
241
  )
242
 
243
  run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
@@ -245,10 +237,11 @@ if __name__ == "__main__":
245
  num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
246
 
247
  ### LOAD DATA AND SESSION VARIABLES ###
248
- data = pd.read_parquet('./assets/data/amazon_polarity.test.parquet')
249
- embedding_umap = data[['x','y']]
250
- emb_df = pd.read_parquet('./assets/data/amazon_test_emb.parquet')
251
- data_df = pd.DataFrame([data['content'], data['label'], data['pred'], data['loss'], emb_df['embedding'], data['x'], data['y']]).transpose()
 
252
  if "user_data" not in st.session_state:
253
  st.session_state["user_data"] = data_df
254
  if "selected_slice" not in st.session_state:
 
109
  st.markdown("* Each **point** is an input example.")
110
  st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
111
  st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
112
+ st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
113
 
114
 
115
  def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
 
156
 
157
  @st.cache(ttl=600)
158
  def clustering(data,num_clusters):
 
159
  X = np.array(data['embedding'].tolist())
 
160
  kclusterer = KMeansClusterer(
161
  num_clusters, distance=nltk.cluster.util.cosine_distance,
162
  repeats=25,avoid_empty_clusters=True)
 
163
  assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
 
164
  data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
165
  data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
166
 
 
218
  # ******* loading the mode and the data
219
  dataset = st.sidebar.selectbox(
220
  "Dataset",
221
+ ["amazon_polarity", "yelp_polarity"],
222
+ index = 1
223
  )
224
 
 
 
 
225
  model = st.sidebar.selectbox(
226
  "Model",
227
  ["distilbert-base-uncased-finetuned-sst-2-english",
228
+ "albert-base-v2-yelp-polarity"],
 
229
  )
230
 
231
  loss_quantile = st.sidebar.slider(
232
+ "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
233
  )
234
 
235
  run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
 
237
  num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
238
 
239
  ### LOAD DATA AND SESSION VARIABLES ###
240
+ data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
241
+ if model == 'albert-base-v2-yelp-polarity':
242
+ tokenizer = AutoTokenizer.from_pretrained('textattack/'+model)
243
+ else:
244
+ tokenizer = AutoTokenizer.from_pretrained(model)
245
  if "user_data" not in st.session_state:
246
  st.session_state["user_data"] = data_df
247
  if "selected_slice" not in st.session_state:
assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce0297bedc66865c01644421ea934008d74807befb7b0bd94aa92729bd02a59
3
+ size 56644779
assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a193c26851f48b7b76a35986ced0dc1fddafd26b92f1aaf9a4e69fd83fd2f2e4
3
+ size 56643545
assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a56147880841c6f78a868fb58f6e97661547009e570c2887ef7c12ffd54474e
3
+ size 103294569
assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165515be2837df9b02f782fe1e7bd3b31bb01c49960e73238f77541eee7589ad
3
+ size 61796202