meg-huggingface committed on
Commit
14e5c2a
1 Parent(s): 1a4c18a

Try..except catching for errors

Browse files
Files changed (2) hide show
  1. app.py +48 -18
  2. data_measurements/streamlit_utils.py +67 -64
app.py CHANGED
@@ -150,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
150
  mkdir(CACHE_DIR)
151
  if use_cache:
152
  logs.warning("Using cache")
153
- dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
154
- # Don't recalculate; we're live
155
- dstats.set_deployment(True)
156
- # We need to have the text_dset loaded for further load_or_prepare
157
- dstats.load_or_prepare_dataset()
158
- # Header widget
159
- dstats.load_or_prepare_dset_peek()
160
- # General stats widget
161
- dstats.load_or_prepare_general_stats()
162
- # Labels widget
163
- dstats.load_or_prepare_labels()
164
- # Text lengths widget
165
- dstats.load_or_prepare_text_lengths()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  if show_embeddings:
167
- # Embeddings widget
168
- dstats.load_or_prepare_embeddings()
169
- dstats.load_or_prepare_text_duplicates()
170
- dstats.load_or_prepare_npmi()
171
- dstats.load_or_prepare_zipf()
 
 
 
 
 
 
 
 
 
 
 
 
172
  return dstats
173
 
174
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
 
150
  mkdir(CACHE_DIR)
151
  if use_cache:
152
  logs.warning("Using cache")
153
+ try:
154
+ dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
155
+ # Don't recalculate; we're live
156
+ dstats.set_deployment(True)
157
+ except:
158
+ logs.warning("We're screwed")
159
+ try:
160
+ # We need to have the text_dset loaded for further load_or_prepare
161
+ dstats.load_or_prepare_dataset()
162
+ except:
163
+ logs.warning("Missing a cache for load or prepare dataset")
164
+ try:
165
+ # Header widget
166
+ dstats.load_or_prepare_dset_peek()
167
+ except:
168
+ logs.warning("Missing a cache for dset peek")
169
+ try:
170
+ # General stats widget
171
+ dstats.load_or_prepare_general_stats()
172
+ except:
173
+ logs.warning("Missing a cache for general stats")
174
+ try:
175
+ # Labels widget
176
+ dstats.load_or_prepare_labels()
177
+ except:
178
+ logs.warning("Missing a cache for prepare labels")
179
+ try:
180
+ # Text lengths widget
181
+ dstats.load_or_prepare_text_lengths()
182
+ except:
183
+ logs.warning("Missing a cache for text lengths")
184
  if show_embeddings:
185
+ try:
186
+ # Embeddings widget
187
+ dstats.load_or_prepare_embeddings()
188
+ except:
189
+ logs.warning("Missing a cache for embeddings")
190
+ try:
191
+ dstats.load_or_prepare_text_duplicates()
192
+ except:
193
+ logs.warning("Missing a cache for text duplicates")
194
+ try:
195
+ dstats.load_or_prepare_npmi()
196
+ except:
197
+ logs.warning("Missing a cache for npmi")
198
+ try:
199
+ dstats.load_or_prepare_zipf()
200
+ except:
201
+ logs.warning("Missing a cache for zipf")
202
  return dstats
203
 
204
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
data_measurements/streamlit_utils.py CHANGED
@@ -319,72 +319,75 @@ def expander_npmi_description(min_vocab):
319
 
320
  ### Finally, show Zipf stuff
321
  def expander_zipf(z, zipf_fig, column_id):
322
- _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
323
- natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
324
- calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
325
-
326
- powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
327
- zipf_summary = (
328
- "The optimal alpha based on this dataset is: **"
329
- + str(round(z.alpha, 2))
330
- + "**, with a KS distance of: **"
331
- + str(round(z.distance, 2))
332
- )
333
- zipf_summary += (
334
- "**. This was fit with a minimum rank value of: **"
335
- + str(int(z.xmin))
336
- + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
337
- )
338
-
339
- alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
340
- xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
341
- fit_results_table = pd.DataFrame.from_dict(
342
- {
343
- r"Alpha:": [str("%.2f" % z.alpha)],
344
- "KS distance:": [str("%.2f" % z.distance)],
345
- "Min rank:": [str("%s" % int(z.xmin))],
346
- },
347
- columns=["Results"],
348
- orient="index",
349
- )
350
- fit_results_table.index.name = column_id
351
  with st.expander(
352
  f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
353
  ):
354
- st.caption(
355
- "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
356
- )
357
- st.markdown(_ZIPF_CAPTION)
358
- st.write(
359
- """
360
- A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
361
- with an ideal α value of 1."""
362
- )
363
- st.markdown(
364
- "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
365
- )
366
- st.markdown(
367
- "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
368
- )
369
- st.markdown("-----")
370
- st.write("### Here is your dataset's Zipf results:")
371
- st.dataframe(fit_results_table)
372
- st.write(zipf_summary)
373
- # TODO: Nice UI version of the content in the comments.
374
- # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
375
- # if z.ks_test.pvalue < 0.01:
376
- # st.markdown(
377
- # "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
378
- # else:
379
- # st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
380
- # st.markdown("Checking the goodness of fit of our observed distribution")
381
- # st.markdown("to the hypothesized power law distribution")
382
- # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
383
- st.plotly_chart(zipf_fig, use_container_width=True)
384
- if z.alpha > 2:
385
- st.markdown(alpha_warning)
386
- if z.xmin > 5:
387
- st.markdown(xmin_warning)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
 
390
  ### Finally finally finally, show nPMI stuff.
@@ -427,7 +430,7 @@ def npmi_widget(npmi_stats, min_vocab, column_id):
427
 
428
  def npmi_show(paired_results):
429
  if paired_results.empty:
430
- st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
431
  else:
432
  s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
433
  # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
 
319
 
320
  ### Finally, show Zipf stuff
321
  def expander_zipf(z, zipf_fig, column_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  with st.expander(
323
  f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
324
  ):
325
+ try:
326
+ _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
327
+ natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
328
+ calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
329
+
330
+ powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
331
+ zipf_summary = (
332
+ "The optimal alpha based on this dataset is: **"
333
+ + str(round(z.alpha, 2))
334
+ + "**, with a KS distance of: **"
335
+ + str(round(z.distance, 2))
336
+ )
337
+ zipf_summary += (
338
+ "**. This was fit with a minimum rank value of: **"
339
+ + str(int(z.xmin))
340
+ + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
341
+ )
342
+
343
+ alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
344
+ xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
345
+ fit_results_table = pd.DataFrame.from_dict(
346
+ {
347
+ r"Alpha:": [str("%.2f" % z.alpha)],
348
+ "KS distance:": [str("%.2f" % z.distance)],
349
+ "Min rank:": [str("%s" % int(z.xmin))],
350
+ },
351
+ columns=["Results"],
352
+ orient="index",
353
+ )
354
+ fit_results_table.index.name = column_id
355
+ st.caption(
356
+ "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
357
+ )
358
+ st.markdown(_ZIPF_CAPTION)
359
+ st.write(
360
+ """
361
+ A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
362
+ with an ideal α value of 1."""
363
+ )
364
+ st.markdown(
365
+ "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
366
+ )
367
+ st.markdown(
368
+ "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
369
+ )
370
+ st.markdown("-----")
371
+ st.write("### Here is your dataset's Zipf results:")
372
+ st.dataframe(fit_results_table)
373
+ st.write(zipf_summary)
374
+ # TODO: Nice UI version of the content in the comments.
375
+ # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
376
+ # if z.ks_test.pvalue < 0.01:
377
+ # st.markdown(
378
+ # "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
379
+ # else:
380
+ # st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
381
+ # st.markdown("Checking the goodness of fit of our observed distribution")
382
+ # st.markdown("to the hypothesized power law distribution")
383
+ # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
384
+ st.plotly_chart(zipf_fig, use_container_width=True)
385
+ if z.alpha > 2:
386
+ st.markdown(alpha_warning)
387
+ if z.xmin > 5:
388
+ st.markdown(xmin_warning)
389
+ except:
390
+ st.write("Under construction!")
391
 
392
 
393
  ### Finally finally finally, show nPMI stuff.
 
430
 
431
  def npmi_show(paired_results):
432
  if paired_results.empty:
433
+ st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
434
  else:
435
  s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
436
  # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])