meg-huggingface committed on
Commit
db74ba9
1 Parent(s): 0803ab3

Scripts to generate cache

Browse files
Files changed (2) hide show
  1. run.sh +112 -0
  2. run_data_measurements.py +8 -6
run.sh ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+
4
+ python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --label_field="label" --feature="text"
5
+ python3 run_data_measurements.py --dataset="hate_speech_offensive" --config="default" --split="train" --label_field="label" --feature="tweet"
6
+
7
+
8
+ python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
9
+ python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="unsupervised" --label_field="label" --feature="text"
10
+
11
+
12
+ python3 run_data_measurements.py --dataset="glue" --config="cola" --split="train" --label_field="label" --feature="sentence"
13
+ python3 run_data_measurements.py --dataset="glue" --config="cola" --split="validation" --label_field="label" --feature="sentence"
14
+
15
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="hypothesis"
16
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="premise"
17
+
18
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="premise"
19
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="hypothesis"
20
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="premise"
21
+ python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="hypothesis"
22
+
23
+
24
+ python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence1"
25
+ python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence2"
26
+ python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence1"
27
+ python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence2"
28
+
29
+
30
+ python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence1"
31
+ python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence2"
32
+ python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence1"
33
+ python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence2"
34
+
35
+
36
+ python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence1"
37
+ python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence2"
38
+ python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence1"
39
+ python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence2"
40
+
41
+ python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence1"
42
+ python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence2"
43
+ python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence1"
44
+ python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence2"
45
+
46
+ python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="train" --label_field="label" --feature="sentence"
47
+ python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="validation" --label_field="label" --feature="sentence"
48
+
49
+
50
+ python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="question"
51
+ python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="sentence"
52
+ python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="question"
53
+ python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="sentence"
54
+
55
+
56
+ python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question1"
57
+ python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question2"
58
+ python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question1"
59
+ python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question2"
60
+
61
+ python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="hypothesis"
62
+ python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="premise"
63
+ python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="hypothesis"
64
+ python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="premise"
65
+
66
+
67
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="train" --feature="text"
68
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="train" --feature="text"
69
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="train" --feature="text"
70
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="train" --feature="text"
71
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="validation" --feature="text"
72
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="validation" --feature="text"
73
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="validation" --feature="text"
74
+ python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="validation" --feature="text"
75
+
76
+
77
+ # SuperGLUE. Open question: should wsc, wic, rte, record and multirc be covered as well?
78
+
79
+ python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="question"
80
+ python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="question"
81
+ python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="passage"
82
+ python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="passage"
83
+
84
+ python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="premise"
85
+ python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="premise"
86
+ python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="hypothesis"
87
+ python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="hypothesis"
88
+
89
+
90
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="premise"
91
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="premise"
92
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice1"
93
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice1"
94
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice2"
95
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice2"
96
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="question"
97
+ python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="question"
98
+
99
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="context"
100
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="question"
101
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="title"
102
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="context"
103
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="question"
104
+ python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="title"
105
+
106
+
107
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="context"
108
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="question"
109
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="title"
110
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="context"
111
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="question"
112
+ python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="title"
run_data_measurements.py CHANGED
@@ -25,7 +25,11 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
25
  # General stats widget
26
  dstats.load_or_prepare_general_stats()
27
  # Labels widget
28
- dstats.load_or_prepare_labels()
 
 
 
 
29
  # Text lengths widget
30
  dstats.load_or_prepare_text_lengths()
31
  if show_embeddings:
@@ -76,9 +80,10 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
76
  print("Figure saved to %s." % fig_tok_length_fid)
77
  print("Done!")
78
 
79
- if (all and dstats.label_field) or dataset_args["calculation"] == "labels":
80
  if not dstats.label_field:
81
- print("Warning: You asked for label calculation, but didn't provide the labels field name. Assuming it is 'label'...")
 
82
  dstats.set_label_field("label")
83
  print("\n* Calculating label distribution.")
84
  dstats.load_or_prepare_labels()
@@ -188,9 +193,6 @@ def main():
188
  Example for hate speech18 dataset:
189
  python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"
190
 
191
- Example for Glue dataset:
192
- python3 run_data_measurements.py --dataset="glue" --config="ax" --split="train" --feature="premise"
193
-
194
  Example for IMDB dataset:
195
  python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
196
  """
25
  # General stats widget
26
  dstats.load_or_prepare_general_stats()
27
  # Labels widget
28
+ try:
29
+ dstats.set_label_field("label")
30
+ dstats.load_or_prepare_labels()
31
+ except:
32
+ pass
33
  # Text lengths widget
34
  dstats.load_or_prepare_text_lengths()
35
  if show_embeddings:
80
  print("Figure saved to %s." % fig_tok_length_fid)
81
  print("Done!")
82
 
83
+ if all or dataset_args["calculation"] == "labels":
84
  if not dstats.label_field:
85
+ print("Warning: You asked for label calculation, but didn't provide "
86
+ "the labels field name. Assuming it is 'label'...")
87
  dstats.set_label_field("label")
88
  print("\n* Calculating label distribution.")
89
  dstats.load_or_prepare_labels()
193
  Example for hate speech18 dataset:
194
  python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"
195
 
 
 
 
196
  Example for IMDB dataset:
197
  python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
198
  """