w32zhong commited on
Commit
f5b3bba
1 Parent(s): f75f323

update model

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ events.out.tfevents.1631471189.blg4302.int.ets1.calculquebec.ca.240020.0 filter=lfs diff=lfs merge=lfs -text
2
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"$pi$": 30522, "$vert$": 30523, "$alpha$": 30524, "$minus$": 30525, "$p$": 30526, "$frac$": 30527, "$q$": 30528, "$lt$": 30529, "$1$": 30530, "$($": 30531, "$2$": 30532, "$supscript$": 30533, "$)$": 30534, "$varepsilon$": 30535, "$x$": 30536, "$subscript$": 30537, "${$": 30538, "$n$": 30539, "$}$": 30540, "$v$": 30541, "$a$": 30542, "$r$": 30543, "$rightarrow$": 30544, "$infty$": 30545, "$omega$": 30546, "$equal$": 30547, "$[$": 30548, "$comma$": 30549, "$]$": 30550, "$\\begin{cases}$": 30551, "$column$": 30552, "$0$": 30553, "$le$": 30554, "$row$": 30555, "$ast$": 30556, "$\\end{cases}$": 30557, "$qquad$": 30558, "$in$": 30559, "$e$": 30560, "$to$": 30561, "$l$": 30562, "$phi$": 30563, "$colon$": 30564, "$z$": 30565, "$i$": 30566, "$5$": 30567, "$plus$": 30568, "$cong$": 30569, "$b$": 30570, "$mod$": 30571, "$ker$": 30572, "$mapsto$": 30573, "$equiv$": 30574, "$subset$": 30575, "$f$": 30576, "$y$": 30577, "$sin$": 30578, "$times$": 30579, "$cos$": 30580, "$t$": 30581, "$float$": 30582, "$fact$": 30583, "$3$": 30584, "$dots$": 30585, "$k$": 30586, "$sum$": 30587, "$gt$": 30588, "$\\left($": 30589, "$\\right)$": 30590, "$m$": 30591, "$ldots$": 30592, "$9$": 30593, "$leftarrow$": 30594, "$prime$": 30595, "$h$": 30596, "$approx$": 30597, "$root$": 30598, "$pm$": 30599, "$c$": 30600, "$d$": 30601, "$beta$": 30602, "$ge$": 30603, "$g$": 30604, "$epsilon$": 30605, "$j$": 30606, "$u$": 30607, "$aleph$": 30608, "$neq$": 30609, "$cdots$": 30610, "$lambda$": 30611, "$mu$": 30612, "$nu$": 30613, "$geq$": 30614, "$\\{$": 30615, "$\\}$": 30616, "$s$": 30617, "$subseteq$": 30618, "$max$": 30619, "$kappa$": 30620, "$oplus$": 30621, "$dim$": 30622, "$prod$": 30623, "$8$": 30624, "$somenum$": 30625, "$circ$": 30626, "$4$": 30627, "$7$": 30628, "$forall$": 30629, "$ne$": 30630, "$o$": 30631, "$w$": 30632, "$arg$": 30633, "$exp$": 30634, "$6$": 30635, "$\\begin{pmatrix}$": 30636, "$\\end{pmatrix}$": 30637, "$theta$": 30638, "$tan$": 30639, "$semicolon$": 30640, "$quad$": 30641, "$newline$": 30642, "$ni$": 30643, "$cup$": 30644, "$varpi$": 30645, "$vee$": 30646, "$langle$": 30647, "$rangle$": 30648, "$delta$": 30649, "$gg$": 30650, "$log$": 30651, "$\\lceil$": 30652, "$\\rceil$": 30653, "$\\begin{array}$": 30654, "$\\end{array}$": 30655, "$pr$": 30656, "$bigcap$": 30657, "$sim$": 30658, "$lim$": 30659, "$\\left [$": 30660, "$\\right ]$": 30661, "$\\left ($": 30662, "$\\right )$": 30663, "$longmapsto$": 30664, "$rvect$": 30665, "$rho$": 30666, "$int$": 30667, "$nabla$": 30668, "$sigma$": 30669, "$cap$": 30670, "$iff$": 30671, "$\\over$": 30672, "$wedge$": 30673, "$bigoplus$": 30674, "$otimes$": 30675, "$partial$": 30676, "$\\begin{bmatrix}$": 30677, "$\\end{bmatrix}$": 30678, "$\\left\\langle$": 30679, "$\\right\\rangle$": 30680, "$\\left\\$": 30681, "$\\right\\$": 30682, "$\\left|$": 30683, "$\\right|$": 30684, "$chi$": 30685, "$\\left.$": 30686, "$psi$": 30687, "$gamma$": 30688, "$cot$": 30689, "$\\left[$": 30690, "$\\right]$": 30691, "$zeta$": 30692, "$xrightarrow$": 30693, "$ln$": 30694, "$setminus$": 30695, "$unlhd$": 30696, "$xi$": 30697, "$ll$": 30698, "$implies$": 30699, "$uparrow$": 30700, "$\\underbrace$": 30701, "$cr$": 30702, "$longleftrightarrow$": 30703, "$tau$": 30704, "$mid$": 30705, "$varphi$": 30706, "$geqslant$": 30707, "$angle$": 30708, "$longrightarrow$": 30709, "$exists$": 30710, "$inf$": 30711, "$sup$": 30712, "$\\begin{matrix}$": 30713, "$\\end{matrix}$": 30714, "$top$": 30715, "$bot$": 30716, "$simeq$": 30717, "$det$": 30718, "$\\right/$": 30719, "$\\overset$": 30720, "$emptyset$": 30721, "$\\stackrel$": 30722, "$\\left\\{$": 30723, "$\\lfloor$": 30724, "$\\rfloor$": 30725, "$\\right.$": 30726, "$binom$": 30727, "$ell$": 30728, "$sec$": 30729, "$arccos$": 30730, "$vdots$": 30731, "$ddots$": 30732, "$\\right\\}$": 30733, "$leqslant$": 30734, "$eta$": 30735, "$\\begin{smallmatrix}$": 30736, "$\\end{smallmatrix}$": 30737, "$percent$": 30738, "$oint$": 30739, "$min$": 30740, "$hbar$": 30741, "$ddot$": 30742, "$varnothing$": 30743, "$cosh$": 30744, "$downarrow$": 30745, "$gcd$": 30746, "$rightarrowtail$": 30747, "$supset$": 30748, "$\\underset$": 30749, "$bigcup$": 30750, "$preceq$": 30751, "$\\array{$": 30752, "$triangleq$": 30753, "$iota$": 30754, "$leftrightarrow$": 30755, "$arctan$": 30756, "$arcsin$": 30757, "$sinh$": 30758, "$triangle$": 30759, "$coprod$": 30760, "$neg$": 30761, "$land$": 30762, "$lor$": 30763, "$measuredangle$": 30764, "$wp$": 30765, "$backslash$": 30766, "$vartheta$": 30767, "$odot$": 30768, "$perp$": 30769, "$tanh$": 30770, "$trianglelefteq$": 30771, "$\\left\\lfloor$": 30772, "$\\right\\rfloor$": 30773, "$supseteq$": 30774, "$sign$": 30775, "$dotsc$": 30776, "$nmid$": 30777, "$smallsetminus$": 30778, "$and$": 30779, "$\\left\\lceil$": 30780, "$\\right\\rceil$": 30781, "$deg$": 30782, "$impliedby$": 30783, "$\\left<$": 30784, "$\\right>$": 30785, "$searrow$": 30786, "$limsup$": 30787, "$succeq$": 30788, "$rightharpoonup$": 30789, "$bigtriangleup$": 30790, "$sqcup$": 30791, "$subsetneq$": 30792, "$\\left \\{$": 30793, "$\\right \\}$": 30794, "$\\left |$": 30795, "$\\right |$": 30796, "$nle$": 30797, "$lnot$": 30798, "$iint$": 30799, "$hom$": 30800, "$leadsto$": 30801, "$nexists$": 30802, "$re$": 30803, "$\\begin{vmatrix}$": 30804, "$\\end{vmatrix}$": 30805, "$parallel$": 30806, "$dotsb$": 30807, "$bigwedge$": 30808, "$succ$": 30809, "$\\buildrel$": 30810, "$liminf$": 30811, "$csc$": 30812, "$wr$": 30813, "$hookrightarrow$": 30814, "$\\lbrace$": 30815, "$\\rbrace$": 30816, "$dotsm$": 30817, "$rtimes$": 30818, "$ltimes$": 30819, "$\\of$": 30820, "$updownarrow$": 30821, "$nearrow$": 30822, "$\\left \\lfloor$": 30823, "$\\right \\rfloor$": 30824, "$approxeq$": 30825, "$dashv$": 30826, "$bigcirc$": 30827, "$triangledown$": 30828, "$lcm$": 30829, "$prec$": 30830, "$propto$": 30831, "$triangleleft$": 30832, "$ncong$": 30833, "$coth$": 30834, "$longleftarrow$": 30835, "$upsilon$": 30836, "$thicksim$": 30837, "$\\left$": 30838, "$\\right$": 30839, "$bigtriangledown$": 30840, "$varliminf$": 30841, "$varlimsup$": 30842, "$atop$": 30843, "$\\overbrace$": 30844, "$\\left \\langle$": 30845, "$\\right \\rangle$": 30846, "$gets$": 30847, "$vartriangleleft$": 30848, "$iiint$": 30849, "$varinjlim$": 30850, "$varprojlim$": 30851, "$bigotimes$": 30852, "$varrho$": 30853, "$lesssim$": 30854, "$\\left \\$": 30855, "$\\right \\$": 30856, "$bigsqcup$": 30857, "$supsetneq$": 30858, "$curvearrowright$": 30859, "$dotso$": 30860, "$preccurlyeq$": 30861, "$imath$": 30862, "$omicron$": 30863, "$nrightarrow$": 30864, "$lneq$": 30865, "$upharpoonright$": 30866, "$nsubseteq$": 30867, "$enspace$": 30868, "$geqq$": 30869, "$rightrightarrows$": 30870, "$uplus$": 30871, "$owns$": 30872, "$rightsquigarrow$": 30873, "$vartriangle$": 30874, "$trianglerighteq$": 30875, "$amalg$": 30876, "$frown$": 30877, "$jmath$": 30878, "$\\left <$": 30879, "$\\right >$": 30880, "$beth$": 30881, "$sphericalangle$": 30882, "$gtrapprox$": 30883, "$lessapprox$": 30884, "$bigvee$": 30885, "$nsim$": 30886, "$swarrow$": 30887, "$asymp$": 30888, "$bigodot$": 30889, "$dotsi$": 30890, "$\\left |$": 30891, "$intop$": 30892, "$sqsupset$": 30893, "$\\left \\lceil$": 30894, "$\\right \\rceil$": 30895, "$empty$": 30896, "$rightleftharpoons$": 30897, "$\\left .$": 30898, "$smallint$": 30899, "$gtrless$": 30900, "$ngtr$": 30901, "$\\lbrack$": 30902, "$arrowvert$": 30903, "$xleftarrow$": 30904, "$\\left \\{$": 30905, "$\\right \\}$": 30906, "$vardelta$": 30907, "$looparrowright$": 30908, "$nge$": 30909, "$gneq$": 30910, "$\\left\\downarrow$": 30911, "$subsetneqq$": 30912, "$idotsint$": 30913, "$nless$": 30914, "$nprec$": 30915, "$nwarrow$": 30916, "$circlearrowright$": 30917, "$ominus$": 30918, "$\\left/$": 30919, "$gtrsim$": 30920, "$succcurlyeq$": 30921, "$sqsubseteq$": 30922, "$backsim$": 30923, "$dddot$": 30924, "$precsim$": 30925, "$divideontimes$": 30926, "$triangleright$": 30927, "$sqcap$": 30928, "$\\right )$": 30929, "$lessdot$": 30930, "$nsupseteq$": 30931, "$nleqslant$": 30932, "$\\left /$": 30933, "$varsubsetneq$": 30934, "$\\rbrack$": 30935, "$rightharpoondown$": 30936, "$\\right .$": 30937, "$succsim$": 30938, "$leftleftarrows$": 30939, "$varsupsetneq$": 30940, "$iddots$": 30941, "$vargamma$": 30942, "$sqsubset$": 30943, "$downharpoonright$": 30944, "$sqsupseteq$": 30945, "$varsigma$": 30946, "$\\right .$": 30947, "$veebar$": 30948, "$ddddot$": 30949, "$circlearrowleft$": 30950, "$leftrightarrows$": 30951, "$\\right )$": 30952, "$thickapprox$": 30953, "$npreceq$": 30954, "$biguplus$": 30955, "$supsetneqq$": 30956, "$rrightarrow$": 30957, "$smallfrown$": 30958, "$multimap$": 30959, "$subseteqq$": 30960, "$hookleftarrow$": 30961, "$rightleftarrows$": 30962, "$nparallel$": 30963, "$nsucc$": 30964, "$ggg$": 30965, "$vartriangleright$": 30966, "$varpropto$": 30967, "$\\left ($": 30968, "$\\right ]$": 30969, "$\\left [$": 30970, "$\\left \\lgroup$": 30971, "$\\right \\rgroup$": 30972, "$\\left \\$": 30973, "$leftrightsquigarrow$": 30974, "$\\right )$": 30975, "$lsh$": 30976, "$\\lgroup$": 30977, "$\\rgroup$": 30978, "$mho$": 30979, "$upharpoonleft$": 30980, "$eth$": 30981, "$nleftarrow$": 30982, "$smallsmile$": 30983, "$nleftrightarrow$": 30984, "$iiiint$": 30985, "$gneqq$": 30986, "$lneqq$": 30987, "$downdownarrows$": 30988, "$varkappa$": 30989, "$backsimeq$": 30990, "$barwedge$": 30991, "$unrhd$": 30992, "$bowtie$": 30993, "$backepsilon$": 30994, "$bracevert$": 30995, "$\\right \\rfloor$": 30996, "$\\left \\lfloor$": 30997, "$precneqq$": 30998, "$\\left\\lgroup$": 30999, "$\\right\\rgroup$": 31000, "$\\left \\langle$": 31001, "$\\lmoustache$": 31002, "$\\right \\}$": 31003, "$\\right |$": 31004, "$curlywedge$": 31005, "$\\right ]$": 31006, "$gtreqqless$": 31007, "$gtreqless$": 31008, "$ngeqslant$": 31009, "$between$": 31010, "$dotplus$": 31011, "$leftrightharpoons$": 31012, "$\\right \\rceil$": 31013, "$leftharpoondown$": 31014, "$projlim$": 31015, "$\\right )$": 31016, "$varsubsetneqq$": 31017, "$lvect$": 31018, "$\\left\\backslash$": 31019, "$supseteqq$": 31020, "$join$": 31021, "$injlim$": 31022, "$eqsim$": 31023, "$curvearrowleft$": 31024, "$succneqq$": 31025, "$\\left\\uparrow$": 31026, "$leftarrowtail$": 31027, "$upuparrows$": 31028, "$gvertneqq$": 31029, "$lvertneqq$": 31030, "$nsucceq$": 31031, "$gtrdot$": 31032, "$eqslantless$": 31033, "$\\right \\$": 31034, "$circeq$": 31035, "$\\left ($": 31036, "$curlyvee$": 31037, "$\\right \\}$": 31038, "$\\right \\rangle$": 31039, "$downharpoonleft$": 31040, "$\\left ($": 31041, "$\\right \\rangle$": 31042, "$\\left \\{$": 31043, "$\\left\\updownarrow$": 31044, "$\\right )$": 31045, "$\\right >$": 31046, "$leftharpoonup$": 31047, "$curlyeqprec$": 31048, "$\\left [$": 31049, "$eqslantgtr$": 31050, "$varxi$": 31051, "$\\right \\rfloor$": 31052, "$\\right \\}$": 31053, "$varpsi$": 31054, "$\\left .$": 31055, "$\\left |$": 31056, "$\\right /$": 31057, "$\\left ($": 31058, "$\\left \\$": 31059, "$rsh$": 31060}
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
3
+ "architectures": [
4
+ "BertForPreTraining"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.9.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 31061
25
+ }
events.out.tfevents.1631471189.blg4302.int.ets1.calculquebec.ca.240020.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2caf095b1b4f07fc75087fdecc23f9c0e9edf3201ed001624f4fc06c9a8a83e7
3
+ size 53784629
job-25031358-head.out ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ + TRAINER=pretrain
2
+ + SETUP=for-newvocab
3
+ ++ cd pya0
4
+ ++ pwd
5
+ ++ git rev-parse HEAD
6
+ + CODE_VER='/home/w32zhong/projects/rrg-jimmylin/w32zhong/pya0
7
+ 8f207c0036a9f81f91e26f7ecedcfa84025ae680'
8
+ + COMMAND='/var/spool/slurmd/job25031358/slurm_script pretrain for-newvocab'
9
+ + EPOCHS=40
10
+ + TEST_CYCLE=100
11
+ + case $TRAINER-${SETUP} in
12
+ + DEV_BSIZE=8
13
+ + SAVE_FOLD=10
14
+ + DATA_VER=arjmPWtGwzKrkmR
15
+ + START_POINT=bert-base-uncased
16
+ + TOK_CKPOINT=bert-tokenizer
17
+ + SHARDS_LIST=shards-for-newvocab.txt
18
+ + TEST_FILE=test.txt
19
+ + EXTRA_DAT=mse-aops-2021-vocab.pkl
20
+ + EXTRA_ARG=
21
+ + DATA_DIR=data.arjmPWtGwzKrkmR
22
+ + set -e
23
+ + '[' '!' -e data.arjmPWtGwzKrkmR ']'
24
+ + set +e
25
+ ++ cat /var/spool/slurmd/job25031358/slurm_script
26
+ ++ grep -Po '(?<=SBATCH --nodes=)[0-9]+'
27
+ + N_NODE=4
28
+ ++ cat /var/spool/slurmd/job25031358/slurm_script
29
+ ++ grep -Po '(?<=SBATCH --gres=gpu:)[0-9]+'
30
+ + N_GPUS=2
31
+ + export NCCL_BLOCKING_WAIT=1
32
+ + NCCL_BLOCKING_WAIT=1
33
+ + export SLURM_ACCOUNT=def-jimmylin
34
+ + SLURM_ACCOUNT=def-jimmylin
35
+ + export SBATCH_ACCOUNT=def-jimmylin
36
+ + SBATCH_ACCOUNT=def-jimmylin
37
+ + export SALLOC_ACCOUNT=def-jimmylin
38
+ + SALLOC_ACCOUNT=def-jimmylin
39
+ + which srun
40
+ /opt/software/slurm/bin/srun
41
+ ++ hostname
42
+ + srun --unbuffered python ./pya0/utils/transformer.py pretrain data.arjmPWtGwzKrkmR/bert-base-uncased data.arjmPWtGwzKrkmR/bert-tokenizer data.arjmPWtGwzKrkmR/mse-aops-2021-vocab.pkl --test_file data.arjmPWtGwzKrkmR/test.txt --test_cycle 100 --shards_list data.arjmPWtGwzKrkmR/shards-for-newvocab.txt --cluster tcp://blg4302.int.ets1.calculquebec.ca:8912 --batch_size 64 --save_fold 10 --epochs 40
43
+ Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
44
+ {
45
+ "_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
46
+ "add_cross_attention": false,
47
+ "architectures": [
48
+ "BertForPreTraining"
49
+ ],
50
+ "attention_probs_dropout_prob": 0.1,
51
+ "bad_words_ids": null,
52
+ "bos_token_id": null,
53
+ "chunk_size_feed_forward": 0,
54
+ "decoder_start_token_id": null,
55
+ "diversity_penalty": 0.0,
56
+ "do_sample": false,
57
+ "early_stopping": false,
58
+ "encoder_no_repeat_ngram_size": 0,
59
+ "eos_token_id": null,
60
+ "finetuning_task": null,
61
+ "forced_bos_token_id": null,
62
+ "forced_eos_token_id": null,
63
+ "gradient_checkpointing": false,
64
+ "hidden_act": "gelu",
65
+ "hidden_dropout_prob": 0.1,
66
+ "hidden_size": 768,
67
+ "id2label": {
68
+ "0": "LABEL_0",
69
+ "1": "LABEL_1"
70
+ },
71
+ "initializer_range": 0.02,
72
+ "intermediate_size": 3072,
73
+ "is_decoder": false,
74
+ "is_encoder_decoder": false,
75
+ "label2id": {
76
+ "LABEL_0": 0,
77
+ "LABEL_1": 1
78
+ },
79
+ "layer_norm_eps": 1e-12,
80
+ "length_penalty": 1.0,
81
+ "max_length": 20,
82
+ "max_position_embeddings": 512,
83
+ "min_length": 0,
84
+ "model_type": "bert",
85
+ "no_repeat_ngram_size": 0,
86
+ "num_attention_heads": 12,
87
+ "num_beam_groups": 1,
88
+ "num_beams": 1,
89
+ "num_hidden_layers": 12,
90
+ "num_return_sequences": 1,
91
+ "output_attentions": false,
92
+ "output_hidden_states": false,
93
+ "output_scores": false,
94
+ "pad_token_id": 0,
95
+ "position_embedding_type": "absolute",
96
+ "prefix": null,
97
+ "problem_type": null,
98
+ "pruned_heads": {},
99
+ "remove_invalid_values": false,
100
+ "repetition_penalty": 1.0,
101
+ "return_dict": true,
102
+ "return_dict_in_generate": false,
103
+ "sep_token_id": null,
104
+ "task_specific_params": null,
105
+ "temperature": 1.0,
106
+ "tie_encoder_decoder": false,
107
+ "tie_word_embeddings": true,
108
+ "tokenizer_class": null,
109
+ "top_k": 50,
110
+ "top_p": 1.0,
111
+ "torch_dtype": null,
112
+ "torchscript": false,
113
+ "transformers_version": "4.9.2",
114
+ "type_vocab_size": 2,
115
+ "use_bfloat16": false,
116
+ "use_cache": true,
117
+ "vocab_size": 30522
118
+ }
119
+
120
+ Before loading new vocabulary: 30522
121
+ After loading new vocabulary: 31061
122
+ Resize model embedding and save new tokenizer ...
123
+ Invoke training ...
124
+ [caller] pretrain
125
+ [node#3 rank#6] Training on device cuda:0
126
+ [node#3 rank#6] 2 x Tesla V100-SXM2-16GB: 0%
127
+ [node#3 rank#7] Training on device cuda:1
128
+ [node#3 rank#7] 2 x Tesla V100-SXM2-16GB: 5%
129
+ [node#3 rank#6] Initialized process group ...
130
+ [node#3 rank#7] Initialized process group ...
131
+ Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
132
+ Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
133
+ Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
134
+ {
135
+ "_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
136
+ "add_cross_attention": false,
137
+ "architectures": [
138
+ "BertForPreTraining"
139
+ ],
140
+ "attention_probs_dropout_prob": 0.1,
141
+ "bad_words_ids": null,
142
+ "bos_token_id": null,
143
+ "chunk_size_feed_forward": 0,
144
+ "decoder_start_token_id": null,
145
+ "diversity_penalty": 0.0,
146
+ "do_sample": false,
147
+ "early_stopping": false,
148
+ "encoder_no_repeat_ngram_size": 0,
149
+ "eos_token_id": null,
150
+ "finetuning_task": null,
151
+ "forced_bos_token_id": null,
152
+ "forced_eos_token_id": null,
153
+ "gradient_checkpointing": false,
154
+ "hidden_act": "gelu",
155
+ "hidden_dropout_prob": 0.1,
156
+ "hidden_size": 768,
157
+ "id2label": {
158
+ "0": "LABEL_0",
159
+ "1": "LABEL_1"
160
+ },
161
+ "initializer_range": 0.02,
162
+ "intermediate_size": 3072,
163
+ "is_decoder": false,
164
+ "is_encoder_decoder": false,
165
+ "label2id": {
166
+ "LABEL_0": 0,
167
+ "LABEL_1": 1
168
+ },
169
+ "layer_norm_eps": 1e-12,
170
+ "length_penalty": 1.0,
171
+ "max_length": 20,
172
+ "max_position_embeddings": 512,
173
+ "min_length": 0,
174
+ "model_type": "bert",
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 12,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_hidden_layers": 12,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad_token_id": 0,
185
+ "position_embedding_type": "absolute",
186
+ "prefix": null,
187
+ "problem_type": null,
188
+ "pruned_heads": {},
189
+ "remove_invalid_values": false,
190
+ "repetition_penalty": 1.0,
191
+ "return_dict": true,
192
+ "return_dict_in_generate": false,
193
+ "sep_token_id": null,
194
+ "task_specific_params": null,
195
+ "temperature": 1.0,
196
+ "tie_encoder_decoder": false,
197
+ "tie_word_embeddings": true,
198
+ "tokenizer_class": null,
199
+ "top_k": 50,
200
+ "top_p": 1.0,
201
+ "torch_dtype": null,
202
+ "torchscript": false,
203
+ "transformers_version": "4.9.2",
204
+ "type_vocab_size": 2,
205
+ "use_bfloat16": false,
206
+ "use_cache": true,
207
+ "vocab_size": 30522
208
+ }
209
+
210
+ {
211
+ "_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
212
+ "add_cross_attention": false,
213
+ "architectures": [
214
+ "BertForPreTraining"
215
+ ],
216
+ "attention_probs_dropout_prob": 0.1,
217
+ "bad_words_ids": null,
218
+ "bos_token_id": null,
219
+ "chunk_size_feed_forward": 0,
220
+ "decoder_start_token_id": null,
221
+ "diversity_penalty": 0.0,
222
+ "do_sample": false,
223
+ "early_stopping": false,
224
+ "encoder_no_repeat_ngram_size": 0,
225
+ "eos_token_id": null,
226
+ "finetuning_task": null,
227
+ "forced_bos_token_id": null,
228
+ "forced_eos_token_id": null,
229
+ "gradient_checkpointing": false,
230
+ "hidden_act": "gelu",
231
+ "hidden_dropout_prob": 0.1,
232
+ "hidden_size": 768,
233
+ "id2label": {
234
+ "0": "LABEL_0",
235
+ "1": "LABEL_1"
236
+ },
237
+ "initializer_range": 0.02,
238
+ "intermediate_size": 3072,
239
+ "is_decoder": false,
240
+ "is_encoder_decoder": false,
241
+ "label2id": {
242
+ "LABEL_0": 0,
243
+ "LABEL_1": 1
244
+ },
245
+ "layer_norm_eps": 1e-12,
246
+ "length_penalty": 1.0,
247
+ "max_length": 20,
248
+ "max_position_embeddings": 512,
249
+ "min_length": 0,
250
+ "model_type": "bert",
251
+ "no_repeat_ngram_size": 0,
252
+ "num_attention_heads": 12,
253
+ "num_beam_groups": 1,
254
+ "num_beams": 1,
255
+ "num_hidden_layers": 12,
256
+ "num_return_sequences": 1,
257
+ "output_attentions": false,
258
+ "output_hidden_states": false,
259
+ "output_scores": false,
260
+ "pad_token_id": 0,
261
+ "position_embedding_type": "absolute",
262
+ "prefix": null,
263
+ "problem_type": null,
264
+ "pruned_heads": {},
265
+ "remove_invalid_values": false,
266
+ "repetition_penalty": 1.0,
267
+ "return_dict": true,
268
+ "return_dict_in_generate": false,
269
+ "sep_token_id": null,
270
+ "task_specific_params": null,
271
+ "temperature": 1.0,
272
+ "tie_encoder_decoder": false,
273
+ "tie_word_embeddings": true,
274
+ "tokenizer_class": null,
275
+ "top_k": 50,
276
+ "top_p": 1.0,
277
+ "torch_dtype": null,
278
+ "torchscript": false,
279
+ "transformers_version": "4.9.2",
280
+ "type_vocab_size": 2,
281
+ "use_bfloat16": false,
282
+ "use_cache": true,
283
+ "vocab_size": 30522
284
+ }
285
+
286
+ {
287
+ "_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
288
+ "add_cross_attention": false,
289
+ "architectures": [
290
+ "BertForPreTraining"
291
+ ],
292
+ "attention_probs_dropout_prob": 0.1,
293
+ "bad_words_ids": null,
294
+ "bos_token_id": null,
295
+ "chunk_size_feed_forward": 0,
296
+ "decoder_start_token_id": null,
297
+ "diversity_penalty": 0.0,
298
+ "do_sample": false,
299
+ "early_stopping": false,
300
+ "encoder_no_repeat_ngram_size": 0,
301
+ "eos_token_id": null,
302
+ "finetuning_task": null,
303
+ "forced_bos_token_id": null,
304
+ "forced_eos_token_id": null,
305
+ "gradient_checkpointing": false,
306
+ "hidden_act": "gelu",
307
+ "hidden_dropout_prob": 0.1,
308
+ "hidden_size": 768,
309
+ "id2label": {
310
+ "0": "LABEL_0",
311
+ "1": "LABEL_1"
312
+ },
313
+ "initializer_range": 0.02,
314
+ "intermediate_size": 3072,
315
+ "is_decoder": false,
316
+ "is_encoder_decoder": false,
317
+ "label2id": {
318
+ "LABEL_0": 0,
319
+ "LABEL_1": 1
320
+ },
321
+ "layer_norm_eps": 1e-12,
322
+ "length_penalty": 1.0,
323
+ "max_length": 20,
324
+ "max_position_embeddings": 512,
325
+ "min_length": 0,
326
+ "model_type": "bert",
327
+ "no_repeat_ngram_size": 0,
328
+ "num_attention_heads": 12,
329
+ "num_beam_groups": 1,
330
+ "num_beams": 1,
331
+ "num_hidden_layers": 12,
332
+ "num_return_sequences": 1,
333
+ "output_attentions": false,
334
+ "output_hidden_states": false,
335
+ "output_scores": false,
336
+ "pad_token_id": 0,
337
+ "position_embedding_type": "absolute",
338
+ "prefix": null,
339
+ "problem_type": null,
340
+ "pruned_heads": {},
341
+ "remove_invalid_values": false,
342
+ "repetition_penalty": 1.0,
343
+ "return_dict": true,
344
+ "return_dict_in_generate": false,
345
+ "sep_token_id": null,
346
+ "task_specific_params": null,
347
+ "temperature": 1.0,
348
+ "tie_encoder_decoder": false,
349
+ "tie_word_embeddings": true,
350
+ "tokenizer_class": null,
351
+ "top_k": 50,
352
+ "top_p": 1.0,
353
+ "torch_dtype": null,
354
+ "torchscript": false,
355
+ "transformers_version": "4.9.2",
356
+ "type_vocab_size": 2,
357
+ "use_bfloat16": false,
358
+ "use_cache": true,
359
+ "vocab_size": 30522
360
+ }
361
+
362
+ Before loading new vocabulary: 30522
363
+ Before loading new vocabulary: 30522
364
+ After loading new vocabulary: 31061
365
+ Resize model embedding and save new tokenizer ...
366
+ Before loading new vocabulary: 30522
367
+ After loading new vocabulary: 31061
368
+ Resize model embedding and save new tokenizer ...
369
+ After loading new vocabulary: 31061
370
+ Resize model embedding and save new tokenizer ...
371
+ Invoke training ...
372
+ Invoke training ...
373
+ Invoke training ...
374
+ [caller] pretrain
375
+ [caller] pretrain
376
+ [caller] pretrain
377
+ [node#0 rank#0] Training on device cuda:0
378
+ [node#2 rank#4] Training on device cuda:0
379
+ [node#0 rank#0] 2 x Tesla V100-SXM2-16GB: 0%
380
+ [node#2 rank#4] 2 x Tesla V100-SXM2-16GB: 0%
381
+ [node#1 rank#2] Training on device cuda:0
382
+ [node#1 rank#2] 2 x Tesla V100-SXM2-16GB: 0%
383
+ [node#0 rank#1] Training on device cuda:1
384
+ [node#2 rank#5] Training on device cuda:1
385
+ [node#0 rank#1] 2 x Tesla V100-SXM2-16GB: 2%
386
+ [node#2 rank#5] 2 x Tesla V100-SXM2-16GB: 2%
387
+ [node#1 rank#3] Training on device cuda:1
388
+ [node#1 rank#3] 2 x Tesla V100-SXM2-16GB: 2%
389
+ [node#2 rank#4] Initialized process group ...
390
+ [node#0 rank#0] Initialized process group ...
391
+ [node#1 rank#2] Initialized process group ...
392
+ [node#0 rank#1] Initialized process group ...
393
+ [node#2 rank#5] Initialized process group ...
394
+ [node#1 rank#3] Initialized process group ...
395
+ [node#0 rank#0] Enter Torch DDP.
396
+ [W ProcessGroupNCCL.cpp:1569] Rank 0 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
397
+ [node#2 rank#4] Enter Torch DDP.
398
+ [node#0 rank#1] Enter Torch DDP.
399
+ [W ProcessGroupNCCL.cpp:1569] Rank 1 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
400
+ [W ProcessGroupNCCL.cpp:1569] Rank 4 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
401
+ [node#1 rank#3] Enter Torch DDP.
402
+ [W ProcessGroupNCCL.cpp:1569] Rank 3 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
403
+ [node#3 rank#7] Enter Torch DDP.
404
+ [W ProcessGroupNCCL.cpp:1569] Rank 7 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
405
+ [node#2 rank#5] Enter Torch DDP.
406
+ [W ProcessGroupNCCL.cpp:1569] Rank 5 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
407
+ [node#3 rank#6] Enter Torch DDP.
408
+ [W ProcessGroupNCCL.cpp:1569] Rank 6 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
409
+ [node#1 rank#2] Enter Torch DDP.
410
+ [W ProcessGroupNCCL.cpp:1569] Rank 2 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
411
+ AdamW (
412
+ Parameter Group 0
413
+ betas: (0.9, 0.999)
414
+ correct_bias: True
415
+ eps: 1e-06
416
+ lr: 1e-06
417
+ weight_decay: 0.01
418
+ )
419
+ AdamW (
420
+ Parameter Group 0
421
+ betas: (0.9, 0.999)
422
+ correct_bias: True
423
+ eps: 1e-06
424
+ lr: 1e-06
425
+ weight_decay: 0.01
426
+ )
427
+ AdamW (
428
+ Parameter Group 0
429
+ betas: (0.9, 0.999)
430
+ correct_bias: True
431
+ eps: 1e-06
432
+ lr: 1e-06
433
+ weight_decay: 0.01
434
+ )
435
+ AdamW (
436
+ Parameter Group 0
437
+ betas: (0.9, 0.999)
438
+ correct_bias: True
439
+ eps: 1e-06
440
+ lr: 1e-06
441
+ weight_decay: 0.01
442
+ )
443
+ AdamW (
444
+ Parameter Group 0
445
+ betas: (0.9, 0.999)
446
+ correct_bias: True
447
+ eps: 1e-06
448
+ lr: 1e-06
449
+ weight_decay: 0.01
450
+ )
451
+ [node#3 rank#7] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
452
+ [node#3 rank#7] Start training at: (0, 0, -1)
453
+ AdamW (
454
+ Parameter Group 0
455
+ betas: (0.9, 0.999)
456
+ correct_bias: True
457
+ eps: 1e-06
458
+ lr: 1e-06
459
+ weight_decay: 0.01
460
+ )
461
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
462
+ [node#3 rank#6] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
463
+ [node#3 rank#6] Start training at: (0, 0, -1)
464
+ [node#3 rank#7] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
465
+ AdamW (
466
+ Parameter Group 0
467
+ betas: (0.9, 0.999)
468
+ correct_bias: True
469
+ eps: 1e-06
470
+ lr: 1e-06
471
+ weight_decay: 0.01
472
+ )
473
+ AdamW (
474
+ Parameter Group 0
475
+ betas: (0.9, 0.999)
476
+ correct_bias: True
477
+ eps: 1e-06
478
+ lr: 1e-06
479
+ weight_decay: 0.01
480
+ )
481
+ [node#1 rank#2] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
482
+ [node#1 rank#2] Start training at: (0, 0, -1)
483
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
484
+ [node#3 rank#6] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
485
+ [node#1 rank#3] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
486
+ [node#1 rank#3] Start training at: (0, 0, -1)
487
+ [node#2 rank#4] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
488
+ [node#2 rank#4] Start training at: (0, 0, -1)
489
+ [node#2 rank#5] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
490
+ [node#2 rank#5] Start training at: (0, 0, -1)
491
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
492
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
493
+ [node#2 rank#4] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
494
+ [node#2 rank#5] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
495
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
496
+ Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
497
+ [node#1 rank#3] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
498
+ [node#1 rank#2] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
499
+ [node#0 rank#1] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
500
+ [node#0 rank#1] Start training at: (0, 0, -1)
job-25031358-tail.out ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e609127390ca34a2bdc307d1f66fc68b056d1550973adbee02979df8e091162e
3
+ size 442169891
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff