update model
Browse files- .gitattributes +2 -0
- added_tokens.json +1 -0
- config.json +25 -0
- events.out.tfevents.1631471189.blg4302.int.ets1.calculquebec.ca.240020.0 +3 -0
- job-25031358-head.out +500 -0
- job-25031358-tail.out +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- vocab.txt +0 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
events.out.tfevents.1631471189.blg4302.int.ets1.calculquebec.ca.240020.0 filter=lfs diff=lfs merge=lfs -text
|
2 |
+
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
added_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"$pi$": 30522, "$vert$": 30523, "$alpha$": 30524, "$minus$": 30525, "$p$": 30526, "$frac$": 30527, "$q$": 30528, "$lt$": 30529, "$1$": 30530, "$($": 30531, "$2$": 30532, "$supscript$": 30533, "$)$": 30534, "$varepsilon$": 30535, "$x$": 30536, "$subscript$": 30537, "${$": 30538, "$n$": 30539, "$}$": 30540, "$v$": 30541, "$a$": 30542, "$r$": 30543, "$rightarrow$": 30544, "$infty$": 30545, "$omega$": 30546, "$equal$": 30547, "$[$": 30548, "$comma$": 30549, "$]$": 30550, "$\\begin{cases}$": 30551, "$column$": 30552, "$0$": 30553, "$le$": 30554, "$row$": 30555, "$ast$": 30556, "$\\end{cases}$": 30557, "$qquad$": 30558, "$in$": 30559, "$e$": 30560, "$to$": 30561, "$l$": 30562, "$phi$": 30563, "$colon$": 30564, "$z$": 30565, "$i$": 30566, "$5$": 30567, "$plus$": 30568, "$cong$": 30569, "$b$": 30570, "$mod$": 30571, "$ker$": 30572, "$mapsto$": 30573, "$equiv$": 30574, "$subset$": 30575, "$f$": 30576, "$y$": 30577, "$sin$": 30578, "$times$": 30579, "$cos$": 30580, "$t$": 30581, "$float$": 30582, "$fact$": 30583, "$3$": 30584, "$dots$": 30585, "$k$": 30586, "$sum$": 30587, "$gt$": 30588, "$\\left($": 30589, "$\\right)$": 30590, "$m$": 30591, "$ldots$": 30592, "$9$": 30593, "$leftarrow$": 30594, "$prime$": 30595, "$h$": 30596, "$approx$": 30597, "$root$": 30598, "$pm$": 30599, "$c$": 30600, "$d$": 30601, "$beta$": 30602, "$ge$": 30603, "$g$": 30604, "$epsilon$": 30605, "$j$": 30606, "$u$": 30607, "$aleph$": 30608, "$neq$": 30609, "$cdots$": 30610, "$lambda$": 30611, "$mu$": 30612, "$nu$": 30613, "$geq$": 30614, "$\\{$": 30615, "$\\}$": 30616, "$s$": 30617, "$subseteq$": 30618, "$max$": 30619, "$kappa$": 30620, "$oplus$": 30621, "$dim$": 30622, "$prod$": 30623, "$8$": 30624, "$somenum$": 30625, "$circ$": 30626, "$4$": 30627, "$7$": 30628, "$forall$": 30629, "$ne$": 30630, "$o$": 30631, "$w$": 30632, "$arg$": 30633, "$exp$": 30634, "$6$": 30635, "$\\begin{pmatrix}$": 30636, "$\\end{pmatrix}$": 30637, "$theta$": 30638, "$tan$": 30639, "$semicolon$": 30640, "$quad$": 30641, "$newline$": 30642, "$ni$": 30643, "$cup$": 30644, "$varpi$": 30645, "$vee$": 30646, "$langle$": 30647, "$rangle$": 30648, "$delta$": 30649, "$gg$": 30650, "$log$": 30651, "$\\lceil$": 30652, "$\\rceil$": 30653, "$\\begin{array}$": 30654, "$\\end{array}$": 30655, "$pr$": 30656, "$bigcap$": 30657, "$sim$": 30658, "$lim$": 30659, "$\\left [$": 30660, "$\\right ]$": 30661, "$\\left ($": 30662, "$\\right )$": 30663, "$longmapsto$": 30664, "$rvect$": 30665, "$rho$": 30666, "$int$": 30667, "$nabla$": 30668, "$sigma$": 30669, "$cap$": 30670, "$iff$": 30671, "$\\over$": 30672, "$wedge$": 30673, "$bigoplus$": 30674, "$otimes$": 30675, "$partial$": 30676, "$\\begin{bmatrix}$": 30677, "$\\end{bmatrix}$": 30678, "$\\left\\langle$": 30679, "$\\right\\rangle$": 30680, "$\\left\\$": 30681, "$\\right\\$": 30682, "$\\left|$": 30683, "$\\right|$": 30684, "$chi$": 30685, "$\\left.$": 30686, "$psi$": 30687, "$gamma$": 30688, "$cot$": 30689, "$\\left[$": 30690, "$\\right]$": 30691, "$zeta$": 30692, "$xrightarrow$": 30693, "$ln$": 30694, "$setminus$": 30695, "$unlhd$": 30696, "$xi$": 30697, "$ll$": 30698, "$implies$": 30699, "$uparrow$": 30700, "$\\underbrace$": 30701, "$cr$": 30702, "$longleftrightarrow$": 30703, "$tau$": 30704, "$mid$": 30705, "$varphi$": 30706, "$geqslant$": 30707, "$angle$": 30708, "$longrightarrow$": 30709, "$exists$": 30710, "$inf$": 30711, "$sup$": 30712, "$\\begin{matrix}$": 30713, "$\\end{matrix}$": 30714, "$top$": 30715, "$bot$": 30716, "$simeq$": 30717, "$det$": 30718, "$\\right/$": 30719, "$\\overset$": 30720, "$emptyset$": 30721, "$\\stackrel$": 30722, "$\\left\\{$": 30723, "$\\lfloor$": 30724, "$\\rfloor$": 30725, "$\\right.$": 30726, "$binom$": 30727, "$ell$": 30728, "$sec$": 30729, "$arccos$": 30730, "$vdots$": 30731, "$ddots$": 30732, "$\\right\\}$": 30733, "$leqslant$": 30734, "$eta$": 30735, "$\\begin{smallmatrix}$": 30736, "$\\end{smallmatrix}$": 30737, "$percent$": 30738, "$oint$": 30739, "$min$": 30740, "$hbar$": 30741, "$ddot$": 30742, "$varnothing$": 30743, "$cosh$": 30744, "$downarrow$": 30745, "$gcd$": 30746, "$rightarrowtail$": 30747, "$supset$": 30748, "$\\underset$": 30749, "$bigcup$": 30750, "$preceq$": 30751, "$\\array{$": 30752, "$triangleq$": 30753, "$iota$": 30754, "$leftrightarrow$": 30755, "$arctan$": 30756, "$arcsin$": 30757, "$sinh$": 30758, "$triangle$": 30759, "$coprod$": 30760, "$neg$": 30761, "$land$": 30762, "$lor$": 30763, "$measuredangle$": 30764, "$wp$": 30765, "$backslash$": 30766, "$vartheta$": 30767, "$odot$": 30768, "$perp$": 30769, "$tanh$": 30770, "$trianglelefteq$": 30771, "$\\left\\lfloor$": 30772, "$\\right\\rfloor$": 30773, "$supseteq$": 30774, "$sign$": 30775, "$dotsc$": 30776, "$nmid$": 30777, "$smallsetminus$": 30778, "$and$": 30779, "$\\left\\lceil$": 30780, "$\\right\\rceil$": 30781, "$deg$": 30782, "$impliedby$": 30783, "$\\left<$": 30784, "$\\right>$": 30785, "$searrow$": 30786, "$limsup$": 30787, "$succeq$": 30788, "$rightharpoonup$": 30789, "$bigtriangleup$": 30790, "$sqcup$": 30791, "$subsetneq$": 30792, "$\\left \\{$": 30793, "$\\right \\}$": 30794, "$\\left |$": 30795, "$\\right |$": 30796, "$nle$": 30797, "$lnot$": 30798, "$iint$": 30799, "$hom$": 30800, "$leadsto$": 30801, "$nexists$": 30802, "$re$": 30803, "$\\begin{vmatrix}$": 30804, "$\\end{vmatrix}$": 30805, "$parallel$": 30806, "$dotsb$": 30807, "$bigwedge$": 30808, "$succ$": 30809, "$\\buildrel$": 30810, "$liminf$": 30811, "$csc$": 30812, "$wr$": 30813, "$hookrightarrow$": 30814, "$\\lbrace$": 30815, "$\\rbrace$": 30816, "$dotsm$": 30817, "$rtimes$": 30818, "$ltimes$": 30819, "$\\of$": 30820, "$updownarrow$": 30821, "$nearrow$": 30822, "$\\left \\lfloor$": 30823, "$\\right \\rfloor$": 30824, "$approxeq$": 30825, "$dashv$": 30826, "$bigcirc$": 30827, "$triangledown$": 30828, "$lcm$": 30829, "$prec$": 30830, "$propto$": 30831, "$triangleleft$": 30832, "$ncong$": 30833, "$coth$": 30834, "$longleftarrow$": 30835, "$upsilon$": 30836, "$thicksim$": 30837, "$\\left$": 30838, "$\\right$": 30839, "$bigtriangledown$": 30840, "$varliminf$": 30841, "$varlimsup$": 30842, "$atop$": 30843, "$\\overbrace$": 30844, "$\\left \\langle$": 30845, "$\\right \\rangle$": 30846, "$gets$": 30847, "$vartriangleleft$": 30848, "$iiint$": 30849, "$varinjlim$": 30850, "$varprojlim$": 30851, "$bigotimes$": 30852, "$varrho$": 30853, "$lesssim$": 30854, "$\\left \\$": 30855, "$\\right \\$": 30856, "$bigsqcup$": 30857, "$supsetneq$": 30858, "$curvearrowright$": 30859, "$dotso$": 30860, "$preccurlyeq$": 30861, "$imath$": 30862, "$omicron$": 30863, "$nrightarrow$": 30864, "$lneq$": 30865, "$upharpoonright$": 30866, "$nsubseteq$": 30867, "$enspace$": 30868, "$geqq$": 30869, "$rightrightarrows$": 30870, "$uplus$": 30871, "$owns$": 30872, "$rightsquigarrow$": 30873, "$vartriangle$": 30874, "$trianglerighteq$": 30875, "$amalg$": 30876, "$frown$": 30877, "$jmath$": 30878, "$\\left <$": 30879, "$\\right >$": 30880, "$beth$": 30881, "$sphericalangle$": 30882, "$gtrapprox$": 30883, "$lessapprox$": 30884, "$bigvee$": 30885, "$nsim$": 30886, "$swarrow$": 30887, "$asymp$": 30888, "$bigodot$": 30889, "$dotsi$": 30890, "$\\left |$": 30891, "$intop$": 30892, "$sqsupset$": 30893, "$\\left \\lceil$": 30894, "$\\right \\rceil$": 30895, "$empty$": 30896, "$rightleftharpoons$": 30897, "$\\left .$": 30898, "$smallint$": 30899, "$gtrless$": 30900, "$ngtr$": 30901, "$\\lbrack$": 30902, "$arrowvert$": 30903, "$xleftarrow$": 30904, "$\\left \\{$": 30905, "$\\right \\}$": 30906, "$vardelta$": 30907, "$looparrowright$": 30908, "$nge$": 30909, "$gneq$": 30910, "$\\left\\downarrow$": 30911, "$subsetneqq$": 30912, "$idotsint$": 30913, "$nless$": 30914, "$nprec$": 30915, "$nwarrow$": 30916, "$circlearrowright$": 30917, "$ominus$": 30918, "$\\left/$": 30919, "$gtrsim$": 30920, "$succcurlyeq$": 30921, "$sqsubseteq$": 30922, "$backsim$": 30923, "$dddot$": 30924, "$precsim$": 30925, "$divideontimes$": 30926, "$triangleright$": 30927, "$sqcap$": 30928, "$\\right )$": 30929, "$lessdot$": 30930, "$nsupseteq$": 30931, "$nleqslant$": 30932, "$\\left /$": 30933, "$varsubsetneq$": 30934, "$\\rbrack$": 30935, "$rightharpoondown$": 30936, "$\\right .$": 30937, "$succsim$": 30938, "$leftleftarrows$": 30939, "$varsupsetneq$": 30940, "$iddots$": 30941, "$vargamma$": 30942, "$sqsubset$": 30943, "$downharpoonright$": 30944, "$sqsupseteq$": 30945, "$varsigma$": 30946, "$\\right .$": 30947, "$veebar$": 30948, "$ddddot$": 30949, "$circlearrowleft$": 30950, "$leftrightarrows$": 30951, "$\\right )$": 30952, "$thickapprox$": 30953, "$npreceq$": 30954, "$biguplus$": 30955, "$supsetneqq$": 30956, "$rrightarrow$": 30957, "$smallfrown$": 30958, "$multimap$": 30959, "$subseteqq$": 30960, "$hookleftarrow$": 30961, "$rightleftarrows$": 30962, "$nparallel$": 30963, "$nsucc$": 30964, "$ggg$": 30965, "$vartriangleright$": 30966, "$varpropto$": 30967, "$\\left ($": 30968, "$\\right ]$": 30969, "$\\left [$": 30970, "$\\left \\lgroup$": 30971, "$\\right \\rgroup$": 30972, "$\\left \\$": 30973, "$leftrightsquigarrow$": 30974, "$\\right )$": 30975, "$lsh$": 30976, "$\\lgroup$": 30977, "$\\rgroup$": 30978, "$mho$": 30979, "$upharpoonleft$": 30980, "$eth$": 30981, "$nleftarrow$": 30982, "$smallsmile$": 30983, "$nleftrightarrow$": 30984, "$iiiint$": 30985, "$gneqq$": 30986, "$lneqq$": 30987, "$downdownarrows$": 30988, "$varkappa$": 30989, "$backsimeq$": 30990, "$barwedge$": 30991, "$unrhd$": 30992, "$bowtie$": 30993, "$backepsilon$": 30994, "$bracevert$": 30995, "$\\right \\rfloor$": 30996, "$\\left \\lfloor$": 30997, "$precneqq$": 30998, "$\\left\\lgroup$": 30999, "$\\right\\rgroup$": 31000, "$\\left \\langle$": 31001, "$\\lmoustache$": 31002, "$\\right \\}$": 31003, "$\\right |$": 31004, "$curlywedge$": 31005, "$\\right ]$": 31006, "$gtreqqless$": 31007, "$gtreqless$": 31008, "$ngeqslant$": 31009, "$between$": 31010, "$dotplus$": 31011, "$leftrightharpoons$": 31012, "$\\right \\rceil$": 31013, "$leftharpoondown$": 31014, "$projlim$": 31015, "$\\right )$": 31016, "$varsubsetneqq$": 31017, "$lvect$": 31018, "$\\left\\backslash$": 31019, "$supseteqq$": 31020, "$join$": 31021, "$injlim$": 31022, "$eqsim$": 31023, "$curvearrowleft$": 31024, "$succneqq$": 31025, "$\\left\\uparrow$": 31026, "$leftarrowtail$": 31027, "$upuparrows$": 31028, "$gvertneqq$": 31029, "$lvertneqq$": 31030, "$nsucceq$": 31031, "$gtrdot$": 31032, "$eqslantless$": 31033, "$\\right \\$": 31034, "$circeq$": 31035, "$\\left ($": 31036, "$curlyvee$": 31037, "$\\right \\}$": 31038, "$\\right \\rangle$": 31039, "$downharpoonleft$": 31040, "$\\left ($": 31041, "$\\right \\rangle$": 31042, "$\\left \\{$": 31043, "$\\left\\updownarrow$": 31044, "$\\right )$": 31045, "$\\right >$": 31046, "$leftharpoonup$": 31047, "$curlyeqprec$": 31048, "$\\left [$": 31049, "$eqslantgtr$": 31050, "$varxi$": 31051, "$\\right \\rfloor$": 31052, "$\\right \\}$": 31053, "$varpsi$": 31054, "$\\left .$": 31055, "$\\left |$": 31056, "$\\right /$": 31057, "$\\left ($": 31058, "$\\left \\$": 31059, "$rsh$": 31060}
|
config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertForPreTraining"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 3072,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 12,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"torch_dtype": "float32",
|
21 |
+
"transformers_version": "4.9.2",
|
22 |
+
"type_vocab_size": 2,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 31061
|
25 |
+
}
|
events.out.tfevents.1631471189.blg4302.int.ets1.calculquebec.ca.240020.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2caf095b1b4f07fc75087fdecc23f9c0e9edf3201ed001624f4fc06c9a8a83e7
|
3 |
+
size 53784629
|
job-25031358-head.out
ADDED
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
+ TRAINER=pretrain
|
2 |
+
+ SETUP=for-newvocab
|
3 |
+
++ cd pya0
|
4 |
+
++ pwd
|
5 |
+
++ git rev-parse HEAD
|
6 |
+
+ CODE_VER='/home/w32zhong/projects/rrg-jimmylin/w32zhong/pya0
|
7 |
+
8f207c0036a9f81f91e26f7ecedcfa84025ae680'
|
8 |
+
+ COMMAND='/var/spool/slurmd/job25031358/slurm_script pretrain for-newvocab'
|
9 |
+
+ EPOCHS=40
|
10 |
+
+ TEST_CYCLE=100
|
11 |
+
+ case $TRAINER-${SETUP} in
|
12 |
+
+ DEV_BSIZE=8
|
13 |
+
+ SAVE_FOLD=10
|
14 |
+
+ DATA_VER=arjmPWtGwzKrkmR
|
15 |
+
+ START_POINT=bert-base-uncased
|
16 |
+
+ TOK_CKPOINT=bert-tokenizer
|
17 |
+
+ SHARDS_LIST=shards-for-newvocab.txt
|
18 |
+
+ TEST_FILE=test.txt
|
19 |
+
+ EXTRA_DAT=mse-aops-2021-vocab.pkl
|
20 |
+
+ EXTRA_ARG=
|
21 |
+
+ DATA_DIR=data.arjmPWtGwzKrkmR
|
22 |
+
+ set -e
|
23 |
+
+ '[' '!' -e data.arjmPWtGwzKrkmR ']'
|
24 |
+
+ set +e
|
25 |
+
++ cat /var/spool/slurmd/job25031358/slurm_script
|
26 |
+
++ grep -Po '(?<=SBATCH --nodes=)[0-9]+'
|
27 |
+
+ N_NODE=4
|
28 |
+
++ cat /var/spool/slurmd/job25031358/slurm_script
|
29 |
+
++ grep -Po '(?<=SBATCH --gres=gpu:)[0-9]+'
|
30 |
+
+ N_GPUS=2
|
31 |
+
+ export NCCL_BLOCKING_WAIT=1
|
32 |
+
+ NCCL_BLOCKING_WAIT=1
|
33 |
+
+ export SLURM_ACCOUNT=def-jimmylin
|
34 |
+
+ SLURM_ACCOUNT=def-jimmylin
|
35 |
+
+ export SBATCH_ACCOUNT=def-jimmylin
|
36 |
+
+ SBATCH_ACCOUNT=def-jimmylin
|
37 |
+
+ export SALLOC_ACCOUNT=def-jimmylin
|
38 |
+
+ SALLOC_ACCOUNT=def-jimmylin
|
39 |
+
+ which srun
|
40 |
+
/opt/software/slurm/bin/srun
|
41 |
+
++ hostname
|
42 |
+
+ srun --unbuffered python ./pya0/utils/transformer.py pretrain data.arjmPWtGwzKrkmR/bert-base-uncased data.arjmPWtGwzKrkmR/bert-tokenizer data.arjmPWtGwzKrkmR/mse-aops-2021-vocab.pkl --test_file data.arjmPWtGwzKrkmR/test.txt --test_cycle 100 --shards_list data.arjmPWtGwzKrkmR/shards-for-newvocab.txt --cluster tcp://blg4302.int.ets1.calculquebec.ca:8912 --batch_size 64 --save_fold 10 --epochs 40
|
43 |
+
Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
|
44 |
+
{
|
45 |
+
"_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
|
46 |
+
"add_cross_attention": false,
|
47 |
+
"architectures": [
|
48 |
+
"BertForPreTraining"
|
49 |
+
],
|
50 |
+
"attention_probs_dropout_prob": 0.1,
|
51 |
+
"bad_words_ids": null,
|
52 |
+
"bos_token_id": null,
|
53 |
+
"chunk_size_feed_forward": 0,
|
54 |
+
"decoder_start_token_id": null,
|
55 |
+
"diversity_penalty": 0.0,
|
56 |
+
"do_sample": false,
|
57 |
+
"early_stopping": false,
|
58 |
+
"encoder_no_repeat_ngram_size": 0,
|
59 |
+
"eos_token_id": null,
|
60 |
+
"finetuning_task": null,
|
61 |
+
"forced_bos_token_id": null,
|
62 |
+
"forced_eos_token_id": null,
|
63 |
+
"gradient_checkpointing": false,
|
64 |
+
"hidden_act": "gelu",
|
65 |
+
"hidden_dropout_prob": 0.1,
|
66 |
+
"hidden_size": 768,
|
67 |
+
"id2label": {
|
68 |
+
"0": "LABEL_0",
|
69 |
+
"1": "LABEL_1"
|
70 |
+
},
|
71 |
+
"initializer_range": 0.02,
|
72 |
+
"intermediate_size": 3072,
|
73 |
+
"is_decoder": false,
|
74 |
+
"is_encoder_decoder": false,
|
75 |
+
"label2id": {
|
76 |
+
"LABEL_0": 0,
|
77 |
+
"LABEL_1": 1
|
78 |
+
},
|
79 |
+
"layer_norm_eps": 1e-12,
|
80 |
+
"length_penalty": 1.0,
|
81 |
+
"max_length": 20,
|
82 |
+
"max_position_embeddings": 512,
|
83 |
+
"min_length": 0,
|
84 |
+
"model_type": "bert",
|
85 |
+
"no_repeat_ngram_size": 0,
|
86 |
+
"num_attention_heads": 12,
|
87 |
+
"num_beam_groups": 1,
|
88 |
+
"num_beams": 1,
|
89 |
+
"num_hidden_layers": 12,
|
90 |
+
"num_return_sequences": 1,
|
91 |
+
"output_attentions": false,
|
92 |
+
"output_hidden_states": false,
|
93 |
+
"output_scores": false,
|
94 |
+
"pad_token_id": 0,
|
95 |
+
"position_embedding_type": "absolute",
|
96 |
+
"prefix": null,
|
97 |
+
"problem_type": null,
|
98 |
+
"pruned_heads": {},
|
99 |
+
"remove_invalid_values": false,
|
100 |
+
"repetition_penalty": 1.0,
|
101 |
+
"return_dict": true,
|
102 |
+
"return_dict_in_generate": false,
|
103 |
+
"sep_token_id": null,
|
104 |
+
"task_specific_params": null,
|
105 |
+
"temperature": 1.0,
|
106 |
+
"tie_encoder_decoder": false,
|
107 |
+
"tie_word_embeddings": true,
|
108 |
+
"tokenizer_class": null,
|
109 |
+
"top_k": 50,
|
110 |
+
"top_p": 1.0,
|
111 |
+
"torch_dtype": null,
|
112 |
+
"torchscript": false,
|
113 |
+
"transformers_version": "4.9.2",
|
114 |
+
"type_vocab_size": 2,
|
115 |
+
"use_bfloat16": false,
|
116 |
+
"use_cache": true,
|
117 |
+
"vocab_size": 30522
|
118 |
+
}
|
119 |
+
|
120 |
+
Before loading new vocabulary: 30522
|
121 |
+
After loading new vocabulary: 31061
|
122 |
+
Resize model embedding and save new tokenizer ...
|
123 |
+
Invoke training ...
|
124 |
+
[caller] pretrain
|
125 |
+
[node#3 rank#6] Training on device cuda:0
|
126 |
+
[node#3 rank#6] 2 x Tesla V100-SXM2-16GB: 0%
|
127 |
+
[node#3 rank#7] Training on device cuda:1
|
128 |
+
[node#3 rank#7] 2 x Tesla V100-SXM2-16GB: 5%
|
129 |
+
[node#3 rank#6] Initialized process group ...
|
130 |
+
[node#3 rank#7] Initialized process group ...
|
131 |
+
Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
|
132 |
+
Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
|
133 |
+
Loading model data.arjmPWtGwzKrkmR/bert-base-uncased...
|
134 |
+
{
|
135 |
+
"_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
|
136 |
+
"add_cross_attention": false,
|
137 |
+
"architectures": [
|
138 |
+
"BertForPreTraining"
|
139 |
+
],
|
140 |
+
"attention_probs_dropout_prob": 0.1,
|
141 |
+
"bad_words_ids": null,
|
142 |
+
"bos_token_id": null,
|
143 |
+
"chunk_size_feed_forward": 0,
|
144 |
+
"decoder_start_token_id": null,
|
145 |
+
"diversity_penalty": 0.0,
|
146 |
+
"do_sample": false,
|
147 |
+
"early_stopping": false,
|
148 |
+
"encoder_no_repeat_ngram_size": 0,
|
149 |
+
"eos_token_id": null,
|
150 |
+
"finetuning_task": null,
|
151 |
+
"forced_bos_token_id": null,
|
152 |
+
"forced_eos_token_id": null,
|
153 |
+
"gradient_checkpointing": false,
|
154 |
+
"hidden_act": "gelu",
|
155 |
+
"hidden_dropout_prob": 0.1,
|
156 |
+
"hidden_size": 768,
|
157 |
+
"id2label": {
|
158 |
+
"0": "LABEL_0",
|
159 |
+
"1": "LABEL_1"
|
160 |
+
},
|
161 |
+
"initializer_range": 0.02,
|
162 |
+
"intermediate_size": 3072,
|
163 |
+
"is_decoder": false,
|
164 |
+
"is_encoder_decoder": false,
|
165 |
+
"label2id": {
|
166 |
+
"LABEL_0": 0,
|
167 |
+
"LABEL_1": 1
|
168 |
+
},
|
169 |
+
"layer_norm_eps": 1e-12,
|
170 |
+
"length_penalty": 1.0,
|
171 |
+
"max_length": 20,
|
172 |
+
"max_position_embeddings": 512,
|
173 |
+
"min_length": 0,
|
174 |
+
"model_type": "bert",
|
175 |
+
"no_repeat_ngram_size": 0,
|
176 |
+
"num_attention_heads": 12,
|
177 |
+
"num_beam_groups": 1,
|
178 |
+
"num_beams": 1,
|
179 |
+
"num_hidden_layers": 12,
|
180 |
+
"num_return_sequences": 1,
|
181 |
+
"output_attentions": false,
|
182 |
+
"output_hidden_states": false,
|
183 |
+
"output_scores": false,
|
184 |
+
"pad_token_id": 0,
|
185 |
+
"position_embedding_type": "absolute",
|
186 |
+
"prefix": null,
|
187 |
+
"problem_type": null,
|
188 |
+
"pruned_heads": {},
|
189 |
+
"remove_invalid_values": false,
|
190 |
+
"repetition_penalty": 1.0,
|
191 |
+
"return_dict": true,
|
192 |
+
"return_dict_in_generate": false,
|
193 |
+
"sep_token_id": null,
|
194 |
+
"task_specific_params": null,
|
195 |
+
"temperature": 1.0,
|
196 |
+
"tie_encoder_decoder": false,
|
197 |
+
"tie_word_embeddings": true,
|
198 |
+
"tokenizer_class": null,
|
199 |
+
"top_k": 50,
|
200 |
+
"top_p": 1.0,
|
201 |
+
"torch_dtype": null,
|
202 |
+
"torchscript": false,
|
203 |
+
"transformers_version": "4.9.2",
|
204 |
+
"type_vocab_size": 2,
|
205 |
+
"use_bfloat16": false,
|
206 |
+
"use_cache": true,
|
207 |
+
"vocab_size": 30522
|
208 |
+
}
|
209 |
+
|
210 |
+
{
|
211 |
+
"_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
|
212 |
+
"add_cross_attention": false,
|
213 |
+
"architectures": [
|
214 |
+
"BertForPreTraining"
|
215 |
+
],
|
216 |
+
"attention_probs_dropout_prob": 0.1,
|
217 |
+
"bad_words_ids": null,
|
218 |
+
"bos_token_id": null,
|
219 |
+
"chunk_size_feed_forward": 0,
|
220 |
+
"decoder_start_token_id": null,
|
221 |
+
"diversity_penalty": 0.0,
|
222 |
+
"do_sample": false,
|
223 |
+
"early_stopping": false,
|
224 |
+
"encoder_no_repeat_ngram_size": 0,
|
225 |
+
"eos_token_id": null,
|
226 |
+
"finetuning_task": null,
|
227 |
+
"forced_bos_token_id": null,
|
228 |
+
"forced_eos_token_id": null,
|
229 |
+
"gradient_checkpointing": false,
|
230 |
+
"hidden_act": "gelu",
|
231 |
+
"hidden_dropout_prob": 0.1,
|
232 |
+
"hidden_size": 768,
|
233 |
+
"id2label": {
|
234 |
+
"0": "LABEL_0",
|
235 |
+
"1": "LABEL_1"
|
236 |
+
},
|
237 |
+
"initializer_range": 0.02,
|
238 |
+
"intermediate_size": 3072,
|
239 |
+
"is_decoder": false,
|
240 |
+
"is_encoder_decoder": false,
|
241 |
+
"label2id": {
|
242 |
+
"LABEL_0": 0,
|
243 |
+
"LABEL_1": 1
|
244 |
+
},
|
245 |
+
"layer_norm_eps": 1e-12,
|
246 |
+
"length_penalty": 1.0,
|
247 |
+
"max_length": 20,
|
248 |
+
"max_position_embeddings": 512,
|
249 |
+
"min_length": 0,
|
250 |
+
"model_type": "bert",
|
251 |
+
"no_repeat_ngram_size": 0,
|
252 |
+
"num_attention_heads": 12,
|
253 |
+
"num_beam_groups": 1,
|
254 |
+
"num_beams": 1,
|
255 |
+
"num_hidden_layers": 12,
|
256 |
+
"num_return_sequences": 1,
|
257 |
+
"output_attentions": false,
|
258 |
+
"output_hidden_states": false,
|
259 |
+
"output_scores": false,
|
260 |
+
"pad_token_id": 0,
|
261 |
+
"position_embedding_type": "absolute",
|
262 |
+
"prefix": null,
|
263 |
+
"problem_type": null,
|
264 |
+
"pruned_heads": {},
|
265 |
+
"remove_invalid_values": false,
|
266 |
+
"repetition_penalty": 1.0,
|
267 |
+
"return_dict": true,
|
268 |
+
"return_dict_in_generate": false,
|
269 |
+
"sep_token_id": null,
|
270 |
+
"task_specific_params": null,
|
271 |
+
"temperature": 1.0,
|
272 |
+
"tie_encoder_decoder": false,
|
273 |
+
"tie_word_embeddings": true,
|
274 |
+
"tokenizer_class": null,
|
275 |
+
"top_k": 50,
|
276 |
+
"top_p": 1.0,
|
277 |
+
"torch_dtype": null,
|
278 |
+
"torchscript": false,
|
279 |
+
"transformers_version": "4.9.2",
|
280 |
+
"type_vocab_size": 2,
|
281 |
+
"use_bfloat16": false,
|
282 |
+
"use_cache": true,
|
283 |
+
"vocab_size": 30522
|
284 |
+
}
|
285 |
+
|
286 |
+
{
|
287 |
+
"_name_or_path": "data.arjmPWtGwzKrkmR/bert-base-uncased",
|
288 |
+
"add_cross_attention": false,
|
289 |
+
"architectures": [
|
290 |
+
"BertForPreTraining"
|
291 |
+
],
|
292 |
+
"attention_probs_dropout_prob": 0.1,
|
293 |
+
"bad_words_ids": null,
|
294 |
+
"bos_token_id": null,
|
295 |
+
"chunk_size_feed_forward": 0,
|
296 |
+
"decoder_start_token_id": null,
|
297 |
+
"diversity_penalty": 0.0,
|
298 |
+
"do_sample": false,
|
299 |
+
"early_stopping": false,
|
300 |
+
"encoder_no_repeat_ngram_size": 0,
|
301 |
+
"eos_token_id": null,
|
302 |
+
"finetuning_task": null,
|
303 |
+
"forced_bos_token_id": null,
|
304 |
+
"forced_eos_token_id": null,
|
305 |
+
"gradient_checkpointing": false,
|
306 |
+
"hidden_act": "gelu",
|
307 |
+
"hidden_dropout_prob": 0.1,
|
308 |
+
"hidden_size": 768,
|
309 |
+
"id2label": {
|
310 |
+
"0": "LABEL_0",
|
311 |
+
"1": "LABEL_1"
|
312 |
+
},
|
313 |
+
"initializer_range": 0.02,
|
314 |
+
"intermediate_size": 3072,
|
315 |
+
"is_decoder": false,
|
316 |
+
"is_encoder_decoder": false,
|
317 |
+
"label2id": {
|
318 |
+
"LABEL_0": 0,
|
319 |
+
"LABEL_1": 1
|
320 |
+
},
|
321 |
+
"layer_norm_eps": 1e-12,
|
322 |
+
"length_penalty": 1.0,
|
323 |
+
"max_length": 20,
|
324 |
+
"max_position_embeddings": 512,
|
325 |
+
"min_length": 0,
|
326 |
+
"model_type": "bert",
|
327 |
+
"no_repeat_ngram_size": 0,
|
328 |
+
"num_attention_heads": 12,
|
329 |
+
"num_beam_groups": 1,
|
330 |
+
"num_beams": 1,
|
331 |
+
"num_hidden_layers": 12,
|
332 |
+
"num_return_sequences": 1,
|
333 |
+
"output_attentions": false,
|
334 |
+
"output_hidden_states": false,
|
335 |
+
"output_scores": false,
|
336 |
+
"pad_token_id": 0,
|
337 |
+
"position_embedding_type": "absolute",
|
338 |
+
"prefix": null,
|
339 |
+
"problem_type": null,
|
340 |
+
"pruned_heads": {},
|
341 |
+
"remove_invalid_values": false,
|
342 |
+
"repetition_penalty": 1.0,
|
343 |
+
"return_dict": true,
|
344 |
+
"return_dict_in_generate": false,
|
345 |
+
"sep_token_id": null,
|
346 |
+
"task_specific_params": null,
|
347 |
+
"temperature": 1.0,
|
348 |
+
"tie_encoder_decoder": false,
|
349 |
+
"tie_word_embeddings": true,
|
350 |
+
"tokenizer_class": null,
|
351 |
+
"top_k": 50,
|
352 |
+
"top_p": 1.0,
|
353 |
+
"torch_dtype": null,
|
354 |
+
"torchscript": false,
|
355 |
+
"transformers_version": "4.9.2",
|
356 |
+
"type_vocab_size": 2,
|
357 |
+
"use_bfloat16": false,
|
358 |
+
"use_cache": true,
|
359 |
+
"vocab_size": 30522
|
360 |
+
}
|
361 |
+
|
362 |
+
Before loading new vocabulary: 30522
|
363 |
+
Before loading new vocabulary: 30522
|
364 |
+
After loading new vocabulary: 31061
|
365 |
+
Resize model embedding and save new tokenizer ...
|
366 |
+
Before loading new vocabulary: 30522
|
367 |
+
After loading new vocabulary: 31061
|
368 |
+
Resize model embedding and save new tokenizer ...
|
369 |
+
After loading new vocabulary: 31061
|
370 |
+
Resize model embedding and save new tokenizer ...
|
371 |
+
Invoke training ...
|
372 |
+
Invoke training ...
|
373 |
+
Invoke training ...
|
374 |
+
[caller] pretrain
|
375 |
+
[caller] pretrain
|
376 |
+
[caller] pretrain
|
377 |
+
[node#0 rank#0] Training on device cuda:0
|
378 |
+
[node#2 rank#4] Training on device cuda:0
|
379 |
+
[node#0 rank#0] 2 x Tesla V100-SXM2-16GB: 0%
|
380 |
+
[node#2 rank#4] 2 x Tesla V100-SXM2-16GB: 0%
|
381 |
+
[node#1 rank#2] Training on device cuda:0
|
382 |
+
[node#1 rank#2] 2 x Tesla V100-SXM2-16GB: 0%
|
383 |
+
[node#0 rank#1] Training on device cuda:1
|
384 |
+
[node#2 rank#5] Training on device cuda:1
|
385 |
+
[node#0 rank#1] 2 x Tesla V100-SXM2-16GB: 2%
|
386 |
+
[node#2 rank#5] 2 x Tesla V100-SXM2-16GB: 2%
|
387 |
+
[node#1 rank#3] Training on device cuda:1
|
388 |
+
[node#1 rank#3] 2 x Tesla V100-SXM2-16GB: 2%
|
389 |
+
[node#2 rank#4] Initialized process group ...
|
390 |
+
[node#0 rank#0] Initialized process group ...
|
391 |
+
[node#1 rank#2] Initialized process group ...
|
392 |
+
[node#0 rank#1] Initialized process group ...
|
393 |
+
[node#2 rank#5] Initialized process group ...
|
394 |
+
[node#1 rank#3] Initialized process group ...
|
395 |
+
[node#0 rank#0] Enter Torch DDP.
|
396 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 0 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
397 |
+
[node#2 rank#4] Enter Torch DDP.
|
398 |
+
[node#0 rank#1] Enter Torch DDP.
|
399 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 1 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
400 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 4 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
401 |
+
[node#1 rank#3] Enter Torch DDP.
|
402 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 3 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
403 |
+
[node#3 rank#7] Enter Torch DDP.
|
404 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 7 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
405 |
+
[node#2 rank#5] Enter Torch DDP.
|
406 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 5 using best-guess GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
407 |
+
[node#3 rank#6] Enter Torch DDP.
|
408 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 6 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
409 |
+
[node#1 rank#2] Enter Torch DDP.
|
410 |
+
[W ProcessGroupNCCL.cpp:1569] Rank 2 using best-guess GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device.
|
411 |
+
AdamW (
|
412 |
+
Parameter Group 0
|
413 |
+
betas: (0.9, 0.999)
|
414 |
+
correct_bias: True
|
415 |
+
eps: 1e-06
|
416 |
+
lr: 1e-06
|
417 |
+
weight_decay: 0.01
|
418 |
+
)
|
419 |
+
AdamW (
|
420 |
+
Parameter Group 0
|
421 |
+
betas: (0.9, 0.999)
|
422 |
+
correct_bias: True
|
423 |
+
eps: 1e-06
|
424 |
+
lr: 1e-06
|
425 |
+
weight_decay: 0.01
|
426 |
+
)
|
427 |
+
AdamW (
|
428 |
+
Parameter Group 0
|
429 |
+
betas: (0.9, 0.999)
|
430 |
+
correct_bias: True
|
431 |
+
eps: 1e-06
|
432 |
+
lr: 1e-06
|
433 |
+
weight_decay: 0.01
|
434 |
+
)
|
435 |
+
AdamW (
|
436 |
+
Parameter Group 0
|
437 |
+
betas: (0.9, 0.999)
|
438 |
+
correct_bias: True
|
439 |
+
eps: 1e-06
|
440 |
+
lr: 1e-06
|
441 |
+
weight_decay: 0.01
|
442 |
+
)
|
443 |
+
AdamW (
|
444 |
+
Parameter Group 0
|
445 |
+
betas: (0.9, 0.999)
|
446 |
+
correct_bias: True
|
447 |
+
eps: 1e-06
|
448 |
+
lr: 1e-06
|
449 |
+
weight_decay: 0.01
|
450 |
+
)
|
451 |
+
[node#3 rank#7] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
452 |
+
[node#3 rank#7] Start training at: (0, 0, -1)
|
453 |
+
AdamW (
|
454 |
+
Parameter Group 0
|
455 |
+
betas: (0.9, 0.999)
|
456 |
+
correct_bias: True
|
457 |
+
eps: 1e-06
|
458 |
+
lr: 1e-06
|
459 |
+
weight_decay: 0.01
|
460 |
+
)
|
461 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
462 |
+
[node#3 rank#6] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
463 |
+
[node#3 rank#6] Start training at: (0, 0, -1)
|
464 |
+
[node#3 rank#7] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
465 |
+
AdamW (
|
466 |
+
Parameter Group 0
|
467 |
+
betas: (0.9, 0.999)
|
468 |
+
correct_bias: True
|
469 |
+
eps: 1e-06
|
470 |
+
lr: 1e-06
|
471 |
+
weight_decay: 0.01
|
472 |
+
)
|
473 |
+
AdamW (
|
474 |
+
Parameter Group 0
|
475 |
+
betas: (0.9, 0.999)
|
476 |
+
correct_bias: True
|
477 |
+
eps: 1e-06
|
478 |
+
lr: 1e-06
|
479 |
+
weight_decay: 0.01
|
480 |
+
)
|
481 |
+
[node#1 rank#2] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
482 |
+
[node#1 rank#2] Start training at: (0, 0, -1)
|
483 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
484 |
+
[node#3 rank#6] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
485 |
+
[node#1 rank#3] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
486 |
+
[node#1 rank#3] Start training at: (0, 0, -1)
|
487 |
+
[node#2 rank#4] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
488 |
+
[node#2 rank#4] Start training at: (0, 0, -1)
|
489 |
+
[node#2 rank#5] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
490 |
+
[node#2 rank#5] Start training at: (0, 0, -1)
|
491 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
492 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
493 |
+
[node#2 rank#4] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
494 |
+
[node#2 rank#5] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
495 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
496 |
+
Loading test data: data.arjmPWtGwzKrkmR/test.txt (bsize=8)
|
497 |
+
[node#1 rank#3] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
498 |
+
[node#1 rank#2] Loading shard data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730 ...
|
499 |
+
[node#0 rank#1] Shards: ['data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.6632730', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7074912', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7517094', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.7959276', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8401458', 'data.arjmPWtGwzKrkmR/mse-aops-2021-data.pkl.pairs.8843640']
|
500 |
+
[node#0 rank#1] Start training at: (0, 0, -1)
|
job-25031358-tail.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e609127390ca34a2bdc307d1f66fc68b056d1550973adbee02979df8e091162e
|
3 |
+
size 442169891
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|