omrikeren committed on
Commit
41265b0
1 Parent(s): 3c9b564

Added files

Browse files
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: he
3
+ tags:
4
+ - roberta
5
+ - language model
6
+ datasets:
7
+ - oscar
8
+ ---
9
+ # TavBERT base model
10
+ A Hebrew BERT-style masked language model operating over characters, pre-trained by masking spans of characters, similarly to SpanBERT (Joshi et al., 2020).
11
+
12
+ ### How to use
13
+
14
+ ```python
15
+ import numpy as np
16
+ import torch
17
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
18
+
19
+ model = AutoModelForMaskedLM.from_pretrained("tau/tavbert-he")
20
+ tokenizer = AutoTokenizer.from_pretrained("tau/tavbert-he")
21
+
22
def mask_sentence(sent, span_len=5):
    """Mask a random span of characters in ``sent`` and print the model's guess.

    A contiguous span of ``span_len`` characters is chosen uniformly at
    random, each character in it is replaced by ``[MASK]``, and the masked
    sentence is run through the module-level ``model``. Both the masked
    sentence and the sentence with the predicted characters filled in are
    printed.

    Args:
        sent: The input sentence (a string).
        span_len: Number of consecutive characters to mask.

    Raises:
        ValueError: If ``span_len`` is negative or longer than ``sent``.
    """
    if not 0 <= span_len <= len(sent):
        raise ValueError(
            f"span_len must be between 0 and len(sent)={len(sent)}, got {span_len}"
        )

    # randint's upper bound is exclusive, so add 1 to let the span reach the
    # final character (and to allow a sentence of exactly span_len characters).
    start_pos = np.random.randint(0, len(sent) - span_len + 1)
    end_pos = start_pos + span_len

    masked_sent = sent[:start_pos] + '[MASK]' * span_len + sent[end_pos:]
    print("Masked sentence:", masked_sent)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        encoded = tokenizer.encode_plus(masked_sent, return_tensors='pt')
        # [0] selects the single sequence in the batch; [1:-1] drops the
        # [CLS] and [SEP] positions added by the tokenizer's template.
        logits = model(**encoded)['logits'][0][1:-1]

    # NOTE(review): indexing by character offsets assumes one token per
    # character and one token per '[MASK]' — confirm for unusual input.
    # argmax over the vocabulary axis; softmax is monotonic, so it is skipped.
    preds = torch.argmax(logits[start_pos:end_pos], dim=1).tolist()

    pred_sent = sent[:start_pos] + ''.join(tokenizer.convert_ids_to_tokens(preds)) + sent[end_pos:]
    print("Model's prediction:", pred_sent)
31
+ ```
32
+ ## Training data
33
+ The Hebrew section of OSCAR (Ortiz Suárez et al., 2019): 10 GB of text, 20 million sentences.
34
+
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 2050,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.6.0.dev0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 345
25
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc4df9b73af0a1806b249f634eb389582f064ae20eba1dadf69f8e2bb2fdc20
3
+ size 350059547
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[],"normalizer":null,"pre_tokenizer":{"type":"Split","pattern":{"String":""},"behavior":"Isolated","invert":false},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}}],"pair":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}},{"Sequence":{"id":"B","type_id":1}},{"SpecialToken":{"id":"[SEP]","type_id":1}}],"special_tokens":{"[CLS]":{"id":"[CLS]","ids":[0],"tokens":["[CLS]"]},"[SEP]":{"id":"[SEP]","ids":[2],"tokens":["[SEP]"]}}},"decoder":null,"model":{"type":"WordLevel","vocab":{"[CLS]":0,"[PAD]":1,"[SEP]":2,"[UNK]":3,"unused0":4,"unused1":5,"unused2":6,"unused3":7,"unused4":8,"unused5":9,"unused6":10,"unused7":11,"unused8":12,"unused9":13,"unused10":14,"unused11":15,"unused12":16,"unused13":17,"unused14":18,"unused15":19,"unused16":20,"unused17":21,"unused18":22,"unused19":23,"unused20":24,"unused21":25,"unused22":26,"unused23":27,"unused24":28,"unused25":29,"unused26":30,"unused27":31,"unused28":32,"unused29":33,"unused30":34,"unused31":35,"unused32":36,"unused33":37,"unused34":38,"unused35":39,"unused36":40,"unused37":41,"unused38":42,"unused39":43,"unused40":44,"unused41":45,"unused42":46,"unused43":47,"unused44":48,"unused45":49,"unused46":50,"unused47":51,"unused48":52,"unused49":53,"unused50":54,"unused51":55,"unused52":56,"unused53":57,"unused54":58,"unused55":59,"unused56":60,"unused57":61,"unused58":62,"unused59":63,"unused60":64,"unused61":65,"unused62":66,"unused63":67,"unused64":68,"unused65":69,"unused66":70,"unused67":71,"unused68":72,"unused69":73,"unused70":74,"unused71":75,"unused72":76,"unused73":77,"unused74":78,"unused75":79,"unused76":80,"unused77":81,"unused78":82,"unused79":83,"unused80":84,"unused81":85,"unused82":86,"unused83":87,"unused84":88,"unused85":89,"unused86":90,"unuse
d87":91,"unused88":92,"unused89":93,"unused90":94,"unused91":95,"unused92":96,"unused93":97,"unused94":98,"unused95":99,"unused96":100,"unused97":101,"unused98":102,"unused99":103," ":104,"!":105,"\"":106,"#":107,"$":108,"%":109,"&":110,"'":111,"(":112,")":113,"*":114,"+":115,",":116,"-":117,".":118,"/":119,"0":120,"1":121,"2":122,"3":123,"4":124,"5":125,"6":126,"7":127,"8":128,"9":129,":":130,";":131,"<":132,"=":133,">":134,"?":135,"@":136,"A":137,"B":138,"C":139,"D":140,"E":141,"F":142,"G":143,"H":144,"I":145,"J":146,"K":147,"L":148,"M":149,"N":150,"O":151,"P":152,"Q":153,"R":154,"S":155,"T":156,"U":157,"V":158,"W":159,"X":160,"Y":161,"Z":162,"[":163,"\\":164,"]":165,"^":166,"_":167,"`":168,"a":169,"b":170,"c":171,"d":172,"e":173,"f":174,"g":175,"h":176,"i":177,"j":178,"k":179,"l":180,"m":181,"n":182,"o":183,"p":184,"q":185,"r":186,"s":187,"t":188,"u":189,"v":190,"w":191,"x":192,"y":193,"z":194,"{":195,"|":196,"}":197,"¢":198,"¨":199,"©":200,"°":201,"³":202,"´":203,"·":204,"»":205,"×":206,"é":207,"А":208,"Б":209,"В":210,"Г":211,"Д":212,"И":213,"К":214,"М":215,"Н":216,"О":217,"П":218,"Р":219,"С":220,"Т":221,"Х":222,"Ш":223,"Э":224,"Я":225,"а":226,"б":227,"в":228,"г":229,"д":230,"е":231,"ж":232,"з":233,"и":234,"й":235,"к":236,"л":237,"м":238,"н":239,"о":240,"п":241,"р":242,"с":243,"т":244,"у":245,"ф":246,"х":247,"ц":248,"ч":249,"ш":250,"щ":251,"ы":252,"ь":253,"э":254,"ю":255,"я":256,"֑":257,"֔":258,"֖":259,"֗":260,"֙":261,"֛":262,"֣":263,"֤":264,"֥":265,"֨":266,"ְ":267,"ֱ":268,"ֲ":269,"ֳ":270,"ִ":271,"ֵ":272,"ֶ":273,"ַ":274,"ָ":275,"ֹ":276,"ֻ":277,"ּ":278,"ֽ":279,"־":280,"ׁ":281,"ׂ":282,"׃":283,"א":284,"ב":285,"ג":286,"ד":287,"ה":288,"ו":289,"ז":290,"ח":291,"ט":292,"י":293,"ך":294,"כ":295,"ל":296,"ם":297,"מ":298,"ן":299,"נ":300,"ס":301,"ע":302,"ף":303,"פ":304,"ץ":305,"צ":306,"ק":307,"ר":308,"ש":309,"ת":310,"׳":311,"״":312,"ا":313,"ب":314,"ة":315,"ت":316,"ح":317,"د":318,"ر":319,"س":320,"ع":321,"ف":322,"ق":323,"ك":324,"ل":325,"م":326,"ن":327,"و":328,"ي":329,"–":330,"—
":331,"‘":332,"’":333,"“":334,"”":335,"•":336,"…":337,"₪":338,"€":339,"™":340,"←":341,"■":342,"�":343,"[MASK]":344},"unk_token":"[UNK]"}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"special_tokens_map_file": "./special_tokens_map.json", "name_or_path": ".", "tokenizer_class": "PreTrainedTokenizerFast"}
vocab.txt ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [CLS]
2
+ [PAD]
3
+ [SEP]
4
+ [UNK]
5
+ unused0
6
+ unused1
7
+ unused2
8
+ unused3
9
+ unused4
10
+ unused5
11
+ unused6
12
+ unused7
13
+ unused8
14
+ unused9
15
+ unused10
16
+ unused11
17
+ unused12
18
+ unused13
19
+ unused14
20
+ unused15
21
+ unused16
22
+ unused17
23
+ unused18
24
+ unused19
25
+ unused20
26
+ unused21
27
+ unused22
28
+ unused23
29
+ unused24
30
+ unused25
31
+ unused26
32
+ unused27
33
+ unused28
34
+ unused29
35
+ unused30
36
+ unused31
37
+ unused32
38
+ unused33
39
+ unused34
40
+ unused35
41
+ unused36
42
+ unused37
43
+ unused38
44
+ unused39
45
+ unused40
46
+ unused41
47
+ unused42
48
+ unused43
49
+ unused44
50
+ unused45
51
+ unused46
52
+ unused47
53
+ unused48
54
+ unused49
55
+ unused50
56
+ unused51
57
+ unused52
58
+ unused53
59
+ unused54
60
+ unused55
61
+ unused56
62
+ unused57
63
+ unused58
64
+ unused59
65
+ unused60
66
+ unused61
67
+ unused62
68
+ unused63
69
+ unused64
70
+ unused65
71
+ unused66
72
+ unused67
73
+ unused68
74
+ unused69
75
+ unused70
76
+ unused71
77
+ unused72
78
+ unused73
79
+ unused74
80
+ unused75
81
+ unused76
82
+ unused77
83
+ unused78
84
+ unused79
85
+ unused80
86
+ unused81
87
+ unused82
88
+ unused83
89
+ unused84
90
+ unused85
91
+ unused86
92
+ unused87
93
+ unused88
94
+ unused89
95
+ unused90
96
+ unused91
97
+ unused92
98
+ unused93
99
+ unused94
100
+ unused95
101
+ unused96
102
+ unused97
103
+ unused98
104
+ unused99
105
+
106
+ !
107
+ "
108
+ #
109
+ $
110
+ %
111
+ &
112
+ '
113
+ (
114
+ )
115
+ *
116
+ +
117
+ ,
118
+ -
119
+ .
120
+ /
121
+ 0
122
+ 1
123
+ 2
124
+ 3
125
+ 4
126
+ 5
127
+ 6
128
+ 7
129
+ 8
130
+ 9
131
+ :
132
+ ;
133
+ <
134
+ =
135
+ >
136
+ ?
137
+ @
138
+ A
139
+ B
140
+ C
141
+ D
142
+ E
143
+ F
144
+ G
145
+ H
146
+ I
147
+ J
148
+ K
149
+ L
150
+ M
151
+ N
152
+ O
153
+ P
154
+ Q
155
+ R
156
+ S
157
+ T
158
+ U
159
+ V
160
+ W
161
+ X
162
+ Y
163
+ Z
164
+ [
165
+ \
166
+ ]
167
+ ^
168
+ _
169
+ `
170
+ a
171
+ b
172
+ c
173
+ d
174
+ e
175
+ f
176
+ g
177
+ h
178
+ i
179
+ j
180
+ k
181
+ l
182
+ m
183
+ n
184
+ o
185
+ p
186
+ q
187
+ r
188
+ s
189
+ t
190
+ u
191
+ v
192
+ w
193
+ x
194
+ y
195
+ z
196
+ {
197
+ |
198
+ }
199
+ ¢
200
+ ¨
201
+ ©
202
+ °
203
+ ³
204
+ ´
205
+ ·
206
+ »
207
+ ×
208
+ é
209
+ А
210
+ Б
211
+ В
212
+ Г
213
+ Д
214
+ И
215
+ К
216
+ М
217
+ Н
218
+ О
219
+ П
220
+ Р
221
+ С
222
+ Т
223
+ Х
224
+ Ш
225
+ Э
226
+ Я
227
+ а
228
+ б
229
+ в
230
+ г
231
+ д
232
+ е
233
+ ж
234
+ з
235
+ и
236
+ й
237
+ к
238
+ л
239
+ м
240
+ н
241
+ о
242
+ п
243
+ р
244
+ с
245
+ т
246
+ у
247
+ ф
248
+ х
249
+ ц
250
+ ч
251
+ ш
252
+ щ
253
+ ы
254
+ ь
255
+ э
256
+ ю
257
+ я
258
+ ֑
259
+ ֔
260
+ ֖
261
+ ֗
262
+ ֙
263
+ ֛
264
+ ֣
265
+ ֤
266
+ ֥
267
+ ֨
268
+ ְ
269
+ ֱ
270
+ ֲ
271
+ ֳ
272
+ ִ
273
+ ֵ
274
+ ֶ
275
+ ַ
276
+ ָ
277
+ ֹ
278
+ ֻ
279
+ ּ
280
+ ֽ
281
+ ־
282
+ ׁ
283
+ ׂ
284
+ ׃
285
+ א
286
+ ב
287
+ ג
288
+ ד
289
+ ה
290
+ ו
291
+ ז
292
+ ח
293
+ ט
294
+ י
295
+ ך
296
+ כ
297
+ ל
298
+ ם
299
+ מ
300
+ ן
301
+ נ
302
+ ס
303
+ ע
304
+ ף
305
+ פ
306
+ ץ
307
+ צ
308
+ ק
309
+ ר
310
+ ש
311
+ ת
312
+ ׳
313
+ ״
314
+ ا
315
+ ب
316
+ ة
317
+ ت
318
+ ح
319
+ د
320
+ ر
321
+ س
322
+ ع
323
+ ف
324
+ ق
325
+ ك
326
+ ل
327
+ م
328
+ ن
329
+ و
330
+ ي
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+ [MASK]