Shaltiel committed on
Commit
20254bf
•
1 Parent(s): 6c3937d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +438 -0
README.md CHANGED
@@ -1,3 +1,441 @@
1
  ---
2
  license: cc-by-4.0
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-4.0
3
+ language:
4
+ - he
5
+ inference: false
6
  ---
7
+ # DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew
8
+
9
+ State-of-the-art language model for Hebrew, released [here](https://arxiv.org/abs/2308.16687).
10
+
11
+ This is the fine-tuned BERT-tiny model for the joint parsing of the following tasks:
12
+
13
+ - Prefix Segmentation
14
+ - Morphological Disambiguation
15
+ - Lexicographical Analysis (Lemmatization)
16
+ - Syntactical Parsing (Dependency-Tree)
17
+ - Named-Entity Recognition
18
+
19
+ For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b).
20
+
21
+ Sample usage:
22
+
23
+ ```python
24
+ from transformers import AutoModel, AutoTokenizer
25
+
26
+ tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-tiny-joint')
27
+ model = AutoModel.from_pretrained('dicta-il/dictabert-tiny-joint', trust_remote_code=True)
28
+
29
+ model.eval()
30
+
31
+ sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
32
+ print(model.predict([sentence], tokenizer))
33
+ ```
34
+
35
+ Output:
36
+ ```json
37
+ [
38
+ {
39
+ "text": "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
40
+ "tokens": [
41
+ {
42
+ "token": "ื‘ืฉื ืช",
43
+ "syntax": {
44
+ "word": "ื‘ืฉื ืช",
45
+ "dep_head_idx": 2,
46
+ "dep_func": "obl",
47
+ "dep_head": "ื”ืฉืœื™ื"
48
+ },
49
+ "seg": [
50
+ "ื‘",
51
+ "ืฉื ืช"
52
+ ],
53
+ "lex": "ืฉื ื”",
54
+ "morph": {
55
+ "token": "ื‘ืฉื ืช",
56
+ "pos": "NOUN",
57
+ "feats": {
58
+ "Gender": "Fem",
59
+ "Number": "Sing"
60
+ },
61
+ "prefixes": [
62
+ "ADP"
63
+ ],
64
+ "suffix": false
65
+ }
66
+ },
67
+ {
68
+ "token": "1948",
69
+ "syntax": {
70
+ "word": "1948",
71
+ "dep_head_idx": 0,
72
+ "dep_func": "compound",
73
+ "dep_head": "ื‘ืฉื ืช"
74
+ },
75
+ "seg": [
76
+ "1948"
77
+ ],
78
+ "lex": "1948",
79
+ "morph": {
80
+ "token": "1948",
81
+ "pos": "NUM",
82
+ "feats": {},
83
+ "prefixes": [],
84
+ "suffix": false
85
+ }
86
+ },
87
+ {
88
+ "token": "ื”ืฉืœื™ื",
89
+ "syntax": {
90
+ "word": "ื”ืฉืœื™ื",
91
+ "dep_head_idx": -1,
92
+ "dep_func": "root",
93
+ "dep_head": "ื”ื•ืžื•ืจื™ืกื˜ื™ื™ื"
94
+ },
95
+ "seg": [
96
+ "ื”ืฉืœื™ื"
97
+ ],
98
+ "lex": "ื”ืฉืœื™ื",
99
+ "morph": {
100
+ "token": "ื”ืฉืœื™ื",
101
+ "pos": "VERB",
102
+ "feats": {
103
+ "Gender": "Masc",
104
+ "Number": "Sing",
105
+ "Person": "3",
106
+ "Tense": "Past"
107
+ },
108
+ "prefixes": [],
109
+ "suffix": false
110
+ }
111
+ },
112
+ {
113
+ "token": "ืืคืจื™ื",
114
+ "syntax": {
115
+ "word": "ืืคืจื™ื",
116
+ "dep_head_idx": 2,
117
+ "dep_func": "nsubj",
118
+ "dep_head": "ื”ืฉืœื™ื"
119
+ },
120
+ "seg": [
121
+ "ืืคืจื™ื"
122
+ ],
123
+ "lex": "ืืคืจื™ื",
124
+ "morph": {
125
+ "token": "ืืคืจื™ื",
126
+ "pos": "PROPN",
127
+ "feats": {},
128
+ "prefixes": [],
129
+ "suffix": false
130
+ }
131
+ },
132
+ {
133
+ "token": "ืงื™ืฉื•ืŸ",
134
+ "syntax": {
135
+ "word": "ืงื™ืฉื•ืŸ",
136
+ "dep_head_idx": 3,
137
+ "dep_func": "flat",
138
+ "dep_head": "ืืคืจื™ื"
139
+ },
140
+ "seg": [
141
+ "ืงื™ืฉื•ืŸ"
142
+ ],
143
+ "lex": "ืงื™ืฉื•ืŸ",
144
+ "morph": {
145
+ "token": "ืงื™ืฉื•ืŸ",
146
+ "pos": "PROPN",
147
+ "feats": {},
148
+ "prefixes": [],
149
+ "suffix": false
150
+ }
151
+ },
152
+ {
153
+ "token": "ืืช",
154
+ "syntax": {
155
+ "word": "ืืช",
156
+ "dep_head_idx": 6,
157
+ "dep_func": "case",
158
+ "dep_head": "ืœื™ืžื•ื“ื™ื•"
159
+ },
160
+ "seg": [
161
+ "ืืช"
162
+ ],
163
+ "lex": "ืืช",
164
+ "morph": {
165
+ "token": "ืืช",
166
+ "pos": "ADP",
167
+ "feats": {},
168
+ "prefixes": [],
169
+ "suffix": false
170
+ }
171
+ },
172
+ {
173
+ "token": "ืœื™ืžื•ื“ื™ื•",
174
+ "syntax": {
175
+ "word": "ืœื™ืžื•ื“ื™ื•",
176
+ "dep_head_idx": 2,
177
+ "dep_func": "obj",
178
+ "dep_head": "ื”ืฉืœื™ื"
179
+ },
180
+ "seg": [
181
+ "ืœื™ืžื•ื“ื™ื•"
182
+ ],
183
+ "lex": "ืœื™ืžื•ื“",
184
+ "morph": {
185
+ "token": "ืœื™ืžื•ื“ื™ื•",
186
+ "pos": "NOUN",
187
+ "feats": {
188
+ "Gender": "Masc",
189
+ "Number": "Plur"
190
+ },
191
+ "prefixes": [],
192
+ "suffix": "PRON",
193
+ "suffix_feats": {
194
+ "Gender": "Masc",
195
+ "Number": "Sing",
196
+ "Person": "3"
197
+ }
198
+ }
199
+ },
200
+ {
201
+ "token": "ื‘ืคื™ืกื•ืœ",
202
+ "syntax": {
203
+ "word": "ื‘ืคื™ืกื•ืœ",
204
+ "dep_head_idx": 6,
205
+ "dep_func": "nmod",
206
+ "dep_head": "ืœื™ืžื•ื“ื™ื•"
207
+ },
208
+ "seg": [
209
+ "ื‘",
210
+ "ืคื™ืกื•ืœ"
211
+ ],
212
+ "lex": "ืคื™ืกื•ืœ",
213
+ "morph": {
214
+ "token": "ื‘ืคื™ืกื•ืœ",
215
+ "pos": "NOUN",
216
+ "feats": {
217
+ "Gender": "Masc",
218
+ "Number": "Sing"
219
+ },
220
+ "prefixes": [
221
+ "ADP"
222
+ ],
223
+ "suffix": false
224
+ }
225
+ },
226
+ {
227
+ "token": "ืžืชื›ืช",
228
+ "syntax": {
229
+ "word": "ืžืชื›ืช",
230
+ "dep_head_idx": 7,
231
+ "dep_func": "compound",
232
+ "dep_head": "ื‘ืคื™ืกื•ืœ"
233
+ },
234
+ "seg": [
235
+ "ืžืชื›ืช"
236
+ ],
237
+ "lex": "ืžืชื›ืช",
238
+ "morph": {
239
+ "token": "ืžืชื›ืช",
240
+ "pos": "NOUN",
241
+ "feats": {
242
+ "Gender": "Fem",
243
+ "Number": "Sing"
244
+ },
245
+ "prefixes": [],
246
+ "suffix": false
247
+ }
248
+ },
249
+ {
250
+ "token": "ื•ื‘ืชื•ืœื“ื•ืช",
251
+ "syntax": {
252
+ "word": "ื•ื‘ืชื•ืœื“ื•ืช",
253
+ "dep_head_idx": 7,
254
+ "dep_func": "conj",
255
+ "dep_head": "ื‘ืคื™ืกื•ืœ"
256
+ },
257
+ "seg": [
258
+ "ื•ื‘",
259
+ "ืชื•ืœื“ื•ืช"
260
+ ],
261
+ "lex": "ืชื•ืœื“ื”",
262
+ "morph": {
263
+ "token": "ื•ื‘ืชื•ืœื“ื•ืช",
264
+ "pos": "NOUN",
265
+ "feats": {
266
+ "Gender": "Fem",
267
+ "Number": "Plur"
268
+ },
269
+ "prefixes": [
270
+ "CCONJ",
271
+ "ADP"
272
+ ],
273
+ "suffix": false
274
+ }
275
+ },
276
+ {
277
+ "token": "ื”ืืžื ื•ืช",
278
+ "syntax": {
279
+ "word": "ื”ืืžื ื•ืช",
280
+ "dep_head_idx": 9,
281
+ "dep_func": "compound",
282
+ "dep_head": "ื•ื‘ืชื•ืœื“ื•ืช"
283
+ },
284
+ "seg": [
285
+ "ื”",
286
+ "ืืžื ื•ืช"
287
+ ],
288
+ "lex": "ืื•ืžื ื•ืช",
289
+ "morph": {
290
+ "token": "ื”ืืžื ื•ืช",
291
+ "pos": "NOUN",
292
+ "feats": {
293
+ "Gender": "Fem",
294
+ "Number": "Sing"
295
+ },
296
+ "prefixes": [
297
+ "DET"
298
+ ],
299
+ "suffix": false
300
+ }
301
+ },
302
+ {
303
+ "token": "ื•ื”ื—ืœ",
304
+ "syntax": {
305
+ "word": "ื•ื”ื—ืœ",
306
+ "dep_head_idx": 2,
307
+ "dep_func": "conj",
308
+ "dep_head": "ื”ืฉืœื™ื"
309
+ },
310
+ "seg": [
311
+ "ื•",
312
+ "ื”ื—ืœ"
313
+ ],
314
+ "lex": "ื”ื—ืœ",
315
+ "morph": {
316
+ "token": "ื•ื”ื—ืœ",
317
+ "pos": "VERB",
318
+ "feats": {
319
+ "Gender": "Masc",
320
+ "Number": "Sing",
321
+ "Person": "3",
322
+ "Tense": "Past"
323
+ },
324
+ "prefixes": [
325
+ "CCONJ"
326
+ ],
327
+ "suffix": false
328
+ }
329
+ },
330
+ {
331
+ "token": "ืœืคืจืกื",
332
+ "syntax": {
333
+ "word": "ืœืคืจืกื",
334
+ "dep_head_idx": 11,
335
+ "dep_func": "xcomp",
336
+ "dep_head": "ื•ื”ื—ืœ"
337
+ },
338
+ "seg": [
339
+ "ืœืคืจืกื"
340
+ ],
341
+ "lex": "ืคืจืกื",
342
+ "morph": {
343
+ "token": "ืœืคืจืกื",
344
+ "pos": "VERB",
345
+ "feats": {},
346
+ "prefixes": [],
347
+ "suffix": false
348
+ }
349
+ },
350
+ {
351
+ "token": "ืžืืžืจื™ื",
352
+ "syntax": {
353
+ "word": "ืžืืžืจื™ื",
354
+ "dep_head_idx": 12,
355
+ "dep_func": "obj",
356
+ "dep_head": "ืœืคืจืกื"
357
+ },
358
+ "seg": [
359
+ "ืžืืžืจื™ื"
360
+ ],
361
+ "lex": "ืžืืžืจ",
362
+ "morph": {
363
+ "token": "ืžืืžืจื™ื",
364
+ "pos": "NOUN",
365
+ "feats": {
366
+ "Gender": "Masc",
367
+ "Number": "Plur"
368
+ },
369
+ "prefixes": [],
370
+ "suffix": false
371
+ }
372
+ },
373
+ {
374
+ "token": "ื”ื•ืžื•ืจื™ืกื˜ื™ื™ื",
375
+ "syntax": {
376
+ "word": "ื”ื•ืžื•ืจื™ืกื˜ื™ื™ื",
377
+ "dep_head_idx": 13,
378
+ "dep_func": "amod",
379
+ "dep_head": "ืžืืžืจื™ื"
380
+ },
381
+ "seg": [
382
+ "ื”ื•ืžื•ืจื™ืกื˜ื™ื™ื"
383
+ ],
384
+ "lex": "ื”ื•ืžื•ืจื™ืกื˜ื™",
385
+ "morph": {
386
+ "token": "ื”ื•ืžื•ืจื™ืกื˜ื™ื™ื",
387
+ "pos": "ADJ",
388
+ "feats": {
389
+ "Gender": "Masc",
390
+ "Number": "Plur"
391
+ },
392
+ "prefixes": [],
393
+ "suffix": false
394
+ }
395
+ }
396
+ ],
397
+ "root_idx": 2,
398
+ "ner_entities": [
399
+ {
400
+ "phrase": "1948",
401
+ "label": "TIMEX"
402
+ },
403
+ {
404
+ "phrase": "אפרים קישון",
405
+ "label": "PER"
406
+ }
407
+ ]
408
+ }
409
+ ]
410
+ ```
411
+
412
+
413
+ ## Citation
414
+
415
+ If you use DictaBERT in your research, please cite ```DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew```
416
+
417
+ **BibTeX:**
418
+
419
+ ```bibtex
420
+ @misc{shmidman2023dictabert,
421
+ title={DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew},
422
+ author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel},
423
+ year={2023},
424
+ eprint={2308.16687},
425
+ archivePrefix={arXiv},
426
+ primaryClass={cs.CL}
427
+ }
428
+ ```
429
+
430
+ ## License
431
+
432
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
433
+
434
+ This work is licensed under a
435
+ [Creative Commons Attribution 4.0 International License][cc-by].
436
+
437
+ [![CC BY 4.0][cc-by-image]][cc-by]
438
+
439
+ [cc-by]: https://creativecommons.org/licenses/by/4.0/
440
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
441
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg