mehran committed on
Commit
be62ffe
1 Parent(s): dfd4faa

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +454 -20
model.py CHANGED
@@ -1,42 +1,464 @@
1
  import os
2
  import kenlm
3
  import sentencepiece as spm
4
- from tokenizers import normalizers
5
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  class KenlmModel:
8
  def __init__(
9
  self,
10
  vocabulary_size: str,
11
  ngram: str,
12
  pruning: str,
 
13
  normalize_nfd: bool = True,
14
  normalize_numbers: bool = True,
15
- normalize_puctuation: bool = True,
 
16
  ):
17
  self.model = kenlm.Model(os.path.join("files", f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"))
18
  self.tokenizer = spm.SentencePieceProcessor(os.path.join("files", f"jomleh-sp-{vocabulary_size}.model"))
19
 
20
  norm_list = []
21
- if normalize_numbers:
22
- norm_list += [normalizers.Replace("۱", "۰"),
23
- normalizers.Replace("۲", "۰"),
24
- normalizers.Replace("۳", "۰"),
25
- normalizers.Replace("۴", "۰"),
26
- normalizers.Replace("۵", "۰"),
27
- normalizers.Replace("۶", "۰"),
28
- normalizers.Replace("۷", "۰"),
29
- normalizers.Replace("۸", "۰"),
30
- normalizers.Replace("۹", "۰"),
31
- normalizers.Replace(".", "")]
32
- if normalize_puctuation:
33
- norm_list += [normalizers.Replace(".", ""),
34
- normalizers.Replace("!", ""),
35
- normalizers.Replace("؛", ""),
36
- normalizers.Replace("،", ""),
37
- normalizers.Replace("؟", "")]
38
  if normalize_nfd:
39
  norm_list += [normalizers.NFD()]
 
 
 
 
 
 
 
 
 
 
 
40
  norm_list += [normalizers.Strip()]
41
 
42
  self.normalizer = normalizers.Sequence(norm_list)
@@ -47,8 +469,20 @@ class KenlmModel:
47
  vocabulary_size: str,
48
  ngram: str,
49
  pruning: str,
 
 
 
 
 
50
  ):
51
- return cls(vocabulary_size, ngram, pruning)
 
 
 
 
 
 
 
52
 
53
  def score(self, doc: str):
54
  doc = self.normalizer.normalize_str(doc)
 
1
  import os
2
  import kenlm
3
  import sentencepiece as spm
4
+ from tokenizers import normalizers, Regex
5
 
6
 
7
+ # Borrowed from Jomleh dataset code
8
# Single-character normalization table: each key is exactly ONE code point that
# is rewritten to the string on the right (a base Farsi letter, a Farsi digit or
# punctuation mark, a space, a short word, or nothing).
#
# NOTE: code points above U+FFFF (the Arabic Mathematical Alphabetic Symbols
# block, U+1EE00-U+1EEFF) must be written with the 8-digit "\UXXXXXXXX" escape.
# "\u" consumes only four hex digits, so "\u1EE00" would be the two-character
# string U+1EE0 followed by "0" and would never match the intended symbol.
char_map = {
    # Arabic Letter Hamza
    # "\u": "\u0621",

    # Arabic Letter Alef with Hamza Above
    "\uFE83": "\u0623",  # isolated form
    "\uFE84": "\u0623",  # final form

    # Arabic Letter Yeh with Hamza Above
    "\uFE89": "\u0626",  # isolated form
    "\uFE8A": "\u0626",  # final form
    "\uFE8B": "\u0626",  # initial form
    "\uFE8C": "\u0626",  # medial form

    # Arabic Letter Waw with Hamza Above
    "\uFE85": "\u0624",  # isolated form
    "\uFE86": "\u0624",  # final form
    "\u0676": "\u0624",  # Arabic Letter High Hamza Waw

    # Arabic Letter Alef with Madda Above
    "\uFE81": "\u0622",  # Arabic Letter Alef with Madda Above Isolated Form
    "\uFE82": "\u0622",  # Arabic Letter Alef with Madda Above Final Form

    # Alef
    "\uFB50": "\u0627",  # Arabic Letter Alef Wasla Isolated Form
    "\uFE87": "\u0627",  # Arabic Letter Alef with Hamza Below Isolated Form
    "\u0675": "\u0627",  # Arabic Letter High Hamza Alef
    "\u0625": "\u0627",  # Arabic Letter Alef with Hamza Below
    "\uFE8D": "\u0627",  # Arabic Letter Alef Isolated Form
    "\uFE8E": "\u0627",  # Arabic Letter Alef Final Form
    "\U0001EE00": "\u0627",
    "\U0001EE80": "\u0627",

    # Beh
    "\uFE8F": "\u0628",
    "\uFE90": "\u0628",
    "\uFE91": "\u0628",
    "\uFE92": "\u0628",
    "\U0001EE01": "\u0628",
    "\U0001EE21": "\u0628",
    "\U0001EE61": "\u0628",
    "\U0001EE81": "\u0628",
    "\U0001EEA1": "\u0628",

    # Pe
    "\uFB56": "\u067E",
    "\uFB57": "\u067E",
    "\uFB58": "\u067E",
    "\uFB59": "\u067E",

    # Teh
    "\uFE95": "\u062A",
    "\uFE96": "\u062A",
    "\uFE97": "\u062A",
    "\uFE98": "\u062A",
    "\U0001EE15": "\u062A",
    "\U0001EE35": "\u062A",
    "\U0001EE75": "\u062A",
    "\U0001EE95": "\u062A",
    "\U0001EEB5": "\u062A",

    # Theh
    "\uFE99": "\u062B",
    "\uFE9A": "\u062B",
    "\uFE9B": "\u062B",
    "\uFE9C": "\u062B",
    "\U0001EE16": "\u062B",
    "\U0001EE36": "\u062B",
    "\U0001EE76": "\u062B",
    "\U0001EE96": "\u062B",
    "\U0001EEB6": "\u062B",

    # Jim
    "\uFE9D": "\u062C",
    "\uFE9E": "\u062C",
    "\uFE9F": "\u062C",
    "\uFEA0": "\u062C",
    "\U0001EE02": "\u062C",
    "\U0001EE22": "\u062C",
    "\U0001EE42": "\u062C",
    "\U0001EE62": "\u062C",
    "\U0001EE82": "\u062C",
    "\U0001EEA2": "\u062C",

    # Cheh
    "\uFB7A": "\u0686",
    "\uFB7B": "\u0686",
    "\uFB7C": "\u0686",
    "\uFB7D": "\u0686",

    # Hah
    "\uFEA1": "\u062D",
    "\uFEA2": "\u062D",
    "\uFEA3": "\u062D",
    "\uFEA4": "\u062D",
    "\U0001EE07": "\u062D",
    "\U0001EE27": "\u062D",
    "\U0001EE47": "\u062D",
    "\U0001EE67": "\u062D",
    "\U0001EE87": "\u062D",
    "\U0001EEA7": "\u062D",

    # Khah
    "\uFEA5": "\u062E",
    "\uFEA6": "\u062E",
    "\uFEA7": "\u062E",
    "\uFEA8": "\u062E",
    "\U0001EE17": "\u062E",
    "\U0001EE37": "\u062E",
    "\U0001EE57": "\u062E",
    "\U0001EE77": "\u062E",
    "\U0001EE97": "\u062E",
    "\U0001EEB7": "\u062E",

    # Dal
    "\uFEA9": "\u062F",
    "\uFEAA": "\u062F",
    "\U0001EE03": "\u062F",
    "\U0001EE83": "\u062F",
    "\U0001EEA3": "\u062F",

    # Zal
    "\uFEAB": "\u0630",
    "\uFEAC": "\u0630",
    "\U0001EE18": "\u0630",
    "\U0001EE98": "\u0630",
    "\U0001EEB8": "\u0630",

    # Reh
    "\uFEAE": "\u0631",  # Arabic Letter Reh Final Form
    "\uFEAD": "\u0631",  # Arabic Letter Reh Isolated Form
    "\u0692": "\u0631",  # Arabic Letter Reh with Small V
    "\U0001EE13": "\u0631",
    "\U0001EE93": "\u0631",
    "\U0001EEB3": "\u0631",

    # Ze
    "\uFEAF": "\u0632",
    "\uFEB0": "\u0632",
    "\U0001EE06": "\u0632",
    "\U0001EE86": "\u0632",
    "\U0001EEA6": "\u0632",

    # Jhe
    "\uFB8A": "\u0698",
    "\uFB8B": "\u0698",

    # Seen
    "\uFEB1": "\u0633",
    "\uFEB2": "\u0633",
    "\uFEB3": "\u0633",
    "\uFEB4": "\u0633",
    "\U0001EE0E": "\u0633",
    "\U0001EE2E": "\u0633",
    "\U0001EE4E": "\u0633",
    "\U0001EE6E": "\u0633",
    "\U0001EE8E": "\u0633",
    "\U0001EEAE": "\u0633",

    # Sheen
    "\uFEB5": "\u0634",
    "\uFEB6": "\u0634",
    "\uFEB7": "\u0634",
    "\uFEB8": "\u0634",
    "\U0001EE14": "\u0634",
    "\U0001EE34": "\u0634",
    "\U0001EE54": "\u0634",
    "\U0001EE74": "\u0634",
    "\U0001EE94": "\u0634",
    "\U0001EEB4": "\u0634",

    # Sad
    "\uFEB9": "\u0635",
    "\uFEBA": "\u0635",
    "\uFEBB": "\u0635",
    "\uFEBC": "\u0635",
    "\U0001EE11": "\u0635",
    "\U0001EE31": "\u0635",
    "\U0001EE51": "\u0635",
    "\U0001EE71": "\u0635",
    "\U0001EE91": "\u0635",
    "\U0001EEB1": "\u0635",

    # Zad
    "\uFEBD": "\u0636",
    "\uFEBE": "\u0636",
    "\uFEBF": "\u0636",
    "\uFEC0": "\u0636",
    "\U0001EE19": "\u0636",
    "\U0001EE39": "\u0636",
    "\U0001EE59": "\u0636",
    "\U0001EE79": "\u0636",
    "\U0001EE99": "\u0636",
    "\U0001EEB9": "\u0636",

    # Ta
    "\uFEC1": "\u0637",
    "\uFEC2": "\u0637",
    "\uFEC3": "\u0637",
    "\uFEC4": "\u0637",
    "\U0001EE08": "\u0637",
    "\U0001EE68": "\u0637",
    "\U0001EE88": "\u0637",
    "\U0001EEA8": "\u0637",

    # Za
    "\uFEC5": "\u0638",
    "\uFEC6": "\u0638",
    "\uFEC7": "\u0638",
    "\uFEC8": "\u0638",
    "\U0001EE1A": "\u0638",
    "\U0001EE7A": "\u0638",
    "\U0001EE9A": "\u0638",
    "\U0001EEBA": "\u0638",

    # Ain
    "\uFEC9": "\u0639",
    "\uFECA": "\u0639",
    "\uFECB": "\u0639",
    "\uFECC": "\u0639",
    "\U0001EE0F": "\u0639",
    "\U0001EE2F": "\u0639",
    "\U0001EE4F": "\u0639",
    "\U0001EE6F": "\u0639",
    "\U0001EE8F": "\u0639",
    "\U0001EEAF": "\u0639",

    # Ghain
    "\uFECD": "\u063A",
    "\uFECE": "\u063A",
    "\uFECF": "\u063A",
    "\uFED0": "\u063A",
    "\U0001EE1B": "\u063A",
    "\U0001EE3B": "\u063A",
    "\U0001EE5B": "\u063A",
    "\U0001EE7B": "\u063A",
    "\U0001EE9B": "\u063A",
    "\U0001EEBB": "\u063A",

    # Fa
    "\uFED1": "\u0641",
    "\uFED2": "\u0641",
    "\uFED3": "\u0641",
    "\uFED4": "\u0641",
    "\U0001EE10": "\u0641",
    "\U0001EE30": "\u0641",
    "\U0001EE70": "\u0641",
    "\U0001EE90": "\u0641",
    "\U0001EEB0": "\u0641",

    # Qaf
    "\uFED5": "\u0642",
    "\uFED6": "\u0642",
    "\uFED7": "\u0642",
    "\uFED8": "\u0642",
    "\U0001EE12": "\u0642",
    "\U0001EE32": "\u0642",
    "\U0001EE52": "\u0642",
    "\U0001EE72": "\u0642",
    "\U0001EE92": "\u0642",
    "\U0001EEB2": "\u0642",

    # Kaf
    "\uFB8E": "\u06A9",  # Arabic Letter Keheh Isolated Form
    "\uFB8F": "\u06A9",  # Arabic Letter Keheh Final Form
    "\uFB90": "\u06A9",  # Arabic Letter Keheh Initial Form
    "\uFB91": "\u06A9",  # Arabic Letter Keheh Medial Form
    # NOTE(review): this ligature is collapsed to Kaf alone, dropping the Meem.
    "\uFCC8": "\u06A9",  # Arabic Ligature Kaf with Meem Initial Form
    "\u0643": "\u06A9",  # Arabic Letter Kaf
    "\uFED9": "\u06A9",  # Arabic Letter Kaf Isolated Form
    "\uFEDA": "\u06A9",  # Arabic Letter Kaf Final Form
    "\uFEDB": "\u06A9",  # Arabic Letter Kaf Initial Form
    "\uFEDC": "\u06A9",  # Arabic Letter Kaf Medial Form
    "\U0001EE0A": "\u06A9",
    "\U0001EE2A": "\u06A9",
    "\U0001EE6A": "\u06A9",

    # Gaf
    "\uFB92": "\u06AF",  # Arabic Letter Gaf Isolated Form
    "\uFB93": "\u06AF",  # Arabic Letter Gaf Final Form
    "\uFB94": "\u06AF",  # Arabic Letter Gaf Initial Form
    "\uFB95": "\u06AF",  # Arabic Letter Gaf Medial Form

    # Lam
    # NOTE(review): this ligature is collapsed to Lam alone, dropping the Jeem.
    "\uFCC9": "\u0644",  # Arabic Ligature Lam with Jeem Initial Form
    "\uFEDD": "\u0644",  # Arabic Letter Lam Isolated Form
    "\uFEDE": "\u0644",  # Arabic Letter Lam Final Form
    "\uFEDF": "\u0644",  # Arabic Letter Lam Initial Form
    "\uFEE0": "\u0644",  # Arabic Letter Lam Medial Form
    "\U0001EE0B": "\u0644",  # Arabic Mathematical Lam
    "\U0001EE2B": "\u0644",  # Arabic Mathematical Initial Lam
    "\U0001EE4B": "\u0644",  # Arabic Mathematical Tailed Lam
    "\U0001EE8B": "\u0644",  # Arabic Mathematical Looped Lam
    "\U0001EEAB": "\u0644",  # Arabic Mathematical Double-Struck Lam

    # Mim
    "\uFEE1": "\u0645",  # Arabic Letter Meem Isolated Form
    "\uFEE2": "\u0645",  # Arabic Letter Meem Final Form
    "\uFEE3": "\u0645",  # Arabic Letter Meem Initial Form
    "\uFEE4": "\u0645",  # Arabic Letter Meem Medial Form
    "\U0001EE0C": "\u0645",  # Arabic Mathematical Meem
    "\U0001EE2C": "\u0645",  # Arabic Mathematical Initial Meem
    "\U0001EE6C": "\u0645",  # Arabic Mathematical Stretched Meem
    "\U0001EE8C": "\u0645",  # Arabic Mathematical Looped Meem
    "\U0001EEAC": "\u0645",  # Arabic Mathematical Double-Struck Meem

    # Nun
    "\uFEE5": "\u0646",  # Arabic Letter Noon Isolated Form
    "\uFEE6": "\u0646",  # Arabic Letter Noon Final Form
    "\uFEE7": "\u0646",  # Arabic Letter Noon Initial Form
    "\uFEE8": "\u0646",  # Arabic Letter Noon Medial Form
    "\U0001EE0D": "\u0646",  # Arabic Mathematical Noon
    "\U0001EE2D": "\u0646",  # Arabic Mathematical Initial Noon
    "\U0001EE4D": "\u0646",  # Arabic Mathematical Tailed Noon
    "\U0001EE6D": "\u0646",  # Arabic Mathematical Stretched Noon
    "\U0001EE8D": "\u0646",  # Arabic Mathematical Looped Noon
    "\U0001EEAD": "\u0646",  # Arabic Mathematical Double-Struck Noon

    # Vav
    "\u0677": "\u0648",  # Arabic Letter U with Hamza Above
    "\uFEED": "\u0648",  # Arabic Letter Waw Isolated Form
    "\uFEEE": "\u0648",  # Arabic Letter Waw Final Form
    "\u06C6": "\u0648",  # Arabic Letter Oe
    "\u06C7": "\u0648",  # Arabic Letter U

    # He
    "\u06C0": "\u0647",  # Arabic Letter Heh with Yeh Above
    "\u0629": "\u0647",  # Arabic Letter Teh Marbuta
    "\u06BE": "\u0647",  # Arabic Letter Heh Doachashmee
    "\uFE93": "\u0647",  # Arabic Letter Teh Marbuta Isolated Form
    "\u06D5": "\u0647",  # Arabic Letter Ae
    "\uFEE9": "\u0647",  # Arabic Letter Heh Isolated Form
    "\uFEEA": "\u0647",  # Arabic Letter Heh Final Form
    "\uFEEB": "\u0647",  # Arabic Letter Heh Initial Form
    "\uFEEC": "\u0647",  # Arabic Letter Heh Medial Form
    "\U0001EE24": "\u0647",  # Arabic Mathematical Initial Heh
    "\U0001EE64": "\u0647",  # Arabic Mathematical Stretched Heh
    "\U0001EE84": "\u0647",  # Arabic Mathematical Looped Heh

    # Yeh
    "\u06D0": "\u06CC",  # Arabic Letter E
    "\uFEEF": "\u06CC",  # Arabic Letter Alef Maksura Isolated Form
    "\uFEF3": "\u06CC",  # Arabic Letter Yeh Initial Form
    "\uFEF4": "\u06CC",  # Arabic Letter Yeh Medial Form
    "\u064A": "\u06CC",  # Arabic Letter Yeh
    "\uFEF1": "\u06CC",  # Arabic Letter Yeh Isolated Form
    "\u06CE": "\u06CC",  # Arabic Letter Yeh with Small V
    "\uFBFD": "\u06CC",  # Arabic Letter Farsi Yeh Final Form
    "\uFBFC": "\u06CC",  # Arabic Letter Farsi Yeh Isolated Form
    "\uFBFE": "\u06CC",  # Arabic Letter Farsi Yeh Initial Form
    "\uFBFF": "\u06CC",  # Arabic Letter Farsi Yeh Medial Form
    "\uFEF0": "\u06CC",  # Arabic Letter Alef Maksura Final Form
    "\uFEF2": "\u06CC",  # Arabic Letter Yeh Final Form
    "\u063D": "\u06CC",  # Arabic Letter Farsi Yeh with Inverted V
    "\u063E": "\u06CC",  # Arabic Letter Farsi Yeh with Two Dots Above
    "\u063F": "\u06CC",  # Arabic Letter Farsi Yeh with Three Dots Above
    "\u06D2": "\u06CC",  # Arabic Letter Yeh Barree

    # Diacritics (removed)
    "\u064E": "",  # Fatha
    "\u064B": "",  # Fathatan
    "\u064F": "",  # Damma
    "\u064C": "",  # Dammatan
    "\u0650": "",  # Kasra
    "\u064D": "",  # Kasratan
    "\u0652": "",  # Sukun
    "\u0651": "",  # Shadda
    "\u0654": "",  # Hamza Above

    # ASCII digits -> Farsi digits
    "0": "۰",
    "1": "۱",
    "2": "۲",
    "3": "۳",
    "4": "۴",
    "5": "۵",
    "6": "۶",
    "7": "۷",
    "8": "۸",
    "9": "۹",
    # Arabic-Indic digits -> Farsi digits
    "٠": "۰",
    "١": "۱",
    "٢": "۲",
    "٣": "۳",
    "٤": "۴",
    "٥": "۵",
    "٦": "۶",
    "٧": "۷",
    "٨": "۸",
    "٩": "۹",

    # Punctuation and symbols
    "٬": "،",
    ",": "،",
    ";": "؛",
    "?": "؟",
    "\\": " ",
    "…": " غیره ",
    "%": " درصد ",
    "\u200e": " ",  # LEFT-TO-RIGHT MARK
    "\u200f": " ",  # RIGHT-TO-LEFT MARK
    "\u202a": " ",  # LEFT-TO-RIGHT EMBEDDING
    "\u202b": " ",  # RIGHT-TO-LEFT EMBEDDING
    "\u2066": " ",  # LEFT-TO-RIGHT ISOLATE
    "\u2067": " ",  # RIGHT-TO-LEFT ISOLATE
    "\u2069": " ",  # POP DIRECTIONAL ISOLATE
    "\ufdef": " ",  # Non-standard
    "\u00B7": ".",  # MIDDLE DOT
    "\u2022": " ",  # BULLET POINT

    "'": " ",
    "“": " ",
    "”": " ",
    "\u00ad": " ",  # Soft Hyphen
    "\u005f": " ",  # Low Line (underscore)
    "\u002b": " ",  # Plus Sign
    "\u200b": " ",  # Zero Width Space
    # ©
    "\u00a9": " ",

    "\u2014": " ",  # Em Dash
    "\u2019": " ",  # Right Single Quotation Mark
    "\uFE0F": "",  # Variation Selector-16 (VS16)
    "\u007C": " ",  # Vertical Line
}
430
+
431
  class KenlmModel:
432
  def __init__(
433
  self,
434
  vocabulary_size: str,
435
  ngram: str,
436
  pruning: str,
437
+ map_to_farsi_alphabet: bool = True,
438
  normalize_nfd: bool = True,
439
  normalize_numbers: bool = True,
440
+ remove_puctuation: bool = True,
441
+ remove_non_farsi: bool = True,
442
  ):
443
  self.model = kenlm.Model(os.path.join("files", f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"))
444
  self.tokenizer = spm.SentencePieceProcessor(os.path.join("files", f"jomleh-sp-{vocabulary_size}.model"))
445
 
446
  norm_list = []
447
+ if map_to_farsi_alphabet:
448
+ norm_list += [normalizers.Replace(key, value) for key, value in char_map.items()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  if normalize_nfd:
450
  norm_list += [normalizers.NFD()]
451
+ if normalize_numbers:
452
+ norm_list += [normalizers.Replace(Regex("[۱۲۳۴۵۶۷۸۹]"), "۰")]
453
+ if remove_puctuation:
454
+ norm_list += [normalizers.Replace(Regex("[.!؛،؟]"), "")]
455
+ if remove_non_farsi:
456
+ norm_list += [normalizers.Replace(Regex("[^\u060c\u061b\u061f\u0622\u0623\u0624\u0626\u0627"
457
+ "\u0628\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631"
458
+ "\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a"
459
+ "\u0641\u0642\u0644\u0645\u0646\u0647\u0648\u067e\u0686"
460
+ "\u0698\u06a9\u06af\u06cc\u06f0\u06f1\u06f2\u06f3\u06f4"
461
+ "\u06f5\u06f6\u06f7\u06f8\u06f9\\s\u200c\\.\\!]"), "")]
462
  norm_list += [normalizers.Strip()]
463
 
464
  self.normalizer = normalizers.Sequence(norm_list)
 
469
  vocabulary_size: str,
470
  ngram: str,
471
  pruning: str,
472
+ map_to_farsi_alphabet: bool = True,
473
+ normalize_nfd: bool = True,
474
+ normalize_numbers: bool = True,
475
+ remove_puctuation: bool = True,
476
+ remove_non_farsi: bool = True,
477
  ):
478
+ return cls(vocabulary_size,
479
+ ngram,
480
+ pruning,
481
+ map_to_farsi_alphabet,
482
+ normalize_nfd,
483
+ normalize_numbers,
484
+ remove_puctuation,
485
+ remove_non_farsi)
486
 
487
  def score(self, doc: str):
488
  doc = self.normalizer.normalize_str(doc)