Den4ikAI committed
Commit 153c03b
1 Parent(s): 532937c

Upload 66 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. dictionary/accents.json.gz +3 -0
  3. dictionary/accents_nn.json.gz +3 -0
  4. dictionary/omographs.json.gz +3 -0
  5. dictionary/rule_engine/accents.json +0 -0
  6. dictionary/rule_engine/forms.json +85 -0
  7. dictionary/yo_homographs.json.gz +3 -0
  8. dictionary/yo_omographs.json.gz +3 -0
  9. dictionary/yo_words.json.gz +3 -0
  10. koziev/rulemma/rulemma.dat +3 -0
  11. koziev/rulemma/rulemma.py +237 -0
  12. koziev/rupostagger/__init__.py +3 -0
  13. koziev/rupostagger/database/ruword2tags.db +3 -0
  14. koziev/rupostagger/rupostagger.config +11 -0
  15. koziev/rupostagger/rupostagger.model +3 -0
  16. koziev/rupostagger/rupostagger.py +173 -0
  17. koziev/rupostagger/rusyllab.py +589 -0
  18. koziev/rupostagger/ruword2tags.dat +3 -0
  19. koziev/rupostagger/ruword2tags.py +391 -0
  20. nn/nn_accent/big.onnx +3 -0
  21. nn/nn_accent/config.json +37 -0
  22. nn/nn_accent/model.onnx +3 -0
  23. nn/nn_accent/ort_config.json +30 -0
  24. nn/nn_accent/special_tokens_map.json +6 -0
  25. nn/nn_accent/tokenizer_config.json +10 -0
  26. nn/nn_accent/vocab.txt +45 -0
  27. nn/nn_omograph/big_poetry/added_tokens.json +4 -0
  28. nn/nn_omograph/big_poetry/config.json +31 -0
  29. nn/nn_omograph/big_poetry/model.onnx +3 -0
  30. nn/nn_omograph/big_poetry/special_tokens_map.json +7 -0
  31. nn/nn_omograph/big_poetry/tokenizer.json +0 -0
  32. nn/nn_omograph/big_poetry/tokenizer_config.json +15 -0
  33. nn/nn_omograph/big_poetry/vocab.txt +0 -0
  34. nn/nn_omograph/medium_poetry/added_tokens.json +4 -0
  35. nn/nn_omograph/medium_poetry/config.json +31 -0
  36. nn/nn_omograph/medium_poetry/model.onnx +3 -0
  37. nn/nn_omograph/medium_poetry/special_tokens_map.json +7 -0
  38. nn/nn_omograph/medium_poetry/tokenizer.json +0 -0
  39. nn/nn_omograph/medium_poetry/tokenizer_config.json +15 -0
  40. nn/nn_omograph/medium_poetry/vocab.txt +0 -0
  41. nn/nn_omograph/small_poetry/added_tokens.json +4 -0
  42. nn/nn_omograph/small_poetry/config.json +23 -0
  43. nn/nn_omograph/small_poetry/model.onnx +3 -0
  44. nn/nn_omograph/small_poetry/special_tokens_map.json +7 -0
  45. nn/nn_omograph/small_poetry/tokenizer.json +0 -0
  46. nn/nn_omograph/small_poetry/tokenizer_config.json +15 -0
  47. nn/nn_omograph/small_poetry/vocab.txt +0 -0
  48. nn/nn_omograph/turbo/added_tokens.json +4 -0
  49. nn/nn_omograph/turbo/config.json +28 -0
  50. nn/nn_omograph/turbo/merges.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ koziev/rulemma/rulemma.dat filter=lfs diff=lfs merge=lfs -text
+ koziev/rupostagger/database/ruword2tags.db filter=lfs diff=lfs merge=lfs -text
+ koziev/rupostagger/ruword2tags.dat filter=lfs diff=lfs merge=lfs -text
dictionary/accents.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa460ebba90de00fbbf3d41d121961f605b98667e45efb7920f127473b15515e
+ size 20954156
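The dictionary/*.json.gz files added here are gzip-compressed JSON payloads stored through Git LFS (the diff shows only the LFS pointer). As a quick orientation, a minimal sketch (not part of the commit) for reading one of them after cloning the repository with LFS enabled; nothing about the JSON structure is assumed:

    import gzip
    import json

    # Read one of the gzipped JSON dictionaries added in this commit.
    with gzip.open("dictionary/accents.json.gz", "rt", encoding="utf-8") as f:
        accents = json.load(f)

    print(type(accents), len(accents))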
dictionary/accents_nn.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8395664000b80c1afe09bfea3650945b0933482b8e3dee5bb9d429eb18c44935
+ size 845996
dictionary/omographs.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04a9e81c68d65f65ba493fe0110f99e79087548c2beeec3032e2b66e28706f36
+ size 219047
dictionary/rule_engine/accents.json ADDED
The diff for this file is too large to render. See raw diff
 
dictionary/rule_engine/forms.json ADDED
@@ -0,0 +1,85 @@
+ {
+ "diminutive": "",
+ "perfective/imperfective": "Aspect=Perf|Aspect=Imp",
+ "dative/prepositional": "Case=Dat|Case=Prep",
+ "inanimate": "Animacy=Inan",
+ "animate/inanimate": "Animacy=Anim|Animacy=Inan",
+ "dative": "Case=Dat",
+ "second-person": "Person=2",
+ "imperative": "Mood=Imp",
+ "archaic": "",
+ "nominative": "Case=Nom",
+ "locative": "Case=Loc",
+ "masculine": "Gender=Masc",
+ "female": "",
+ "canonical": "",
+ "plural": "Number=Plur",
+ "short": "Variant=Short",
+ "imperfective": "Aspect=Imp",
+ "form": "",
+ "augmentative": "",
+ "masculine/feminine": "Gender=Masc|Gender=Fem",
+ "superlative": "Degree=Sup",
+ "nominative/accusative": "Case=Nom|Case=Acc",
+ "third-person": "Person=3",
+ "nonstandard": "",
+ "genitive": "Case=Gen",
+ "feminine": "Gender=Fem",
+ "masculine/neuter": "Gender=Masc|Gender=Neut",
+ "dative/locative": "Case=Dat|Case=Loc",
+ "genitive/accusative/prepositional": "Case=Gen|Case=Acc|Case=Prep",
+ "partitive": "Case=Par",
+ "genitive/prepositional": "Case=Gen|Case=Prep",
+ "equivalent": "",
+ "endearing": "",
+ "degree": "Degree=",
+ "comparative": "Degree=Cmp",
+ "imperfective/perfective": "Aspect=Imp|Aspect=Perf",
+ "mainly": "",
+ "passive": "Voice=Pass",
+ "first-person": "Person=1",
+ "perfective": "Aspect=Perf",
+ "genitive/dative/instrumental/prepositional": "Case=Gen|Case=Dat|Case=Ins|Case=Prep",
+ "pejorative": "",
+ "accusative": "Case=Acc",
+ "spelling": "",
+ "dative/partitive": "Case=Dat|Case=Par",
+ "old-fashion": "",
+ "possessive": "Poss=Yes",
+ "dative/instrumental": "Case=Dat|Case=Ins",
+ "adverbial": "",
+ "neuter": "Gender=Neut",
+ "future": "Tense=Fut",
+ "neuter/masculine": "Gender=Neut|Gender=Masc",
+ "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+ "(singular": "Number=Sing",
+ "alternative,": "",
+ "participle": "VerbForm=Part",
+ "genitive/accusative": "Case=Gen|Case=Acc",
+ "indicative": "Mood=Ind",
+ "dative/accusative": "Case=Dat|Case=Acc",
+ "singular/plural": "Number=Sing|Number=Plur",
+ "instrumental": "Case=Ins",
+ "&": "",
+ "vocative": "Case=Voc",
+ "prepositional": "Case=Prep",
+ "active": "Voice=Act",
+ "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+ "past": "Tense=Past",
+ "first/second/third-person": "Person=1|Person=2|Person=3",
+ "second-personal": "Person=2",
+ "reflexive": "Reflex=Yes",
+ "singular": "Number=Sing",
+ "accusative/genitive": "Case=Acc|Case=Gen",
+ "acronym": "",
+ "(animated)": "Animacy=Anim",
+ "euphemistic": "",
+ "genitive/dative/prepositional": "Case=Gen|Case=Dat|Case=Prep",
+ "colloquial": "",
+ "a": "",
+ "initialism": "",
+ "present": "Tense=Pres",
+ "obsolete": "",
+ "singulative": "",
+ "animate": "Animacy=Anim"
+ }
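This file maps English form labels (Wiktionary-style descriptions such as "dative/prepositional") to Universal Dependencies feature strings, with empty values for labels that carry no grammatical feature. A small illustrative sketch, not part of the commit, of how the mapping could be applied; the helper function below is hypothetical:

    import json

    with open("dictionary/rule_engine/forms.json", encoding="utf-8") as f:
        forms = json.load(f)

    def labels_to_ud_features(labels):
        """Hypothetical helper: collect UD features for a list of form labels."""
        feats = set()
        for label in labels:
            value = forms.get(label, "")
            if value:
                feats.update(value.split("|"))  # e.g. "Case=Dat|Case=Prep"
        return sorted(feats)

    print(labels_to_ud_features(["plural", "dative/prepositional"]))
    # ['Case=Dat', 'Case=Prep', 'Number=Plur']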
dictionary/yo_homographs.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4ee777bbbab87f9eac838f370ad92974e079d02b21903e480c54b5f0c8c60d1
+ size 5747
dictionary/yo_omographs.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b91cc78dacb5a43e4d5e2e62efdbe5a57799195e5868db35282bee0d9e215a0d
+ size 7949
dictionary/yo_words.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a19fa89a964a0691d9fe4ee384783e3934904891843d8f59a1c480d67947a82a
+ size 548914
koziev/rulemma/rulemma.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf2b3ef3ff7a0aa6e4250aa4e9c8ed568e25f825deebdb12dee1b46b785ba9fc
+ size 16703198
koziev/rulemma/rulemma.py ADDED
@@ -0,0 +1,237 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Лемматизатор для R&D прототипирования NLP задач в Питоне
4
+ 25.03.2020 добавлена ефикация в get_lemma2
5
+ 05.04.2020 добавлено декодирование для частей речи CONJ, PART и PUNCT
6
+ """
7
+
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+
11
+ import os
12
+ import pickle
13
+ import pathlib
14
+ import gzip
15
+
16
+
17
+ def decode_pos(pos):
18
+ if pos in [u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ']:
19
+ return u'ГЛАГОЛ'
20
+ else:
21
+ return pos
22
+
23
+
24
+ class Lemmatizer(object):
25
+ def __init__(self):
26
+ pass
27
+
28
+ def load(self, dict_path=None):
29
+ """ Загружаем модель лемматизации, созданную отдельным скриптом builder.py """
30
+ dict_filename = 'rulemma.dat'
31
+ if dict_path is None:
32
+ module_folder = str(pathlib.Path(__file__).resolve().parent)
33
+ p = os.path.join(module_folder, '../tmp', dict_filename)
34
+ if not os.path.exists(p):
35
+ p = os.path.join(module_folder, dict_filename)
36
+ else:
37
+ p = dict_path
38
+
39
+ with gzip.open(p, 'r') as f:
40
+ self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f)
41
+
42
+ def get_lemma(self, word):
43
+ if word in self.forms:
44
+ return self.forms[word]
45
+ elif word in self.forms2:
46
+ return self.forms2[word][0]
47
+ elif word in self.special_lemmas:
48
+ return self.special_lemmas[word]
49
+ else:
50
+ return word
51
+
52
+ def decode_pos_tags(self, pos_tags):
53
+ stags1 = []
54
+ part_of_speech = u'unk'
55
+ short_tag_index = -1
56
+ for tag in pos_tags.split('|'):
57
+ if tag == 'NOUN':
58
+ part_of_speech = u'СУЩЕСТВИТЕЛЬНОЕ'
59
+ elif tag == 'VERB':
60
+ part_of_speech = u'ГЛАГОЛ'
61
+ elif tag == 'ADJ':
62
+ part_of_speech = u'ПРИЛАГАТЕЛЬНОЕ'
63
+ stags1.append((u'КРАТКИЙ', u'0'))
64
+ short_tag_index = 0
65
+ elif tag == 'ADV':
66
+ part_of_speech = u'НАРЕЧИЕ'
67
+ elif tag == 'PRON':
68
+ part_of_speech = u'МЕСТОИМЕНИЕ'
69
+ elif tag == 'ADP':
70
+ part_of_speech = u'ПРЕДЛОГ'
71
+ elif tag == 'CONJ':
72
+ part_of_speech = u'СОЮЗ'
73
+ elif tag == 'PART':
74
+ part_of_speech = u'ЧАСТИЦА'
75
+ elif tag == 'PUNCT':
76
+ part_of_speech = u'ПУНКТУАТОР'
77
+ elif '=' in tag:
78
+ if part_of_speech == u'СУЩЕСТВИТЕЛЬНОЕ':
79
+ if tag == u'Case=Nom':
80
+ stags1.append((u'ПАДЕЖ', u'ИМ'))
81
+ elif tag == u'Case=Acc':
82
+ stags1.append((u'ПАДЕЖ', u'ВИН'))
83
+ elif tag == u'Case=Dat':
84
+ stags1.append((u'ПАДЕЖ', u'ДАТ'))
85
+ elif tag == u'Case=Ins':
86
+ stags1.append((u'ПАДЕЖ', u'ТВОР'))
87
+ elif tag == u'Case=Prep':
88
+ stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
89
+ elif tag == u'Case=Loc':
90
+ stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) # 03-02-2020 u'МЕСТ'
91
+ elif tag == u'Case=Gen':
92
+ stags1.append((u'ПАДЕЖ', u'РОД'))
93
+ elif tag == u'Case=Voc':
94
+ stags1.append((u'ПАДЕЖ', u'ЗВАТ'))
95
+ elif tag == u'Number=Sing':
96
+ stags1.append((u'ЧИСЛО', u'ЕД'))
97
+ elif tag == u'Number=Plur':
98
+ stags1.append((u'ЧИСЛО', u'МН'))
99
+ elif tag == u'Gender=Masc':
100
+ stags1.append((u'РОД', u'МУЖ'))
101
+ elif tag == u'Gender=Fem':
102
+ stags1.append((u'РОД', u'ЖЕН'))
103
+ elif tag == u'Gender=Neut':
104
+ stags1.append((u'РОД', u'СР'))
105
+ else:
106
+ print(u'неизвестный тэг "{}"'.format(tag))
107
+ raise NotImplementedError()
108
+ elif part_of_speech == u'ПРИЛАГАТЕЛЬНОЕ':
109
+ if tag == u'Case=Nom':
110
+ stags1.append((u'ПАДЕЖ', u'ИМ'))
111
+ elif tag == u'Case=Acc':
112
+ stags1.append((u'ПАДЕЖ', u'ВИН'))
113
+ elif tag == u'Case=Dat':
114
+ stags1.append((u'ПАДЕЖ', u'ДАТ'))
115
+ elif tag == u'Case=Ins':
116
+ stags1.append((u'ПАДЕЖ', u'ТВОР'))
117
+ elif tag == u'Case=Prep':
118
+ stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
119
+ elif tag == u'Case=Loc':
120
+ stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) # 03-02-2020 u'МЕСТ'
121
+ elif tag == u'Case=Gen':
122
+ stags1.append((u'ПАДЕЖ', u'РОД'))
123
+ elif tag == u'Number=Sing':
124
+ stags1.append((u'ЧИСЛО', u'ЕД'))
125
+ elif tag == u'Number=Plur':
126
+ stags1.append((u'ЧИСЛО', u'МН'))
127
+ elif tag == u'Gender=Masc':
128
+ stags1.append((u'РОД', u'МУЖ'))
129
+ elif tag == u'Gender=Fem':
130
+ stags1.append((u'РОД', u'ЖЕН'))
131
+ elif tag == u'Gender=Neut':
132
+ stags1.append((u'РОД', u'СР'))
133
+ elif tag == u'Degree=Cmp':
134
+ stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
135
+ elif tag == u'Degree=Pos':
136
+ stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
137
+ elif tag in (u'Variant=Short', u'Variant=Brev'):
138
+ stags1[short_tag_index] = (u'КРАТКИЙ', u'1')
139
+ else:
140
+ print(u'неизвестный тэг "{}"'.format(tag))
141
+ raise NotImplementedError()
142
+ elif part_of_speech == u'ГЛАГОЛ':
143
+ if tag == u'Number=Sing':
144
+ stags1.append((u'ЧИСЛО', u'ЕД'))
145
+ elif tag == u'Number=Plur':
146
+ stags1.append((u'ЧИСЛО', u'МН'))
147
+ elif tag == u'Gender=Masc':
148
+ stags1.append((u'РОД', u'МУЖ'))
149
+ elif tag == u'Gender=Fem':
150
+ stags1.append((u'РОД', u'ЖЕН'))
151
+ elif tag == u'Gender=Neut':
152
+ stags1.append((u'РОД', u'СР'))
153
+ elif tag == u'Mood=Ind':
154
+ stags1.append((u'НАКЛОНЕНИЕ', u'ИЗЪЯВ'))
155
+ elif tag == u'Mood=Imp':
156
+ stags1.append((u'НАКЛОНЕНИЕ', u'ПОБУД'))
157
+ elif tag == u'Tense=Past':
158
+ stags1.append((u'ВРЕМЯ', u'ПРОШЕДШЕЕ'))
159
+ elif tag == u'Tense=Fut':
160
+ stags1.append((u'ВРЕМЯ', u'БУДУЩЕЕ'))
161
+ elif tag == u'Tense=Notpast':
162
+ stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
163
+ elif tag == u'Tense=Pres':
164
+ stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
165
+ elif tag == u'Person=1':
166
+ stags1.append((u'ЛИЦО', u'1'))
167
+ elif tag == u'Person=2':
168
+ stags1.append((u'ЛИЦО', u'2'))
169
+ elif tag == u'Person=3':
170
+ stags1.append((u'ЛИЦО', u'3'))
171
+ elif tag == u'VerbForm=Fin':
172
+ pass
173
+ elif tag == u'VerbForm=Inf':
174
+ pass
175
+ elif tag == u'VerbForm=Conv':
176
+ pass
177
+ else:
178
+ msg = u'неизвестный тэг "{}"'.format(tag)
179
+ raise RuntimeError(msg)
180
+ elif part_of_speech == u'НАРЕЧИЕ':
181
+ if tag == u'Degree=Pos':
182
+ stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
183
+ elif tag == u'Degree=Cmp':
184
+ stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
185
+ else:
186
+ raise NotImplementedError()
187
+ else:
188
+ pass
189
+
190
+ return part_of_speech, stags1
191
+
192
+ def get_lemma2(self, word, pos_tags):
193
+ part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags)
194
+
195
+ nword = word.lower().replace('ё', 'е')
196
+
197
+ if nword in self.special_lemmas:
198
+ return self.special_lemmas[nword], part_of_speech, decoded_tags
199
+
200
+ if nword in self.forms:
201
+ lemma = self.forms[nword]
202
+ return lemma, part_of_speech, decoded_tags
203
+ elif nword in self.forms2:
204
+ if part_of_speech == 'СУЩЕСТВИТЕЛЬНОЕ':
205
+ # Для существительных учитываем падеж.
206
+ required_case = None
207
+ for tag in decoded_tags:
208
+ if tag[0] == 'ПАДЕЖ':
209
+ required_case = tag[1]
210
+ break
211
+
212
+ for lemma, lemma_part_of_speech, tag in self.forms2[nword]:
213
+ if lemma_part_of_speech == part_of_speech and tag == required_case:
214
+ return lemma, part_of_speech, decoded_tags
215
+ else:
216
+ for lemma, lemma_part_of_speech, tags in self.forms2[nword]:
217
+ if lemma_part_of_speech == part_of_speech:
218
+ return lemma, part_of_speech, decoded_tags
219
+ elif len(word) > 4:
220
+ # используем модель лемматизации для OV-слов
221
+ ending = nword[-4:]
222
+ key = ending + u'|' + part_of_speech
223
+ if key in self.key2transducer:
224
+ transducer = self.key2transducer[key]
225
+ if transducer[0] > 0:
226
+ lemma = word[:-transducer[0]] + transducer[1]
227
+ else:
228
+ lemma = word + transducer[1]
229
+
230
+ return lemma.lower(), part_of_speech, decoded_tags
231
+
232
+ # fallback-вариант - возвращаем исходное слово в нижнем регистре в качестве леммы
233
+ return nword, part_of_speech, decoded_tags
234
+
235
+ def lemmatize(self, tagged_words):
236
+ """Для результата работы rupostagger'а добавляем лемму и извлеченный код части речи"""
237
+ return [(word, tags,)+tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]
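As the docstring of lemmatize() notes, rulemma is meant to consume rupostagger output. A hedged end-to-end sketch (not one of the uploaded files); the import paths assume the directory layout of this commit with the repository root on sys.path:

    from koziev.rupostagger import RuPosTagger
    from koziev.rulemma.rulemma import Lemmatizer

    tagger = RuPosTagger()
    tagger.load()                      # reads rupostagger.config, rupostagger.model, ruword2tags data

    lemmatizer = Lemmatizer()
    lemmatizer.load()                  # reads rulemma.dat

    words = u'кошки спят'.split()
    tagged = list(tagger.tag(words))   # [(word, 'NOUN|Case=Nom|...'), ...]

    # lemmatize() returns (word, tags, lemma, part_of_speech, decoded_tags) tuples.
    for word, tags, lemma, pos, decoded_tags in lemmatizer.lemmatize(tagged):
        print(word, lemma, pos)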
koziev/rupostagger/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from __future__ import absolute_import
+ from .rupostagger import RuPosTagger
+ from .rupostagger import run_tests
koziev/rupostagger/database/ruword2tags.db ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a06848e656bef642aafb4440c03554fa78f2f32dde92ea66f3f86ce9977b167e
+ size 168816640
koziev/rupostagger/rupostagger.config ADDED
@@ -0,0 +1,11 @@
+ {
+ "w2v_filename": "w2v.CBOW=1_WIN=5_DIM=64.bin",
+ "wc2v_filename": "wordchar2vector.dat",
+ "winspan": 3,
+ "use_w2v": false,
+ "use_gren": true,
+ "use_syllabs": false,
+ "use_shingles": false,
+ "ending_len": 0,
+ "model_filename": "rupostagger.model"
+ }
koziev/rupostagger/rupostagger.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21b7b0bfd7427b5fdc1604052176db8aa3b139b3ce03be440cfce48536f8e5ef
+ size 2417464
koziev/rupostagger/rupostagger.py ADDED
@@ -0,0 +1,173 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Модель частеречной разметки для русскоязычных текстов (проект https://github.com/Koziev/rupostagger)
4
+ 03.08.2019 небольшой баг с нормализацией (замена "ё" на "е") перед поиском в грамматическом словаре
5
+ """
6
+
7
+ from __future__ import print_function
8
+ from __future__ import division # for python2 compatibility
9
+
10
+ import os
11
+ import json
12
+ import pathlib
13
+ import re
14
+
15
+ import pycrfsuite
16
+ from .ruword2tags import RuWord2Tags
17
+ from .rusyllab import split_word
18
+
19
+
20
+ BEG_TOKEN = '<beg>'
21
+ END_TOKEN = '<end>'
22
+
23
+ token2tag = {BEG_TOKEN: BEG_TOKEN, END_TOKEN: END_TOKEN}
24
+
25
+
26
+ def is_num(token):
27
+ return re.match('^[0-9]+$', token)
28
+
29
+
30
+ class RuPosTagger(object):
31
+ def __init__(self):
32
+ self.winspan = -1
33
+ self.use_w2v = -1
34
+ self.use_syllabs = -1
35
+ self.ending_len = -1
36
+ self.word2tags = None
37
+
38
+ def load(self, word2tags_path=None):
39
+ module_folder = str(pathlib.Path(__file__).resolve().parent)
40
+ data_folder = os.path.join(module_folder, '../tmp')
41
+
42
+ config_path = os.path.join(data_folder, 'rupostagger.config')
43
+ if not os.path.exists(config_path):
44
+ data_folder = module_folder
45
+ config_path = os.path.join(data_folder, 'rupostagger.config')
46
+
47
+ #print('DEBUG@47 module_folder={}'.format(module_folder))
48
+ #print('DEBUG@48 data_folder={}'.format(data_folder))
49
+
50
+ with open(config_path, 'r') as rdr:
51
+ self.config = json.load(rdr)
52
+ self.winspan = self.config['winspan']
53
+ self.use_gren = self.config['use_gren']
54
+ self.use_w2v = self.config['use_w2v']
55
+ self.use_syllabs = self.config['use_syllabs']
56
+ self.ending_len = self.config['ending_len']
57
+
58
+ self.word2tags = RuWord2Tags()
59
+ self.word2tags.load(word2tags_path)
60
+
61
+ model_path = os.path.join(data_folder, 'rupostagger.model')
62
+ self.tagger = pycrfsuite.Tagger()
63
+ self.tagger.open(model_path)
64
+
65
+ @staticmethod
66
+ def __normalize_word(word):
67
+ return word.replace(' - ', '-').replace(u'ё', u'е').lower()
68
+
69
+ def get_word_features(self, word, prefix):
70
+ assert(len(word) > 0)
71
+ features = []
72
+ if word in token2tag:
73
+ features.append((u'tag[{}]={}'.format(prefix, token2tag[word]), 1.0))
74
+ elif is_num(word):
75
+ features.append((u'tag[{}]=<num> tag[{}]=<num_{}>'.format(prefix, prefix, word[-1]), 1.0))
76
+ elif len(word) == 1 and word[0] in u'‼≠™®•·[¡+<>`~;.,‚?!-…№”“„{}|‹›/\'"–—_:«»*]()‘’≈':
77
+ features.append((u'tag[{}]=punct_{}'.format(prefix, ord(word[0])), 1.0))
78
+ else:
79
+ uword = self.__normalize_word(word)
80
+ first_char = word[0]
81
+ if first_char in u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
82
+ features.append((u'word[{}]=<latin>'.format(prefix), 1.0))
83
+ else:
84
+ if first_char in u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ':
85
+ features.append((u'word[{}]=<upper1>'.format(prefix), 1.0))
86
+
87
+ if self.ending_len > 0:
88
+ ending = '~' + uword[-self.ending_len:] if len(uword) > self.ending_len else uword
89
+ features.append((u'ending[{}]={}'.format(prefix, ending), 1.0))
90
+
91
+ if self.use_syllabs and first_char.lower() in u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя':
92
+ syllabs = split_word(uword)
93
+ if len(syllabs) > 0:
94
+ if len(syllabs) == 1:
95
+ features.append((u'slb[{}]={}'.format(prefix, syllabs[0] + '~'), 1.0))
96
+ else:
97
+ features.append((u'slb[{}]={}'.format(prefix, syllabs[0]+'~'), 1.0))
98
+ for s in syllabs[1:-1]:
99
+ features.append((u'slb[{}]={}'.format(prefix, '~'+s+'~'), 1.0))
100
+ features.append((u'slb[{}]={}'.format(prefix, '~'+syllabs[-1]), 1.0))
101
+
102
+ if self.use_gren:
103
+ tags = set()
104
+ for tagset in self.word2tags[uword]:
105
+ tags.update(tagset.split(' '))
106
+
107
+ for tag in tags:
108
+ features.append((u'tag[{}]={}'.format(prefix, tag), 1.0))
109
+
110
+ return features
111
+
112
+ def vectorize_sample(self, words):
113
+ lines2 = []
114
+ nb_words = len(words)
115
+ for iword, word in enumerate(words):
116
+ word_features = dict()
117
+ for j in range(-self.winspan, self.winspan + 1):
118
+ iword2 = iword + j
119
+ if iword2 < 0:
120
+ features = [('word[{}]=<beg>'.format(j), 1.0)]
121
+ elif iword2 >= nb_words:
122
+ features = [('word[{}]=<end>'.format(j), 1.0)]
123
+ else:
124
+ features = self.get_word_features(words[iword2], str(j))
125
+ word_features.update(features)
126
+
127
+ lines2.append(word_features)
128
+
129
+ return lines2
130
+
131
+ def tag(self, words):
132
+ #X = self.vectorize_sample([BEG_TOKEN]+words+[END_TOKEN])
133
+ X = self.vectorize_sample(words)
134
+ y_pred = self.tagger.tag(X)
135
+ #return zip(words, y_pred[1: -1])
136
+ return zip(words, y_pred)
137
+
138
+
139
+ def test1(tagger, phrase, required_labels):
140
+ pred_labels = list(tagger.tag(phrase.split()))
141
+ assert(len(required_labels.split()) == len(pred_labels))
142
+ for required_label, (word, pred_label) in zip(required_labels.split(), pred_labels):
143
+ for tag in required_label.split('|'):
144
+ if tag not in pred_label:
145
+ print(u'Error: phrase={} word={} required_label={} pred_label={}'.format(phrase, word, required_label, pred_label))
146
+ return False
147
+
148
+ return True
149
+
150
+
151
+ def run_tests():
152
+ tagger = RuPosTagger()
153
+ tagger.load()
154
+
155
+ for phrase, required_labels in [(u'Кошки спят', u'NOUN|Number=Plur|Case=Nom VERB|Mood=Ind|Number=Plur|Person=3|Tense=Notpast|VerbForm=Fin'),
156
+ (u'Я рою колодец', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
157
+ (u'Я мою окно', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
158
+ (u'Ира мыла окно', u'NOUN|Case=Nom VERB NOUN|Number=Sing|Case=Acc'),
159
+ (u'Возьми мою пилу', u'VERB ADJ|Case=Acc NOUN|Case=Acc'),
160
+ (u'рой колодец', u'VERB NOUN|Number=Sing|Case=Acc'),
161
+ (u'У меня живёт черепаха', u'ADP PRON VERB NOUN'),
162
+ (u'какую еду ты любишь ?', u'ADJ NOUN PRON VERB PUNCT')
163
+ ]:
164
+ if not test1(tagger, phrase, required_labels):
165
+ print('Tests FAILED')
166
+ return
167
+
168
+ print('Tests PASSED OK')
169
+
170
+
171
+ if __name__ == '__main__':
172
+ run_tests()
173
+
koziev/rupostagger/rusyllab.py ADDED
@@ -0,0 +1,589 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # autogenerated 2019-01-19 10:52:09.746954
4
+
5
+
6
+ def V(c):
7
+ return c in u"АЕЁИОУЫЭЮЯаеёиоуыэюя"
8
+
9
+
10
+ def C(c):
11
+ return c in u"БВГДЖЗКЛМНПРСТФХЦЧШЩбвгджзклмнпрстфхцчшщ"
12
+
13
+
14
+ def S(c):
15
+ return c in u"Йй"
16
+
17
+
18
+ def M(c):
19
+ return c in u"ЪЬъь"
20
+
21
+
22
+ def BEG(c):
23
+ return c == u"["
24
+
25
+
26
+ def END(c):
27
+ return c == u"]"
28
+
29
+
30
+ def split(s):
31
+ cur_pos = 0
32
+ items = list(u"[" + s + u"]")
33
+ while cur_pos < len(items):
34
+ input_context = items[cur_pos:]
35
+ res = apply1(input_context)
36
+ if res is None:
37
+ cur_pos += 1
38
+ else:
39
+ items = items[:cur_pos] + res[0] + input_context[res[1]:]
40
+ cur_pos += res[2]
41
+ return items[1:-1]
42
+
43
+
44
+ def apply1(s):
45
+ if C(s[0]):
46
+ if V(s[1]):
47
+ if C(s[2]):
48
+ if V(s[3]):
49
+ return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_1
50
+
51
+ if C(s[3]):
52
+ if V(s[4]):
53
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_5
54
+
55
+ if C(s[4]):
56
+ if C(s[5]):
57
+ if END(s[6]):
58
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_11
59
+
60
+ if not END(s[6]):
61
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_12
62
+
63
+
64
+ if V(s[5]):
65
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_36
66
+
67
+ if END(s[5]):
68
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_120
69
+
70
+ if M(s[5]):
71
+ if END(s[6]):
72
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_330
73
+
74
+
75
+
76
+ if END(s[4]):
77
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_52
78
+
79
+ if M(s[4]):
80
+ if END(s[5]):
81
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_76
82
+
83
+ if C(s[5]):
84
+ if V(s[6]):
85
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_250
86
+
87
+
88
+ if V(s[5]):
89
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_260
90
+
91
+
92
+
93
+ if END(s[3]):
94
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_6
95
+
96
+ if M(s[3]):
97
+ if C(s[4]):
98
+ if not END(s[5]):
99
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_13
100
+
101
+ if END(s[5]):
102
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_39
103
+
104
+ if C(s[5]):
105
+ if C(s[6]):
106
+ if END(s[7]):
107
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_350
108
+
109
+
110
+
111
+
112
+ if END(s[4]):
113
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_14
114
+
115
+ if V(s[4]):
116
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_20
117
+
118
+
119
+
120
+ if END(s[2]):
121
+ return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_7
122
+
123
+ if S(s[2]):
124
+ if C(s[3]):
125
+ if V(s[4]):
126
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_8
127
+
128
+ if C(s[4]):
129
+ if END(s[5]):
130
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_9
131
+
132
+
133
+ if END(s[4]):
134
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_280
135
+
136
+ if M(s[4]):
137
+ if END(s[5]):
138
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_400
139
+
140
+
141
+
142
+ if END(s[3]):
143
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_10
144
+
145
+ return ([s[0]+s[1]+s[2]], 3, 1) # SYLLABER_64
146
+
147
+ if V(s[2]):
148
+ return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_31
149
+
150
+
151
+ if C(s[1]):
152
+ if C(s[2]):
153
+ if V(s[3]):
154
+ if C(s[4]):
155
+ if C(s[5]):
156
+ if V(s[6]):
157
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_2
158
+
159
+ if M(s[6]):
160
+ if END(s[7]):
161
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_310
162
+
163
+
164
+
165
+ if END(s[5]):
166
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_3
167
+
168
+ if V(s[5]):
169
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_4
170
+
171
+ if M(s[5]):
172
+ if C(s[6]):
173
+ if M(s[7]):
174
+ if END(s[8]):
175
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6]+s[7], s[8]], 9, 1) # SYLLABER_300
176
+
177
+
178
+
179
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_200
180
+
181
+
182
+ if S(s[4]):
183
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_54
184
+
185
+ if V(s[4]):
186
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_68
187
+
188
+ if END(s[4]):
189
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_170
190
+
191
+ return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_210
192
+
193
+ if C(s[3]):
194
+ if V(s[4]):
195
+ if S(s[5]):
196
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_220
197
+
198
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_98
199
+
200
+
201
+
202
+ if V(s[2]):
203
+ if C(s[3]):
204
+ if C(s[4]):
205
+ if V(s[5]):
206
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_15
207
+
208
+ if C(s[5]):
209
+ if C(s[6]):
210
+ if END(s[7]):
211
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_370
212
+
213
+
214
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_80
215
+
216
+ if M(s[5]):
217
+ if V(s[6]):
218
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_340
219
+
220
+ if C(s[6]):
221
+ if V(s[7]):
222
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_390
223
+
224
+
225
+
226
+ if END(s[5]):
227
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_470
228
+
229
+
230
+ if M(s[4]):
231
+ if not C(s[5]):
232
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_21
233
+
234
+ if C(s[5]):
235
+ if V(s[6]):
236
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_48
237
+
238
+ if C(s[6]):
239
+ if V(s[7]):
240
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6], s[7]], 8, 1) # SYLLABER_240
241
+
242
+
243
+
244
+
245
+ if END(s[4]):
246
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_62
247
+
248
+ if V(s[4]):
249
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_230
250
+
251
+
252
+ if V(s[3]):
253
+ if C(s[4]):
254
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_17
255
+
256
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_82
257
+
258
+ if S(s[3]):
259
+ if END(s[4]):
260
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_33
261
+
262
+ if C(s[4]):
263
+ if V(s[5]):
264
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_92
265
+
266
+ if C(s[5]):
267
+ if C(s[6]):
268
+ if END(s[7]):
269
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_450
270
+
271
+
272
+
273
+
274
+ return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_190
275
+
276
+ if END(s[3]):
277
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_66
278
+
279
+
280
+ if M(s[2]):
281
+ if V(s[3]):
282
+ if END(s[4]):
283
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_410
284
+
285
+ if C(s[4]):
286
+ if V(s[5]):
287
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_480
288
+
289
+
290
+
291
+
292
+
293
+ if M(s[1]):
294
+ if V(s[2]):
295
+ if C(s[3]):
296
+ if V(s[4]):
297
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_16
298
+
299
+ if C(s[4]):
300
+ if END(s[5]):
301
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_19
302
+
303
+ if V(s[5]):
304
+ return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_290
305
+
306
+ if C(s[5]):
307
+ if C(s[6]):
308
+ if V(s[7]):
309
+ return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_430
310
+
311
+
312
+
313
+
314
+ if END(s[4]):
315
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_22
316
+
317
+
318
+ if END(s[3]):
319
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_94
320
+
321
+
322
+ if C(s[2]):
323
+ if V(s[3]):
324
+ if S(s[4]):
325
+ if END(s[5]):
326
+ return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_320
327
+
328
+
329
+ if V(s[4]):
330
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_360
331
+
332
+
333
+
334
+
335
+
336
+
337
+ if V(s[0]):
338
+ if C(s[1]):
339
+ if C(s[2]):
340
+ if END(s[3]):
341
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_18
342
+
343
+ if V(s[3]):
344
+ return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_28
345
+
346
+ if C(s[3]):
347
+ if V(s[4]):
348
+ if C(s[5]):
349
+ return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_96
350
+
351
+ return ([s[0]+s[1], s[2], s[3], s[4]], 5, 1) # SYLLABER_50
352
+
353
+ if C(s[4]):
354
+ if V(s[5]):
355
+ return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_460
356
+
357
+
358
+
359
+ if M(s[3]):
360
+ if END(s[4]):
361
+ return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_72
362
+
363
+
364
+
365
+ if V(s[2]):
366
+ return ([s[0], s[1], s[2]], 3, 1) # SYLLABER_35
367
+
368
+ if M(s[2]):
369
+ if END(s[3]):
370
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_40
371
+
372
+ if C(s[3]):
373
+ if C(s[4]):
374
+ if V(s[5]):
375
+ return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_42
376
+
377
+
378
+ if V(s[4]):
379
+ return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_84
380
+
381
+
382
+ if V(s[3]):
383
+ return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_78
384
+
385
+
386
+ if END(s[2]):
387
+ return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_44
388
+
389
+ return ([s[0]+s[1]], 2, 1) # SYLLABER_56
390
+
391
+ if END(s[1]):
392
+ return ([s[0], s[1]], 2, 1) # SYLLABER_30
393
+
394
+ if V(s[1]):
395
+ return ([s[0], s[1]], 2, 1) # SYLLABER_34
396
+
397
+ if S(s[1]):
398
+ if END(s[2]):
399
+ return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_46
400
+
401
+ if C(s[2]):
402
+ if V(s[3]):
403
+ return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_180
404
+
405
+
406
+
407
+
408
+
409
+ if BEG(s[0]):
410
+ if C(s[1]):
411
+ if C(s[2]):
412
+ if V(s[3]):
413
+ if C(s[4]):
414
+ if END(s[5]):
415
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_23
416
+
417
+ if C(s[5]):
418
+ if END(s[6]):
419
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_60
420
+
421
+ if M(s[6]):
422
+ if END(s[7]):
423
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_74
424
+
425
+
426
+
427
+
428
+ if S(s[4]):
429
+ if END(s[5]):
430
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_24
431
+
432
+
433
+ if END(s[4]):
434
+ return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_27
435
+
436
+
437
+ if END(s[3]):
438
+ return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_70
439
+
440
+ if C(s[3]):
441
+ if C(s[4]):
442
+ if V(s[5]):
443
+ if C(s[6]):
444
+ if END(s[7]):
445
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_88
446
+
447
+
448
+
449
+
450
+ if V(s[4]):
451
+ if C(s[5]):
452
+ if M(s[6]):
453
+ if END(s[7]):
454
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_90
455
+
456
+
457
+
458
+ if END(s[5]):
459
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_140
460
+
461
+
462
+
463
+
464
+ if V(s[2]):
465
+ if C(s[3]):
466
+ if C(s[4]):
467
+ if M(s[5]):
468
+ if END(s[6]):
469
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_26
470
+
471
+
472
+ if END(s[5]):
473
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_37
474
+
475
+
476
+ if M(s[4]):
477
+ if C(s[5]):
478
+ if C(s[6]):
479
+ if END(s[7]):
480
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_440
481
+
482
+
483
+
484
+
485
+
486
+ if S(s[3]):
487
+ if C(s[4]):
488
+ if END(s[5]):
489
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_160
490
+
491
+
492
+
493
+
494
+ if END(s[2]):
495
+ return ([s[0], s[1], s[2]], 3, 2) # SYLLABER_32
496
+
497
+ if M(s[2]):
498
+ if C(s[3]):
499
+ if V(s[4]):
500
+ if END(s[5]):
501
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_58
502
+
503
+ if C(s[5]):
504
+ if END(s[6]):
505
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_100
506
+
507
+ if V(s[6]):
508
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 2) # SYLLABER_420
509
+
510
+
511
+
512
+
513
+ if V(s[3]):
514
+ if END(s[4]):
515
+ return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_86
516
+
517
+ if S(s[4]):
518
+ if END(s[5]):
519
+ return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_110
520
+
521
+
522
+ if C(s[4]):
523
+ if M(s[5]):
524
+ if END(s[6]):
525
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_150
526
+
527
+
528
+
529
+
530
+
531
+
532
+ if V(s[1]):
533
+ if C(s[2]):
534
+ if M(s[3]):
535
+ if END(s[4]):
536
+ return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_25
537
+
538
+
539
+ if END(s[3]):
540
+ return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_29
541
+
542
+ if C(s[3]):
543
+ if C(s[4]):
544
+ if C(s[5]):
545
+ if END(s[6]):
546
+ return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_130
547
+
548
+
549
+
550
+
551
+
552
+
553
+ if S(s[1]):
554
+ if V(s[2]):
555
+ if C(s[3]):
556
+ if V(s[4]):
557
+ return ([s[0], s[1]+s[2], s[3], s[4]], 5, 2) # SYLLABER_380
558
+
559
+
560
+
561
+
562
+
563
+
564
+ if __name__ == "__main__":
565
+ sx = split(u"спросил")
566
+ print(u"|".join(sx))
567
+
568
+ def split_word(word):
569
+ """
570
+ Split single word to syllables
571
+ :param word: unicode string representing Russian word
572
+ :return: list of unicode strings for syllables
573
+ """
574
+ return split(word)
575
+
576
+
577
+ def split_words(words):
578
+ """
579
+ Split the words in list to contiguous list of sillables and word separators (single space chars)
580
+ :param words: list of words (unicode strings)
581
+ :return: list of tokens - syllables and spaces
582
+ """
583
+ tokens = []
584
+ for word in words:
585
+ sx = split(word)
586
+ if len(tokens) > 0:
587
+ tokens.append(u' ')
588
+ tokens.extend(sx)
589
+ return tokens
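For reference, a short usage sketch (not in the commit) of the rule-based syllable splitter defined above; split_word() and split_words() are the public helpers at the end of the module, and the import path assumes this repository layout:

    from koziev.rupostagger.rusyllab import split_word, split_words

    print(u'|'.join(split_word(u'спросил')))       # syllables of one word, joined with '|'
    print(split_words(u'мама мыла раму'.split()))  # syllables plus single-space separators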
koziev/rupostagger/ruword2tags.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dde47b5f1d48ff899887ac07812dcabd2966e48e84646f3065bfd06627c2af58
+ size 9683765
koziev/rupostagger/ruword2tags.py ADDED
@@ -0,0 +1,391 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 19.04.2019 - при парсинге словарной базы Solarix пропускаются словоформы с
4
+ отрицательным скорингом (неупотребимые слова).
5
+
6
+ 26-10-2019 - переход на хранение части словарной базы в SQLite3
7
+
8
+ 17-06-2020 refs #1 возникает ошибка при работе из нескольких тредов, добавил check_same_thread=False
9
+
10
+ 13.06.2022 если файла БД ruword2tags.db нет, скачаем его и оставим в домашнем каталоге пользователя
11
+ """
12
+
13
+ import gzip
14
+ import pathlib
15
+ import os
16
+ import pickle
17
+ import io
18
+ import argparse
19
+ import sqlite3
20
+ import threading
21
+
22
+
23
+ def create_trie_node(char):
24
+ return char, [], dict()
25
+
26
+
27
+ def add_to_trie_node(node, next_chars, tagset_index):
28
+ if len(next_chars) == 0:
29
+ node[1].append(tagset_index)
30
+ else:
31
+ next_char = next_chars[0]
32
+ if next_char not in node[2]:
33
+ node[2][next_char] = create_trie_node(next_char)
34
+
35
+ add_to_trie_node(node[2][next_char], next_chars[1:], tagset_index)
36
+
37
+
38
+ def find_tagsets_in_trie_node(node, word):
39
+ if word:
40
+ found_tagsets = []
41
+ next_char = word[0]
42
+ if next_char in node[2]:
43
+ found_tagsets.extend(find_tagsets_in_trie_node(node[2][next_char], word[1:]))
44
+ return found_tagsets
45
+ else:
46
+ return node[1]
47
+
48
+
49
+ def trie_constructed(trie_node, tagset2id):
50
+ tagset = tuple(sorted(trie_node[1]))
51
+ if tagset in tagset2id:
52
+ id_tagsets = tagset2id[tagset]
53
+ else:
54
+ id_tagsets = len(tagset2id) + 1
55
+ tagset2id[tagset] = id_tagsets
56
+
57
+ new_children = dict()
58
+ for next_char, child in trie_node[2].items():
59
+ new_children[next_char] = trie_constructed(child, tagset2id)
60
+
61
+ return (trie_node[0], id_tagsets, new_children)
62
+
63
+
64
+
65
+ class RuWord2Tags:
66
+ dict_filename = 'ruword2tags.dat'
67
+
68
+ def __init__(self):
69
+ self.ending_len = None
70
+ self.index2tagset = None
71
+ self.ending2tagsets = None
72
+ self.trie_root = None
73
+ self.all_ending2tagsets = None
74
+ self.trie_tagsets = None
75
+ self.db_filepath = None
76
+ self.cnx = None
77
+ self.lock = threading.Lock()
78
+ self.word2tagsets_cache = dict()
79
+
80
+ def load(self, dict_path=None):
81
+ module_folder = str(pathlib.Path(__file__).resolve().parent)
82
+ self.db_filepath = os.path.join(module_folder, 'database', 'ruword2tags.db')
83
+ try:
84
+ # 17-06-2020 refs #1 возникает ошибка при работе из нескольких тредов, добавил check_same_thread=False
85
+ self.cnx = sqlite3.connect(self.db_filepath, check_same_thread=False)
86
+ except Exception as ex:
87
+ msg = u'Could not open db file "{}", error: {}'.format(self.db_filepath, ex)
88
+ raise RuntimeError(msg)
89
+
90
+ self.cnx.isolation_level = None
91
+ self.cur = self.cnx.cursor()
92
+
93
+ with open(os.path.join(module_folder,"ruword2tags.dat"), 'rb') as f:
94
+ data = pickle.load(f)
95
+ self.ending_lens = data['ending_lens']
96
+ self.index2tagset = data['index2tagset']
97
+ self.ending2tagsets = data['ending2tagsets']
98
+ self.all_ending2tagsets = data['all_ending2tagsets']
99
+ self.id2tagsets = data['id2tagsets']
100
+
101
+ if False:
102
+ trie_filepath = os.path.join(os.path.dirname(p), 'ruword2tags_trie.dat')
103
+ with gzip.open(trie_filepath, 'r') as f:
104
+ self.trie_root = pickle.load(f)
105
+
106
+
107
+ def __getitem__(self, word):
108
+ hit = False
109
+ for ending_len in self.ending_lens:
110
+ ending = word[-ending_len:] if len(word) > ending_len else u''
111
+ if ending in self.ending2tagsets:
112
+ for itagset in self.ending2tagsets[ending]:
113
+ yield self.index2tagset[itagset]
114
+ hit = True
115
+ break
116
+
117
+ if not hit:
118
+ #for itagset in find_tagsets_in_trie_node(self.trie_root, word):
119
+ # hit = True
120
+ # yield self.index2tagset[itagset]
121
+
122
+ if word in self.word2tagsets_cache:
123
+ id_tagsets = self.word2tagsets_cache[word]
124
+ for itagset in self.id2tagsets[id_tagsets]:
125
+ yield self.index2tagset[itagset]
126
+ hit = True
127
+ else:
128
+ with self.lock: # для многопоточной работы в чатботе
129
+ for r in self.cur.execute('SELECT id_tagsets FROM word_tagsets WHERE word=:word', {'word': word}):
130
+ id_tagsets = int(r[0])
131
+ self.word2tagsets_cache[word] = id_tagsets
132
+ for itagset in self.id2tagsets[id_tagsets]:
133
+ yield self.index2tagset[itagset]
134
+ hit = True
135
+
136
+ if not hit:
137
+ for ending_len in reversed(self.ending_lens):
138
+ ending = word[-ending_len:] if len(word) > ending_len else u''
139
+ if ending in self.all_ending2tagsets:
140
+ for itagset in self.all_ending2tagsets[ending]:
141
+ yield self.index2tagset[itagset]
142
+ hit = True
143
+ break
144
+
145
+
146
+ def run_tests(dict_path=None):
147
+ print('Start testing...')
148
+ word2tags = RuWord2Tags()
149
+ word2tags.load(dict_path)
150
+
151
+ cases = [(u'очень', [u'НАРЕЧИЕ СТЕПЕНЬ=АТРИБ ТИП_МОДИФ=ГЛАГ ТИП_МОДИФ=НАРЕЧ ТИП_МОДИФ=ПРИЛ']),
152
+ (u'поскорее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
153
+ (u'поскорей', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
154
+ (u'сильнее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 СТЕПЕНЬ=СРАВН']),
155
+ (u'синее', [u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ВИН РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ИМ РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД']),
156
+ (u'трахее', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ДАТ РОД=ЖЕН ЧИСЛО=ЕД', u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ПРЕДЛ РОД=ЖЕН ЧИСЛО=ЕД']),
157
+ (u'полдня', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ИМ ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
158
+ u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ВИН ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
159
+ u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=РОД ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД'
160
+ ]),
161
+ (u'а', [u'СОЮЗ', u'ЧАСТИЦА']),
162
+ (u'кошки', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН',
163
+ u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=РОД РОД=ЖЕН ЧИСЛО=ЕД']),
164
+ (u'на', [#u'ГЛАГОЛ ВИД=НЕСОВЕРШ ЛИЦО=2 НАКЛОНЕНИЕ=ПОБУД ТИП_ГЛАГОЛА=СТАТИЧ ЧИСЛО=ЕД',
165
+ u'ПРЕДЛОГ ПАДЕЖ=ВИН ПАДЕЖ=МЕСТ ПАДЕЖ=ПРЕДЛ',
166
+ #u'ЧАСТИЦА'
167
+ ]),
168
+ (u'заводим', [u'ГЛАГОЛ ВИД=НЕСОВЕРШ ВРЕМЯ=НАСТОЯЩЕЕ ЛИЦО=1 НАКЛОНЕНИЕ=ИЗЪЯВ ПАДЕЖ=ВИН ПАДЕЖ=РОД ПАДЕЖ=ТВОР ЧИСЛО=МН'])
169
+ ]
170
+
171
+ for word, required_tagsets in cases:
172
+ model_tagsets = list(word2tags[word])
173
+ if len(model_tagsets) != len(required_tagsets):
174
+ #for tagset in model_tagsets:
175
+ # print(u'DEBUG@112 word={} tagset={}'.format(word, tagset))
176
+ raise AssertionError(u'word="{}": {} tagset(s) required, {} found'.format(word, len(required_tagsets), len(model_tagsets)))
177
+
178
+ for model_tagset in model_tagsets:
179
+ if model_tagset not in required_tagsets:
180
+ raise AssertionError(u'Predicted tagset "{}" for word "{}" is not valid'.format(model_tagset, word))
181
+
182
+ print('All tests PASSED.')
183
+
184
+
185
+ def normalize_word(s):
186
+ if len(s) > 2 and s[0] == "'" and s[-1] == "'":
187
+ s = s[1:-1]
188
+
189
+ return s.replace(' - ', '-').replace('ё', 'е').strip().lower()
190
+
191
+
192
+ ignore_tags = set('ПАДЕЖВАЛ:РОД МОДАЛЬНЫЙ:0 ПЕРЕЧИСЛИМОСТЬ:ДА ПЕРЕХОДНОСТЬ:ПЕРЕХОДНЫЙ ПЕРЕХОДНОСТЬ:НЕПЕРЕХОДНЫЙ ПАДЕЖВАЛ:ТВОР ПАДЕЖВАЛ:ИМ ПАДЕЖВАЛ:ДАТ ПАДЕЖВАЛ:ВИН СГД_ВРЕМЯ:Начать ВОЗВРАТНОСТЬ:0 ВОЗВРАТНОСТЬ:1'.split())
193
+
194
+
195
+ def clean_tagset(tagset):
196
+ return ' '.join(t for t in tagset.split() if t not in ignore_tags).replace(':', '=')
197
+
198
+
199
+ if __name__ == '__main__':
200
+ parser = argparse.ArgumentParser(description='Сборка грамматического словаря')
201
+ parser.add_argument('--src', type=str, default='../data/word2tags.dat', help='Source grammatical dictionary file path')
202
+ parser.add_argument('--output', type=str, default='../output/ruword2tags.dat', help='Result dictionary file path')
203
+ parser.add_argument('--words', type=str, help='List of known words (all dictionary words are included by default)')
204
+
205
+ args = parser.parse_args()
206
+ knownwords_file = args.words
207
+ word2tags_path = args.src
208
+ output_file = args.output
209
+
210
+ # Строим словарь из исходных данных
211
+
212
+ known_words = None
213
+ if knownwords_file is not None:
214
+ # Загружаем из указанного файла список слов, которые попадут в итоговую модель.
215
+ print('Загружаем список слов для сборки кастомного словаря из {}'.format(knownwords_file))
216
+ known_words = set()
217
+ with io.open(knownwords_file, 'r', encoding='utf-8') as rdr:
218
+ for line in rdr:
219
+ word = line.replace(chr(65279), '').strip()
220
+ known_words.add(word.lower())
221
+ print('Загружено {} слов из {}'.format(len(known_words), knownwords_file))
222
+
223
+ word2tagsets = dict()
224
+ tagset2index = dict()
225
+ nb_words = 0
226
+ filter_negative_scores = True
227
+ print('Loading dictionary from {}'.format(word2tags_path))
228
+
229
+ # В первом проходе по списку словоформ отберем формы, которые будем игнорировать из-за присвоенной
230
+ # им частоты < 0. Если все варианты распознавания слова имеют присвоенную частоту < 0, то не будем отсекать
231
+ # такие формы.
232
+ wordform2max_score = dict()
233
+ with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
234
+ for line in rdr:
235
+ tx = line.replace(chr(65279), '').strip().split('\t')
236
+ if len(tx) == 5:
237
+ score = int(tx[4])
238
+ word = normalize_word(tx[0])
239
+ wordform2max_score[word] = max(score, wordform2max_score.get(word, -1000000))
240
+
241
+ # Основной, второй проход.
242
+ with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
243
+ for line in rdr:
244
+ tx = line.replace(chr(65279), '').strip().split('\t')
245
+ if len(tx) == 5:
246
+ word = normalize_word(tx[0])
247
+ if filter_negative_scores and wordform2max_score[word] >= 0 and int(tx[4]) < 0:
248
+ # пропускаем формы, которые помечены как редкие или неграмматические (частотность < 0),
249
+ # и для которых есть альтернативы с частотой >= 0.
250
+ continue
251
+
252
+ if known_words is None or word in known_words:
253
+ pos = tx[1]
254
+ lemma = normalize_word(tx[2])
255
+ tags = clean_tagset(tx[3]) if len(tx) == 5 else u''
256
+
257
+ tagset = (pos + ' ' + tags).strip()
258
+
259
+ if tagset not in tagset2index:
260
+ tagset2index[tagset] = len(tagset2index)
261
+
262
+ itagset = tagset2index[tagset]
263
+
264
+ if word not in word2tagsets:
265
+ word2tagsets[word] = [itagset]
266
+ else:
267
+ word2tagsets[word].append(itagset)
268
+
269
+ nb_words += 1
270
+
271
+ print('Number of wordentries={}'.format(nb_words))
272
+ print('Number of tagsets={}'.format(len(tagset2index)))
273
+
274
+ for word in u'а и у с к'.split():
275
+ assert(word in word2tagsets)
276
+
277
+ ending_lens = [3, 4, 5]
278
+ processed_words = set()
279
+ ending2tagsets = dict()
280
+ all_ending2tagsets = dict()
281
+
282
+ for ending_len in ending_lens:
283
+ print('Start processing ending_len={}'.format(ending_len))
284
+ e2tagsets = dict()
285
+ for word, tagsets in word2tagsets.items():
286
+ if word not in processed_words and len(word) > ending_len:
287
+ ending = word[-ending_len:]
288
+ if ending not in e2tagsets:
289
+ e2tagsets[ending] = set(tagsets)
290
+ else:
291
+ e2tagsets[ending].update(tagsets)
292
+
293
+ all_ending2tagsets.update(e2tagsets)
294
+ print('Number of distinct endings={}'.format(len(e2tagsets)))
295
+
296
+ # Уберем окончания, которые дают списки тегов хотя бы с 1 ошибкой
297
+ bad_endings = set()
298
+ for word, word_tagsets in word2tagsets.items():
299
+ if word not in processed_words and len(word) > ending_len:
300
+ ending = word[-ending_len:]
301
+ ending_tagsets = e2tagsets[ending]
302
+ if set(word_tagsets) != ending_tagsets:
303
+ bad_endings.add(ending)
304
+
305
+ print('Number of bad endings={}'.format(len(bad_endings)))
306
+
307
+ e2tagsets = dict(filter(lambda z: z[0] not in bad_endings, e2tagsets.items()))
308
+
309
+ # Теперь пометим слова, которые подходят под оставшиеся хорошие окончания.
310
+ nb_matched_words = 0
311
+ for word in word2tagsets.keys():
312
+ if len(word) > ending_len:
313
+ ending = word[-ending_len:]
314
+ if ending in e2tagsets:
315
+ processed_words.add(word)
316
+ nb_matched_words += 1
317
+
318
+ print('nb_matched_words={}'.format(nb_matched_words))
319
+
320
+ # Переносим оставшиеся хорошие окончания в основной список
321
+ ending2tagsets.update(e2tagsets)
322
+
323
+ print('Number of good endings={}'.format(len(ending2tagsets)))
324
+ print('Number of all endings={}'.format(len(all_ending2tagsets)))
325
+
326
+ print('Building TRIE for {} words...'.format(len(word2tagsets)))
327
+ trie_words = []
328
+ for word, word_tagsets in word2tagsets.items():
329
+ if word not in processed_words:
330
+ # Слово не было обработано окончаниями.
331
+ for itagset in word_tagsets:
332
+ trie_words.append((word, itagset))
333
+
334
+ trie_root = create_trie_node('')
335
+ for word, itagset in trie_words:
336
+ add_to_trie_node(trie_root, word, itagset)
337
+
338
+ print('Number of words in TRIE={}'.format(len(trie_words)))
339
+
340
+ index2tagset = dict((i, t) for (t, i) in tagset2index.items())
341
+
342
+ trie_tagsets = dict()
343
+ trie_root = trie_constructed(trie_root, trie_tagsets)
344
+
345
+ db_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags.db')
346
+ print('Writing "{}"...'.format(db_filepath))
347
+ with sqlite3.connect(db_filepath) as cnx:
348
+ cursor = cnx.cursor()
349
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='word_tagsets'")
350
+ if not cursor.fetchone():
351
+ cnx.execute('CREATE TABLE word_tagsets(word TEXT NOT NULL PRIMARY KEY, id_tagsets INT not null)')
352
+ else:
353
+ cnx.execute('DELETE FROM word_tagsets')
354
+
355
+ for word, word_tagsets in word2tagsets.items():
356
+ if word not in processed_words:
357
+ tagsets2 = tuple(sorted(word_tagsets))
358
+ id_tagsets = trie_tagsets[tagsets2]
359
+ cursor.execute("INSERT INTO word_tagsets(word, id_tagsets) VALUES(:word, :tagsets)",
360
+ {'word': word, 'tagsets': id_tagsets})
361
+
362
+ cnx.commit()
363
+
364
+ lexicon_data = {'ending_lens': ending_lens,
365
+ 'index2tagset': index2tagset,
366
+ 'ending2tagsets': ending2tagsets,
367
+ 'all_ending2tagsets': all_ending2tagsets,
368
+ 'id2tagsets': dict((id, tagsets) for (tagsets, id) in trie_tagsets.items())
369
+ }
370
+
371
+ print('Writing "{}"...'.format(output_file))
372
+ with open(output_file, 'wb') as f:
373
+ pickle.dump(lexicon_data, f, protocol=2)
374
+
375
+ trie_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags_trie.dat')
376
+ print('Writing "{}"...'.format(trie_filepath))
377
+ with gzip.open(trie_filepath, 'wb') as f:
378
+ pickle.dump(trie_root, f)
379
+
380
+ #print('Сохранен файл словаря размером {:d} Мб'.format(int(os.path.getsize(output_file)/1000000)))
381
+ print('All data stored.')
382
+
383
+ # Теперь запускаем проверки для построенного словаря
384
+ run_tests(output_file)
385
+
386
+ word2tags = RuWord2Tags()
387
+ word2tags.load(output_file)
388
+
389
+ for word in u'кошки ккошки на'.split():
390
+ for i, tagset in enumerate(word2tags[word]):
391
+ print(u'{}[{}] => {}'.format(word, i, tagset))
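A brief usage sketch (not part of the upload) mirroring the lookup loop at the end of the module: load() opens ruword2tags.dat and database/ruword2tags.db next to the module, and indexing the object yields grammatical tagset strings for a word form. The import path assumes this repository layout:

    from koziev.rupostagger.ruword2tags import RuWord2Tags

    word2tags = RuWord2Tags()
    word2tags.load()

    for i, tagset in enumerate(word2tags[u'кошки']):
        print(u'{}[{}] => {}'.format(u'кошки', i, tagset))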
nn/nn_accent/big.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47e69d9ae19f2a82e21b1c70f6a4bbfb1abc5759e98b2e67d009c5e9d7af18c9
+ size 2285217
nn/nn_accent/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "_name_or_path": "onnx_out",
+ "architectures": [
+ "RoFormerForTokenClassification"
+ ],
+ "attention_probs_dropout_prob": 0.2,
+ "embedding_size": 128,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.2,
+ "hidden_size": 128,
+ "id2label": {
+ "0": "NO",
+ "1": "STRESS_PRIMARY",
+ "2": "STRESS_SECONDARY"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 256,
+ "label2id": {
+ "NO": 0,
+ "STRESS_PRIMARY": 1,
+ "STRESS_SECONDARY": 2
+ },
+ "layer_norm_eps": 1e-12,
+ "max_length": 40,
+ "max_position_embeddings": 60,
+ "max_relative_positions": 60,
+ "model_type": "roformer",
+ "num_attention_heads": 8,
+ "num_hidden_layers": 4,
+ "pad_token_id": 0,
+ "relative_attention": true,
+ "rotary_value": false,
+ "transformers_version": "4.29.2",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 45
+ }
nn/nn_accent/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e393144e45626f6f1062a0784ef06f921b97321a8e7b87ac2a09a892286500a
+ size 803402
nn/nn_accent/ort_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "one_external_file": true,
+ "opset": null,
+ "optimization": {},
+ "optimum_version": "1.8.5",
+ "quantization": {
+ "activations_dtype": "QUInt8",
+ "activations_symmetric": false,
+ "format": "QOperator",
+ "is_static": false,
+ "mode": "IntegerOps",
+ "nodes_to_exclude": [],
+ "nodes_to_quantize": [],
+ "operators_to_quantize": [
+ "MatMul",
+ "Add"
+ ],
+ "per_channel": false,
+ "qdq_add_pair_to_weight": false,
+ "qdq_dedicated_pair": false,
+ "qdq_op_type_per_channel_support_to_axis": {
+ "MatMul": 1
+ },
+ "reduce_range": false,
+ "weights_dtype": "QInt8",
+ "weights_symmetric": true
+ },
+ "transformers_version": "4.29.2",
+ "use_external_data_format": false
+ }
nn/nn_accent/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[bos]",
+   "eos_token": "[eos]",
+   "pad_token": "[pad]",
+   "unk_token": "[unk]"
+ }
nn/nn_accent/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token": "[bos]",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "[eos]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[pad]",
+   "tokenizer_class": "CharTokenizer",
+   "unk_token": "[unk]"
+ }
nn/nn_accent/vocab.txt ADDED
@@ -0,0 +1,45 @@
+ [pad]
+ [unk]
+ [bos]
+ [eos]
+ '
+ -
+ .
+ ?
+ `
+ c
+ e
+ ́
+ а
+ б
+ в
+ г
+ д
+ е
+ ж
+ з
+ и
+ й
+ к
+ л
+ м
+ н
+ о
+ п
+ р
+ с
+ т
+ у
+ ф
+ х
+ ц
+ ч
+ ш
+ щ
+ ъ
+ ы
+ ь
+ э
+ ю
+ я
+ ё
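
Taken together, the nn_accent files above describe a compact character-level RoFormer token classifier: a 45-symbol vocab.txt, per-character labels NO / STRESS_PRIMARY / STRESS_SECONDARY, the full-precision model.onnx plus the dynamically quantized export documented in ort_config.json. A rough sketch of calling it directly with onnxruntime follows; the [bos]/[eos] framing, the ONNX input names, and the example word are assumptions for illustration, not the package's API:

import numpy as np
import onnxruntime as ort

# Character-to-id table straight from vocab.txt (one token per line, as listed above).
vocab = {tok: i for i, tok in enumerate(open('nn/nn_accent/vocab.txt', encoding='utf-8').read().splitlines())}

word = 'молоко'                                   # hypothetical example word
ids = [vocab['[bos]']] + [vocab.get(ch, vocab['[unk]']) for ch in word.lower()] + [vocab['[eos]']]

sess = ort.InferenceSession('nn/nn_accent/model.onnx')
feeds = {}
for inp in sess.get_inputs():                     # fill whatever inputs this export actually expects
    if inp.name == 'input_ids':
        feeds[inp.name] = np.array([ids], dtype=np.int64)
    else:                                         # attention_mask, token_type_ids, ...
        fill = np.ones if inp.name == 'attention_mask' else np.zeros
        feeds[inp.name] = fill((1, len(ids)), dtype=np.int64)

logits = sess.run(None, feeds)[0][0]              # (seq_len, 3) scores per character position
stress_pos = int(np.argmax(logits[1:-1, 1]))      # strongest STRESS_PRIMARY score among the letters
print(word, '-> primary stress on letter index', stress_pos)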
nn/nn_omograph/big_poetry/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</w>": 120139,
+   "<w>": 120138
+ }
nn/nn_omograph/big_poetry/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "rubert_base/",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "transformers_version": "4.29.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 120140
+ }
nn/nn_omograph/big_poetry/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7d1d58e5ad908f4187d3c44f640106b721e293ec954c9c4603abc25ba5f7e8a
+ size 713508364
nn/nn_omograph/big_poetry/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/big_poetry/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nn/nn_omograph/big_poetry/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/big_poetry/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
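
The big_poetry files form a rubert-base sequence classifier for homograph resolution; the <w>/</w> entries in added_tokens.json suggest the ambiguous word is marked inline before classification. A hedged sketch of calling it directly is shown below — the example sentence, the marker placement, and the meaning of the class indices are assumptions, since the label mapping lives in the surrounding package rather than in these files:

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

folder = 'nn/nn_omograph/big_poetry'
tok = AutoTokenizer.from_pretrained(folder)

text = 'Я сменил старые <w>замки</w> на новые.'   # hypothetical input with the homograph marked
enc = tok(text, return_tensors='np')

sess = ort.InferenceSession(f'{folder}/model.onnx')
feeds = {inp.name: enc[inp.name].astype(np.int64) for inp in sess.get_inputs() if inp.name in enc}
logits = sess.run(None, feeds)[0]
print('predicted class index:', int(np.argmax(logits, axis=-1)[0]))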
 
nn/nn_omograph/medium_poetry/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</w>": 64001,
+   "<w>": 64000
+ }
nn/nn_omograph/medium_poetry/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "SRUElectra-medium/checkpoint-4500000/",
+   "architectures": [
+     "ElectraForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "embedding_size": 576,
+   "generator_size": "0.25",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 576,
+   "initializer_range": 0.02,
+   "intermediate_size": 2304,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "electra",
+   "num_attention_heads": 9,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "summary_activation": "gelu",
+   "summary_last_dropout": 0.1,
+   "summary_type": "first",
+   "summary_use_proj": true,
+   "transformers_version": "4.29.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 64002
+ }
nn/nn_omograph/medium_poetry/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:689752e4bff9eb0b8837482d9ea724f72356aab19822c2e4ae3de6b5a2fc08b1
+ size 341725861
nn/nn_omograph/medium_poetry/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/medium_poetry/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nn/nn_omograph/medium_poetry/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "ElectraTokenizer",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/medium_poetry/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
nn/nn_omograph/small_poetry/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</w>": 30523,
+   "<w>": 30522
+ }
nn/nn_omograph/small_poetry/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "output/checkpoint-440000/",
+   "activation": "gelu",
+   "architectures": [
+     "DistilBertForSequenceClassification"
+   ],
+   "attention_dropout": 0.1,
+   "dim": 264,
+   "dropout": 0.1,
+   "hidden_dim": 792,
+   "initializer_range": 0.02,
+   "max_position_embeddings": 512,
+   "model_type": "distilbert",
+   "n_heads": 12,
+   "n_layers": 3,
+   "pad_token_id": 0,
+   "problem_type": "single_label_classification",
+   "qa_dropout": 0.1,
+   "seq_classif_dropout": 0.2,
+   "sinusoidal_pos_embds": false,
+   "transformers_version": "4.29.2",
+   "vocab_size": 30524
+ }
nn/nn_omograph/small_poetry/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcea1b8d8c164276d2e593d53261ca3c21d6fc9fed4f04abb8f69e2b95ba842d
+ size 41532079
nn/nn_omograph/small_poetry/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/small_poetry/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nn/nn_omograph/small_poetry/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
nn/nn_omograph/small_poetry/vocab.txt ADDED
Binary file (382 kB). View file
 
nn/nn_omograph/turbo/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</w>": 50257,
+   "<w>": 50256
+ }
nn/nn_omograph/turbo/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "rudeberta_distilled/checkpoint-220000/",
+   "architectures": [
+     "DebertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-07,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 768,
+   "pos_att_type": null,
+   "position_biased_input": true,
+   "relative_attention": false,
+   "transformers_version": "4.28.1",
+   "type_vocab_size": 0,
+   "vocab_size": 50258
+ }
nn/nn_omograph/turbo/merges.txt ADDED
The diff for this file is too large to render. See raw diff