Upload 66 files
This view is limited to 50 files because it contains too many changes.
- .gitattributes +3 -0
- dictionary/accents.json.gz +3 -0
- dictionary/accents_nn.json.gz +3 -0
- dictionary/omographs.json.gz +3 -0
- dictionary/rule_engine/accents.json +0 -0
- dictionary/rule_engine/forms.json +85 -0
- dictionary/yo_homographs.json.gz +3 -0
- dictionary/yo_omographs.json.gz +3 -0
- dictionary/yo_words.json.gz +3 -0
- koziev/rulemma/rulemma.dat +3 -0
- koziev/rulemma/rulemma.py +237 -0
- koziev/rupostagger/__init__.py +3 -0
- koziev/rupostagger/database/ruword2tags.db +3 -0
- koziev/rupostagger/rupostagger.config +11 -0
- koziev/rupostagger/rupostagger.model +3 -0
- koziev/rupostagger/rupostagger.py +173 -0
- koziev/rupostagger/rusyllab.py +589 -0
- koziev/rupostagger/ruword2tags.dat +3 -0
- koziev/rupostagger/ruword2tags.py +391 -0
- nn/nn_accent/big.onnx +3 -0
- nn/nn_accent/config.json +37 -0
- nn/nn_accent/model.onnx +3 -0
- nn/nn_accent/ort_config.json +30 -0
- nn/nn_accent/special_tokens_map.json +6 -0
- nn/nn_accent/tokenizer_config.json +10 -0
- nn/nn_accent/vocab.txt +45 -0
- nn/nn_omograph/big_poetry/added_tokens.json +4 -0
- nn/nn_omograph/big_poetry/config.json +31 -0
- nn/nn_omograph/big_poetry/model.onnx +3 -0
- nn/nn_omograph/big_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/big_poetry/tokenizer.json +0 -0
- nn/nn_omograph/big_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/big_poetry/vocab.txt +0 -0
- nn/nn_omograph/medium_poetry/added_tokens.json +4 -0
- nn/nn_omograph/medium_poetry/config.json +31 -0
- nn/nn_omograph/medium_poetry/model.onnx +3 -0
- nn/nn_omograph/medium_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/medium_poetry/tokenizer.json +0 -0
- nn/nn_omograph/medium_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/medium_poetry/vocab.txt +0 -0
- nn/nn_omograph/small_poetry/added_tokens.json +4 -0
- nn/nn_omograph/small_poetry/config.json +23 -0
- nn/nn_omograph/small_poetry/model.onnx +3 -0
- nn/nn_omograph/small_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/small_poetry/tokenizer.json +0 -0
- nn/nn_omograph/small_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/small_poetry/vocab.txt +0 -0
- nn/nn_omograph/turbo/added_tokens.json +4 -0
- nn/nn_omograph/turbo/config.json +28 -0
- nn/nn_omograph/turbo/merges.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+koziev/rulemma/rulemma.dat filter=lfs diff=lfs merge=lfs -text
+koziev/rupostagger/database/ruword2tags.db filter=lfs diff=lfs merge=lfs -text
+koziev/rupostagger/ruword2tags.dat filter=lfs diff=lfs merge=lfs -text
dictionary/accents.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa460ebba90de00fbbf3d41d121961f605b98667e45efb7920f127473b15515e
+size 20954156
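The *.json.gz entries in this commit are Git LFS pointer files: only the version line, SHA-256 object id and byte size are stored in the repository, while the gzipped JSON payload itself lives on the LFS server. Once the real objects are pulled, a dictionary like this can be read with the standard library. The sketch below is a minimal illustration, not part of the uploaded code, and it assumes the archive holds a single UTF-8 JSON document (the key layout of accents.json is not shown in this commit).

import gzip
import json

def load_gz_json(path):
    # Assumes the LFS object has been fetched and is one UTF-8 JSON document.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return json.load(f)

accents = load_gz_json('dictionary/accents.json.gz')
print(type(accents), len(accents))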
dictionary/accents_nn.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8395664000b80c1afe09bfea3650945b0933482b8e3dee5bb9d429eb18c44935
+size 845996
dictionary/omographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04a9e81c68d65f65ba493fe0110f99e79087548c2beeec3032e2b66e28706f36
+size 219047
dictionary/rule_engine/accents.json
ADDED
The diff for this file is too large to render.
dictionary/rule_engine/forms.json
ADDED
@@ -0,0 +1,85 @@
+{
+    "diminutive": "",
+    "perfective/imperfective": "Aspect=Perf|Aspect=Imp",
+    "dative/prepositional": "Case=Dat|Case=Prep",
+    "inanimate": "Animacy=Inan",
+    "animate/inanimate": "Animacy=Anim|Animacy=Inan",
+    "dative": "Case=Dat",
+    "second-person": "Person=2",
+    "imperative": "Mood=Imp",
+    "archaic": "",
+    "nominative": "Case=Nom",
+    "locative": "Case=Loc",
+    "masculine": "Gender=Masc",
+    "female": "",
+    "canonical": "",
+    "plural": "Number=Plur",
+    "short": "Variant=Short",
+    "imperfective": "Aspect=Imp",
+    "form": "",
+    "augmentative": "",
+    "masculine/feminine": "Gender=Masc|Gender=Fem",
+    "superlative": "Degree=Sup",
+    "nominative/accusative": "Case=Nom|Case=Acc",
+    "third-person": "Person=3",
+    "nonstandard": "",
+    "genitive": "Case=Gen",
+    "feminine": "Gender=Fem",
+    "masculine/neuter": "Gender=Masc|Gender=Neut",
+    "dative/locative": "Case=Dat|Case=Loc",
+    "genitive/accusative/prepositional": "Case=Gen|Case=Acc|Case=Prep",
+    "partitive": "Case=Par",
+    "genitive/prepositional": "Case=Gen|Case=Prep",
+    "equivalent": "",
+    "endearing": "",
+    "degree": "Degree=",
+    "comparative": "Degree=Cmp",
+    "imperfective/perfective": "Aspect=Imp|Aspect=Perf",
+    "mainly": "",
+    "passive": "Voice=Pass",
+    "first-person": "Person=1",
+    "perfective": "Aspect=Perf",
+    "genitive/dative/instrumental/prepositional": "Case=Gen|Case=Dat|Case=Ins|Case=Prep",
+    "pejorative": "",
+    "accusative": "Case=Acc",
+    "spelling": "",
+    "dative/partitive": "Case=Dat|Case=Par",
+    "old-fashion": "",
+    "possessive": "Poss=Yes",
+    "dative/instrumental": "Case=Dat|Case=Ins",
+    "adverbial": "",
+    "neuter": "Gender=Neut",
+    "future": "Tense=Fut",
+    "neuter/masculine": "Gender=Neut|Gender=Masc",
+    "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+    "(singular": "Number=Sing",
+    "alternative,": "",
+    "participle": "VerbForm=Part",
+    "genitive/accusative": "Case=Gen|Case=Acc",
+    "indicative": "Mood=Ind",
+    "dative/accusative": "Case=Dat|Case=Acc",
+    "singular/plural": "Number=Sing|Number=Plur",
+    "instrumental": "Case=Ins",
+    "&": "",
+    "vocative": "Case=Voc",
+    "prepositional": "Case=Prep",
+    "active": "Voice=Act",
+    "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+    "past": "Tense=Past",
+    "first/second/third-person": "Person=1|Person=2|Person=3",
+    "second-personal": "Person=2",
+    "reflexive": "Reflex=Yes",
+    "singular": "Number=Sing",
+    "accusative/genitive": "Case=Acc|Case=Gen",
+    "acronym": "",
+    "(animated)": "Animacy=Anim",
+    "euphemistic": "",
+    "genitive/dative/prepositional": "Case=Gen|Case=Dat|Case=Prep",
+    "colloquial": "",
+    "a": "",
+    "initialism": "",
+    "present": "Tense=Pres",
+    "obsolete": "",
+    "singulative": "",
+    "animate": "Animacy=Anim"
+}
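forms.json maps the form labels used by the rule engine's dictionaries (Wiktionary-style labels such as "dative", "plural", "perfective/imperfective") to Universal Dependencies feature strings, with empty values for labels that carry no UD feature. The sketch below shows one way such a mapping could be applied; the function name and the idea of splitting a mapped value on "|" into alternatives are illustrative assumptions, not part of the uploaded code.

import json

with open('dictionary/rule_engine/forms.json', 'r', encoding='utf-8') as f:
    label2feats = json.load(f)

def labels_to_features(labels):
    # Collect UD feature alternatives for each known, non-empty label.
    feats = []
    for label in labels:
        mapped = label2feats.get(label, '')
        if mapped:
            # "Case=Nom|Case=Acc" means either case is acceptable.
            feats.append(mapped.split('|'))
    return feats

print(labels_to_features(['plural', 'nominative/accusative']))
# -> [['Number=Plur'], ['Case=Nom', 'Case=Acc']]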
dictionary/yo_homographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4ee777bbbab87f9eac838f370ad92974e079d02b21903e480c54b5f0c8c60d1
+size 5747
dictionary/yo_omographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b91cc78dacb5a43e4d5e2e62efdbe5a57799195e5868db35282bee0d9e215a0d
+size 7949
dictionary/yo_words.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a19fa89a964a0691d9fe4ee384783e3934904891843d8f59a1c480d67947a82a
+size 548914
koziev/rulemma/rulemma.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf2b3ef3ff7a0aa6e4250aa4e9c8ed568e25f825deebdb12dee1b46b785ba9fc
+size 16703198
koziev/rulemma/rulemma.py
ADDED
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+"""
+Lemmatizer for R&D prototyping of NLP tasks in Python
+25.03.2020 added 'ё' -> 'е' normalization in get_lemma2
+05.04.2020 added decoding for the parts of speech CONJ, PART and PUNCT
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import pickle
+import pathlib
+import gzip
+
+
+def decode_pos(pos):
+    if pos in [u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ']:
+        return u'ГЛАГОЛ'
+    else:
+        return pos
+
+
+class Lemmatizer(object):
+    def __init__(self):
+        pass
+
+    def load(self, dict_path=None):
+        """ Load the lemmatization model built by the separate builder.py script """
+        dict_filename = 'rulemma.dat'
+        if dict_path is None:
+            module_folder = str(pathlib.Path(__file__).resolve().parent)
+            p = os.path.join(module_folder, '../tmp', dict_filename)
+            if not os.path.exists(p):
+                p = os.path.join(module_folder, dict_filename)
+        else:
+            p = dict_path
+
+        with gzip.open(p, 'r') as f:
+            self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f)
+
+    def get_lemma(self, word):
+        if word in self.forms:
+            return self.forms[word]
+        elif word in self.forms2:
+            return self.forms2[word][0]
+        elif word in self.special_lemmas:
+            return self.special_lemmas[word]
+        else:
+            return word
+
+    def decode_pos_tags(self, pos_tags):
+        stags1 = []
+        part_of_speech = u'unk'
+        short_tag_index = -1
+        for tag in pos_tags.split('|'):
+            if tag == 'NOUN':
+                part_of_speech = u'СУЩЕСТВИТЕЛЬНОЕ'
+            elif tag == 'VERB':
+                part_of_speech = u'ГЛАГОЛ'
+            elif tag == 'ADJ':
+                part_of_speech = u'ПРИЛАГАТЕЛЬНОЕ'
+                stags1.append((u'КРАТКИЙ', u'0'))
+                short_tag_index = 0
+            elif tag == 'ADV':
+                part_of_speech = u'НАРЕЧИЕ'
+            elif tag == 'PRON':
+                part_of_speech = u'МЕСТОИМЕНИЕ'
+            elif tag == 'ADP':
+                part_of_speech = u'ПРЕДЛОГ'
+            elif tag == 'CONJ':
+                part_of_speech = u'СОЮЗ'
+            elif tag == 'PART':
+                part_of_speech = u'ЧАСТИЦА'
+            elif tag == 'PUNCT':
+                part_of_speech = u'ПУНКТУАТОР'
+            elif '=' in tag:
+                if part_of_speech == u'СУЩЕСТВИТЕЛЬНОЕ':
+                    if tag == u'Case=Nom':
+                        stags1.append((u'ПАДЕЖ', u'ИМ'))
+                    elif tag == u'Case=Acc':
+                        stags1.append((u'ПАДЕЖ', u'ВИН'))
+                    elif tag == u'Case=Dat':
+                        stags1.append((u'ПАДЕЖ', u'ДАТ'))
+                    elif tag == u'Case=Ins':
+                        stags1.append((u'ПАДЕЖ', u'ТВОР'))
+                    elif tag == u'Case=Prep':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
+                    elif tag == u'Case=Loc':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))  # 03-02-2020 u'МЕСТ'
+                    elif tag == u'Case=Gen':
+                        stags1.append((u'ПАДЕЖ', u'РОД'))
+                    elif tag == u'Case=Voc':
+                        stags1.append((u'ПАДЕЖ', u'ЗВАТ'))
+                    elif tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    else:
+                        print(u'неизвестный тэг "{}"'.format(tag))
+                        raise NotImplementedError()
+                elif part_of_speech == u'ПРИЛАГАТЕЛЬНОЕ':
+                    if tag == u'Case=Nom':
+                        stags1.append((u'ПАДЕЖ', u'ИМ'))
+                    elif tag == u'Case=Acc':
+                        stags1.append((u'ПАДЕЖ', u'ВИН'))
+                    elif tag == u'Case=Dat':
+                        stags1.append((u'ПАДЕЖ', u'ДАТ'))
+                    elif tag == u'Case=Ins':
+                        stags1.append((u'ПАДЕЖ', u'ТВОР'))
+                    elif tag == u'Case=Prep':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
+                    elif tag == u'Case=Loc':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))  # 03-02-2020 u'МЕСТ'
+                    elif tag == u'Case=Gen':
+                        stags1.append((u'ПАДЕЖ', u'РОД'))
+                    elif tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    elif tag == u'Degree=Cmp':
+                        stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
+                    elif tag == u'Degree=Pos':
+                        stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
+                    elif tag in (u'Variant=Short', u'Variant=Brev'):
+                        stags1[short_tag_index] = (u'КРАТКИЙ', u'1')
+                    else:
+                        print(u'неизвестный тэг "{}"'.format(tag))
+                        raise NotImplementedError()
+                elif part_of_speech == u'ГЛАГОЛ':
+                    if tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    elif tag == u'Mood=Ind':
+                        stags1.append((u'НАКЛОНЕНИЕ', u'ИЗЪЯВ'))
+                    elif tag == u'Mood=Imp':
+                        stags1.append((u'НАКЛОНЕНИЕ', u'ПОБУД'))
+                    elif tag == u'Tense=Past':
+                        stags1.append((u'ВРЕМЯ', u'ПРОШЕДШЕЕ'))
+                    elif tag == u'Tense=Fut':
+                        stags1.append((u'ВРЕМЯ', u'БУДУЩЕЕ'))
+                    elif tag == u'Tense=Notpast':
+                        stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
+                    elif tag == u'Tense=Pres':
+                        stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
+                    elif tag == u'Person=1':
+                        stags1.append((u'ЛИЦО', u'1'))
+                    elif tag == u'Person=2':
+                        stags1.append((u'ЛИЦО', u'2'))
+                    elif tag == u'Person=3':
+                        stags1.append((u'ЛИЦО', u'3'))
+                    elif tag == u'VerbForm=Fin':
+                        pass
+                    elif tag == u'VerbForm=Inf':
+                        pass
+                    elif tag == u'VerbForm=Conv':
+                        pass
+                    else:
+                        msg = u'неизвестный тэг "{}"'.format(tag)
+                        raise RuntimeError(msg)
+                elif part_of_speech == u'НАРЕЧИЕ':
+                    if tag == u'Degree=Pos':
+                        stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
+                    elif tag == u'Degree=Cmp':
+                        stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
+                    else:
+                        raise NotImplementedError()
+                else:
+                    pass
+
+        return part_of_speech, stags1
+
+    def get_lemma2(self, word, pos_tags):
+        part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags)
+
+        nword = word.lower().replace('ё', 'е')
+
+        if nword in self.special_lemmas:
+            return self.special_lemmas[nword], part_of_speech, decoded_tags
+
+        if nword in self.forms:
+            lemma = self.forms[nword]
+            return lemma, part_of_speech, decoded_tags
+        elif nword in self.forms2:
+            if part_of_speech == 'СУЩЕСТВИТЕЛЬНОЕ':
+                # For nouns, take the case into account.
+                required_case = None
+                for tag in decoded_tags:
+                    if tag[0] == 'ПАДЕЖ':
+                        required_case = tag[1]
+                        break
+
+                for lemma, lemma_part_of_speech, tag in self.forms2[nword]:
+                    if lemma_part_of_speech == part_of_speech and tag == required_case:
+                        return lemma, part_of_speech, decoded_tags
+            else:
+                for lemma, lemma_part_of_speech, tags in self.forms2[nword]:
+                    if lemma_part_of_speech == part_of_speech:
+                        return lemma, part_of_speech, decoded_tags
+        elif len(word) > 4:
+            # use the lemmatization model for OOV words
+            ending = nword[-4:]
+            key = ending + u'|' + part_of_speech
+            if key in self.key2transducer:
+                transducer = self.key2transducer[key]
+                if transducer[0] > 0:
+                    lemma = word[:-transducer[0]] + transducer[1]
+                else:
+                    lemma = word + transducer[1]
+
+                return lemma.lower(), part_of_speech, decoded_tags
+
+        # fallback: return the original word in lower case as the lemma
+        return nword, part_of_speech, decoded_tags
+
+    def lemmatize(self, tagged_words):
+        """For rupostagger output, add the lemma and the extracted part-of-speech code"""
+        return [(word, tags,)+tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]
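Lemmatizer.lemmatize() is written to consume rupostagger output: a list of (word, tags) pairs where tags is a '|'-joined string of a POS label plus Case=/Number=/... features, and each result row is the 5-tuple (word, tags, lemma, part_of_speech, decoded_tags). A minimal usage sketch follows; the import paths assume the koziev folder is importable as a package and that rulemma.dat and the tagger data have been pulled from LFS, so adjust them to your setup. The example sentence and whitespace tokenization are only illustrative.

from koziev.rupostagger import RuPosTagger        # assumed import path
from koziev.rulemma.rulemma import Lemmatizer     # assumed import path

tagger = RuPosTagger()
tagger.load()            # reads rupostagger.config, rupostagger.model and the ruword2tags data

lemmatizer = Lemmatizer()
lemmatizer.load()        # reads rulemma.dat next to the module

tokens = u'Кошки спят'.split()
tagged = list(tagger.tag(tokens))                 # [(word, 'NOUN|Case=Nom|...'), ...]
for word, tags, lemma, pos, feats in lemmatizer.lemmatize(tagged):
    print(word, lemma, pos, feats)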
koziev/rupostagger/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+from .rupostagger import RuPosTagger
+from .rupostagger import run_tests
koziev/rupostagger/database/ruword2tags.db
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a06848e656bef642aafb4440c03554fa78f2f32dde92ea66f3f86ce9977b167e
+size 168816640
koziev/rupostagger/rupostagger.config
ADDED
@@ -0,0 +1,11 @@
+{
+    "w2v_filename": "w2v.CBOW=1_WIN=5_DIM=64.bin",
+    "wc2v_filename": "wordchar2vector.dat",
+    "winspan": 3,
+    "use_w2v": false,
+    "use_gren": true,
+    "use_syllabs": false,
+    "use_shingles": false,
+    "ending_len": 0,
+    "model_filename": "rupostagger.model"
+}
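Read against RuPosTagger.load() in rupostagger.py below, these config fields act as feature switches: winspan is the half-width of the token window used when building CRF features, use_gren enables grammatical-dictionary features from ruword2tags, use_syllabs enables syllable features from rusyllab, and ending_len controls word-ending features (0 disables them). use_w2v is read but the word2vec features themselves are not built in the code shown, and use_shingles and the w2v/wc2v/model filename entries are not referenced by this loader (the model path is hard-coded to rupostagger.model). This reading is inferred from the loader code in this commit, not from separate documentation.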
koziev/rupostagger/rupostagger.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21b7b0bfd7427b5fdc1604052176db8aa3b139b3ce03be440cfce48536f8e5ef
+size 2417464
koziev/rupostagger/rupostagger.py
ADDED
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+"""
+Part-of-speech tagging model for Russian-language texts (project https://github.com/Koziev/rupostagger)
+03.08.2019 small bug with normalization (replacing "ё" with "е") before lookup in the grammatical dictionary
+"""
+
+from __future__ import print_function
+from __future__ import division  # for python2 compatibility
+
+import os
+import json
+import pathlib
+import re
+
+import pycrfsuite
+from .ruword2tags import RuWord2Tags
+from .rusyllab import split_word
+
+
+BEG_TOKEN = '<beg>'
+END_TOKEN = '<end>'
+
+token2tag = {BEG_TOKEN: BEG_TOKEN, END_TOKEN: END_TOKEN}
+
+
+def is_num(token):
+    return re.match('^[0-9]+$', token)
+
+
+class RuPosTagger(object):
+    def __init__(self):
+        self.winspan = -1
+        self.use_w2v = -1
+        self.use_syllabs = -1
+        self.ending_len = -1
+        self.word2tags = None
+
+    def load(self, word2tags_path=None):
+        module_folder = str(pathlib.Path(__file__).resolve().parent)
+        data_folder = os.path.join(module_folder, '../tmp')
+
+        config_path = os.path.join(data_folder, 'rupostagger.config')
+        if not os.path.exists(config_path):
+            data_folder = module_folder
+            config_path = os.path.join(data_folder, 'rupostagger.config')
+
+        #print('DEBUG@47 module_folder={}'.format(module_folder))
+        #print('DEBUG@48 data_folder={}'.format(data_folder))
+
+        with open(config_path, 'r') as rdr:
+            self.config = json.load(rdr)
+            self.winspan = self.config['winspan']
+            self.use_gren = self.config['use_gren']
+            self.use_w2v = self.config['use_w2v']
+            self.use_syllabs = self.config['use_syllabs']
+            self.ending_len = self.config['ending_len']
+
+        self.word2tags = RuWord2Tags()
+        self.word2tags.load(word2tags_path)
+
+        model_path = os.path.join(data_folder, 'rupostagger.model')
+        self.tagger = pycrfsuite.Tagger()
+        self.tagger.open(model_path)
+
+    @staticmethod
+    def __normalize_word(word):
+        return word.replace(' - ', '-').replace(u'ё', u'е').lower()
+
+    def get_word_features(self, word, prefix):
+        assert(len(word) > 0)
+        features = []
+        if word in token2tag:
+            features.append((u'tag[{}]={}'.format(prefix, token2tag[word]), 1.0))
+        elif is_num(word):
+            features.append((u'tag[{}]=<num> tag[{}]=<num_{}>'.format(prefix, prefix, word[-1]), 1.0))
+        elif len(word) == 1 and word[0] in u'‼≠™®•·[¡+<>`~;.,‚?!-…№”“„{}|‹›/\'"–—_:«»*]()‘’≈':
+            features.append((u'tag[{}]=punct_{}'.format(prefix, ord(word[0])), 1.0))
+        else:
+            uword = self.__normalize_word(word)
+            first_char = word[0]
+            if first_char in u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+                features.append((u'word[{}]=<latin>'.format(prefix), 1.0))
+            else:
+                if first_char in u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ':
+                    features.append((u'word[{}]=<upper1>'.format(prefix), 1.0))
+
+            if self.ending_len > 0:
+                ending = '~' + uword[-self.ending_len:] if len(uword) > self.ending_len else uword
+                features.append((u'ending[{}]={}'.format(prefix, ending), 1.0))
+
+            if self.use_syllabs and first_char.lower() in u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя':
+                syllabs = split_word(uword)
+                if len(syllabs) > 0:
+                    if len(syllabs) == 1:
+                        features.append((u'slb[{}]={}'.format(prefix, syllabs[0] + '~'), 1.0))
+                    else:
+                        features.append((u'slb[{}]={}'.format(prefix, syllabs[0]+'~'), 1.0))
+                        for s in syllabs[1:-1]:
+                            features.append((u'slb[{}]={}'.format(prefix, '~'+s+'~'), 1.0))
+                        features.append((u'slb[{}]={}'.format(prefix, '~'+syllabs[-1]), 1.0))
+
+            if self.use_gren:
+                tags = set()
+                for tagset in self.word2tags[uword]:
+                    tags.update(tagset.split(' '))
+
+                for tag in tags:
+                    features.append((u'tag[{}]={}'.format(prefix, tag), 1.0))
+
+        return features
+
+    def vectorize_sample(self, words):
+        lines2 = []
+        nb_words = len(words)
+        for iword, word in enumerate(words):
+            word_features = dict()
+            for j in range(-self.winspan, self.winspan + 1):
+                iword2 = iword + j
+                if iword2 < 0:
+                    features = [('word[{}]=<beg>'.format(j), 1.0)]
+                elif iword2 >= nb_words:
+                    features = [('word[{}]=<end>'.format(j), 1.0)]
+                else:
+                    features = self.get_word_features(words[iword2], str(j))
+                word_features.update(features)
+
+            lines2.append(word_features)
+
+        return lines2
+
+    def tag(self, words):
+        #X = self.vectorize_sample([BEG_TOKEN]+words+[END_TOKEN])
+        X = self.vectorize_sample(words)
+        y_pred = self.tagger.tag(X)
+        #return zip(words, y_pred[1: -1])
+        return zip(words, y_pred)
+
+
+def test1(tagger, phrase, required_labels):
+    pred_labels = list(tagger.tag(phrase.split()))
+    assert(len(required_labels.split()) == len(pred_labels))
+    for required_label, (word, pred_label) in zip(required_labels.split(), pred_labels):
+        for tag in required_label.split('|'):
+            if tag not in pred_label:
+                print(u'Error: phrase={} word={} required_label={} pred_label={}'.format(phrase, word, required_label, pred_label))
+                return False
+
+    return True
+
+
+def run_tests():
+    tagger = RuPosTagger()
+    tagger.load()
+
+    for phrase, required_labels in [(u'Кошки спят', u'NOUN|Number=Plur|Case=Nom VERB|Mood=Ind|Number=Plur|Person=3|Tense=Notpast|VerbForm=Fin'),
+                                    (u'Я рою колодец', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Я мою окно', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Ира мыла окно', u'NOUN|Case=Nom VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Возьми мою пилу', u'VERB ADJ|Case=Acc NOUN|Case=Acc'),
+                                    (u'рой колодец', u'VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'У меня живёт черепаха', u'ADP PRON VERB NOUN'),
+                                    (u'какую еду ты любишь ?', u'ADJ NOUN PRON VERB PUNCT')
+                                    ]:
+        if not test1(tagger, phrase, required_labels):
+            print('Tests FAILED')
+            return
+
+    print('Tests PASSED OK')
+
+
+if __name__ == '__main__':
+    run_tests()
+
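run_tests() above doubles as a usage example: RuPosTagger.load() locates rupostagger.config, rupostagger.model and the ruword2tags data relative to the module, and tag() takes a pre-tokenized list of words and yields (word, label) pairs whose labels are '|'-joined strings such as the NOUN|Number=Plur|Case=Nom expected for the first token of the test sentence. A minimal sketch, assuming the package is importable as koziev.rupostagger and the LFS data files have been fetched (tokenization by str.split() is the caller's responsibility):

from koziev.rupostagger import RuPosTagger   # assumed import path

tagger = RuPosTagger()
tagger.load()
for word, label in tagger.tag(u'Кошки спят'.split()):
    print(word, label)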
koziev/rupostagger/rusyllab.py
ADDED
@@ -0,0 +1,589 @@
+# -*- coding: utf-8 -*-
+
+# autogenerated 2019-01-19 10:52:09.746954
+
+
+def V(c):
+    return c in u"АЕЁИОУЫЭЮЯаеёиоуыэюя"
+
+
+def C(c):
+    return c in u"БВГДЖЗКЛМНПРСТФХЦЧШЩбвгджзклмнпрстфхцчшщ"
+
+
+def S(c):
+    return c in u"Йй"
+
+
+def M(c):
+    return c in u"ЪЬъь"
+
+
+def BEG(c):
+    return c == u"["
+
+
+def END(c):
+    return c == u"]"
+
+
+def split(s):
+    cur_pos = 0
+    items = list(u"[" + s + u"]")
+    while cur_pos < len(items):
+        input_context = items[cur_pos:]
+        res = apply1(input_context)
+        if res is None:
+            cur_pos += 1
+        else:
+            items = items[:cur_pos] + res[0] + input_context[res[1]:]
+            cur_pos += res[2]
+    return items[1:-1]
+
+
+def apply1(s):
+    if C(s[0]):
+        if V(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_1
+
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_5
+
+                    if C(s[4]):
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_11
+
+                            if not END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_12
+
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_36
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_120
+
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_330
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_52
+
+                    if M(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_76
+
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_250
+
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_260
+
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_6
+
+                if M(s[3]):
+                    if C(s[4]):
+                        if not END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_13
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_39
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_350
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_14
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_20
+
+
+
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_7
+
+            if S(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_8
+
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_9
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_280
+
+                    if M(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_400
+
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_10
+
+                return ([s[0]+s[1]+s[2]], 3, 1) # SYLLABER_64
+
+            if V(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_31
+
+
+        if C(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    if C(s[4]):
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_2
+
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_310
+
+
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_3
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_4
+
+                        if M(s[5]):
+                            if C(s[6]):
+                                if M(s[7]):
+                                    if END(s[8]):
+                                        return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6]+s[7], s[8]], 9, 1) # SYLLABER_300
+
+
+
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_200
+
+
+                    if S(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_54
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_68
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_170
+
+                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_210
+
+                if C(s[3]):
+                    if V(s[4]):
+                        if S(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_220
+
+                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_98
+
+
+
+            if V(s[2]):
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_15
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_370
+
+
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_80
+
+                        if M(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_340
+
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_390
+
+
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_470
+
+
+                    if M(s[4]):
+                        if not C(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_21
+
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_48
+
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6], s[7]], 8, 1) # SYLLABER_240
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_62
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_230
+
+
+                if V(s[3]):
+                    if C(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_17
+
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_82
+
+                if S(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_33
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_92
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_450
+
+
+
+
+                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_190
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_66
+
+
+            if M(s[2]):
+                if V(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_410
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_480
+
+
+
+
+
+        if M(s[1]):
+            if V(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_16
+
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_19
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_290
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_430
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_22
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_94
+
+
+            if C(s[2]):
+                if V(s[3]):
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_320
+
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_360
+
+
+
+
+
+
+    if V(s[0]):
+        if C(s[1]):
+            if C(s[2]):
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_18
+
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_28
+
+                if C(s[3]):
+                    if V(s[4]):
+                        if C(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_96
+
+                        return ([s[0]+s[1], s[2], s[3], s[4]], 5, 1) # SYLLABER_50
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_460
+
+
+
+                if M(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_72
+
+
+
+            if V(s[2]):
+                return ([s[0], s[1], s[2]], 3, 1) # SYLLABER_35
+
+            if M(s[2]):
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_40
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_42
+
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_84
+
+
+                if V(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_78
+
+
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_44
+
+            return ([s[0]+s[1]], 2, 1) # SYLLABER_56
+
+        if END(s[1]):
+            return ([s[0], s[1]], 2, 1) # SYLLABER_30
+
+        if V(s[1]):
+            return ([s[0], s[1]], 2, 1) # SYLLABER_34
+
+        if S(s[1]):
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_46
+
+            if C(s[2]):
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_180
+
+
+
+
+
+    if BEG(s[0]):
+        if C(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_23
+
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_60
+
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_74
+
+
+
+
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_24
+
+
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_27
+
+
+                if END(s[3]):
+                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_70
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_88
+
+
+
+
+                    if V(s[4]):
+                        if C(s[5]):
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_90
+
+
+
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_140
+
+
+
+
+            if V(s[2]):
+                if C(s[3]):
+                    if C(s[4]):
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_26
+
+
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_37
+
+
+                    if M(s[4]):
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_440
+
+
+
+
+
+                if S(s[3]):
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_160
+
+
+
+
+            if END(s[2]):
+                return ([s[0], s[1], s[2]], 3, 2) # SYLLABER_32
+
+            if M(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_58
+
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_100
+
+                            if V(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 2) # SYLLABER_420
+
+
+
+
+                if V(s[3]):
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_86
+
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_110
+
+
+                    if C(s[4]):
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_150
+
+
+
+
+
+
+        if V(s[1]):
+            if C(s[2]):
+                if M(s[3]):
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_25
+
+
+                if END(s[3]):
+                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_29
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_130
+
+
+
+
+
+
+        if S(s[1]):
+            if V(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0], s[1]+s[2], s[3], s[4]], 5, 2) # SYLLABER_380
+
+
+
+
+
+
+if __name__ == "__main__":
+    sx = split(u"спросил")
+    print(u"|".join(sx))
+
+def split_word(word):
+    """
+    Split a single word into syllables
+    :param word: unicode string representing Russian word
+    :return: list of unicode strings for syllables
+    """
+    return split(word)
+
+
+def split_words(words):
+    """
+    Split the words in a list into a contiguous list of syllables and word separators (single space chars)
+    :param words: list of words (unicode strings)
+    :return: list of tokens - syllables and spaces
+    """
+    tokens = []
+    for word in words:
+        sx = split(word)
+        if len(tokens) > 0:
+            tokens.append(u' ')
+        tokens.extend(sx)
+    return tokens
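rusyllab.split() wraps the input word in '[' and ']' sentinel characters and repeatedly applies the autogenerated rule function apply1() to merge characters into syllables; split_word() and split_words() are thin wrappers over it, and the __main__ block above shows the intended call pattern. A minimal usage sketch (the exact syllable boundaries depend on the generated rules, so outputs are not reproduced here; the import path assumes the koziev folder is importable as a package):

from koziev.rupostagger.rusyllab import split_word, split_words   # assumed import path

print(split_word(u'спросил'))               # list of syllables for one word
print(split_words(u'кошки спят'.split()))   # syllables for several words, separated by ' ' tokens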
koziev/rupostagger/ruword2tags.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde47b5f1d48ff899887ac07812dcabd2966e48e84646f3065bfd06627c2af58
+size 9683765
koziev/rupostagger/ruword2tags.py
ADDED
@@ -0,0 +1,391 @@
+# -*- coding: utf-8 -*-
+"""
+19.04.2019 - when parsing the Solarix dictionary database, word forms with
+a negative score (unusable words) are skipped.
+
+26-10-2019 - switched to storing part of the dictionary database in SQLite3
+
+17-06-2020 refs #1 an error occurred when running from several threads, added check_same_thread=False
+
+13.06.2022 if the ruword2tags.db database file is missing, download it and keep it in the user's home directory
+"""
+
+import gzip
+import pathlib
+import os
+import pickle
+import io
+import argparse
+import sqlite3
+import threading
+
+
+def create_trie_node(char):
+    return char, [], dict()
+
+
+def add_to_trie_node(node, next_chars, tagset_index):
+    if len(next_chars) == 0:
+        node[1].append(tagset_index)
+    else:
+        next_char = next_chars[0]
+        if next_char not in node[2]:
+            node[2][next_char] = create_trie_node(next_char)
+
+        add_to_trie_node(node[2][next_char], next_chars[1:], tagset_index)
+
+
+def find_tagsets_in_trie_node(node, word):
+    if word:
+        found_tagsets = []
+        next_char = word[0]
+        if next_char in node[2]:
+            found_tagsets.extend(find_tagsets_in_trie_node(node[2][next_char], word[1:]))
+        return found_tagsets
+    else:
+        return node[1]
+
+
+def trie_constructed(trie_node, tagset2id):
+    tagset = tuple(sorted(trie_node[1]))
+    if tagset in tagset2id:
+        id_tagsets = tagset2id[tagset]
+    else:
+        id_tagsets = len(tagset2id) + 1
+        tagset2id[tagset] = id_tagsets
+
+    new_children = dict()
+    for next_char, child in trie_node[2].items():
+        new_children[next_char] = trie_constructed(child, tagset2id)
+
+    return (trie_node[0], id_tagsets, new_children)
+
+
+
+class RuWord2Tags:
+    dict_filename = 'ruword2tags.dat'
+
+    def __init__(self):
+        self.ending_len = None
+        self.index2tagset = None
+        self.ending2tagsets = None
+        self.trie_root = None
+        self.all_ending2tagsets = None
+        self.trie_tagsets = None
+        self.db_filepath = None
+        self.cnx = None
+        self.lock = threading.Lock()
+        self.word2tagsets_cache = dict()
+
+    def load(self, dict_path=None):
+        module_folder = str(pathlib.Path(__file__).resolve().parent)
+        self.db_filepath = os.path.join(module_folder, 'database', 'ruword2tags.db')
+        try:
+            # 17-06-2020 refs #1 an error occurred when running from several threads, added check_same_thread=False
+            self.cnx = sqlite3.connect(self.db_filepath, check_same_thread=False)
+        except Exception as ex:
+            msg = u'Could not open db file "{}", error: {}'.format(self.db_filepath, ex)
+            raise RuntimeError(msg)
+
+        self.cnx.isolation_level = None
+        self.cur = self.cnx.cursor()
+
+        with open(os.path.join(module_folder,"ruword2tags.dat"), 'rb') as f:
+            data = pickle.load(f)
+            self.ending_lens = data['ending_lens']
+            self.index2tagset = data['index2tagset']
+            self.ending2tagsets = data['ending2tagsets']
+            self.all_ending2tagsets = data['all_ending2tagsets']
+            self.id2tagsets = data['id2tagsets']
+
+        if False:
+            trie_filepath = os.path.join(os.path.dirname(p), 'ruword2tags_trie.dat')
+            with gzip.open(trie_filepath, 'r') as f:
+                self.trie_root = pickle.load(f)
+
+
+    def __getitem__(self, word):
+        hit = False
+        for ending_len in self.ending_lens:
+            ending = word[-ending_len:] if len(word) > ending_len else u''
+            if ending in self.ending2tagsets:
+                for itagset in self.ending2tagsets[ending]:
+                    yield self.index2tagset[itagset]
+                hit = True
+                break
+
+        if not hit:
+            #for itagset in find_tagsets_in_trie_node(self.trie_root, word):
+            #    hit = True
+            #    yield self.index2tagset[itagset]
+
+            if word in self.word2tagsets_cache:
+                id_tagsets = self.word2tagsets_cache[word]
+                for itagset in self.id2tagsets[id_tagsets]:
+                    yield self.index2tagset[itagset]
+                hit = True
+            else:
+                with self.lock:  # for multi-threaded use in the chatbot
+                    for r in self.cur.execute('SELECT id_tagsets FROM word_tagsets WHERE word=:word', {'word': word}):
+                        id_tagsets = int(r[0])
+                        self.word2tagsets_cache[word] = id_tagsets
+                        for itagset in self.id2tagsets[id_tagsets]:
+                            yield self.index2tagset[itagset]
+                        hit = True
+
+        if not hit:
+            for ending_len in reversed(self.ending_lens):
+                ending = word[-ending_len:] if len(word) > ending_len else u''
+                if ending in self.all_ending2tagsets:
+                    for itagset in self.all_ending2tagsets[ending]:
+                        yield self.index2tagset[itagset]
+                    hit = True
+                    break
+
+
+def run_tests(dict_path=None):
+    print('Start testing...')
+    word2tags = RuWord2Tags()
+    word2tags.load(dict_path)
+
+    cases = [(u'очень', [u'НАРЕЧИЕ СТЕПЕНЬ=АТРИБ ТИП_МОДИФ=ГЛАГ ТИП_МОДИФ=НАРЕЧ ТИП_МОДИФ=ПРИЛ']),
+             (u'поскорее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
+             (u'поскорей', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
+             (u'сильнее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 СТЕПЕНЬ=СРАВН']),
+             (u'синее', [u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ВИН РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ИМ РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД']),
+             (u'трахее', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ДАТ РОД=ЖЕН ЧИСЛО=ЕД', u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ПРЕДЛ РОД=ЖЕН ЧИСЛО=ЕД']),
+             (u'полдня', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ИМ ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
+                          u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ВИН ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
+                          u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=РОД ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД'
+                          ]),
+             (u'а', [u'СОЮЗ', u'ЧАСТИЦА']),
+             (u'кошки', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН',
+                         u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=РОД РОД=ЖЕН ЧИСЛО=ЕД']),
+             (u'на', [#u'ГЛАГОЛ ВИД=НЕСОВЕРШ ЛИЦО=2 НАКЛОНЕНИЕ=ПОБУД ТИП_ГЛАГОЛА=СТАТИЧ ЧИСЛО=ЕД',
+                      u'ПРЕДЛОГ ПАДЕЖ=ВИН ПАДЕЖ=МЕСТ ПАДЕЖ=ПРЕДЛ',
+                      #u'ЧАСТИЦА'
+                      ]),
+             (u'заводим', [u'ГЛАГОЛ ВИД=НЕСОВЕРШ ВРЕМЯ=НАСТОЯЩЕЕ ЛИЦО=1 НАКЛОНЕНИЕ=ИЗЪЯВ ПАДЕЖ=ВИН ПАДЕЖ=РОД ПАДЕЖ=ТВОР ЧИСЛО=МН'])
+             ]
+
+    for word, required_tagsets in cases:
+        model_tagsets = list(word2tags[word])
+        if len(model_tagsets) != len(required_tagsets):
+            #for tagset in model_tagsets:
+            #    print(u'DEBUG@112 word={} tagset={}'.format(word, tagset))
+            raise AssertionError(u'word="{}": {} tagset(s) required, {} found'.format(word, len(required_tagsets), len(model_tagsets)))
+
+        for model_tagset in model_tagsets:
+            if model_tagset not in required_tagsets:
+                raise AssertionError(u'Predicted tagset "{}" for word "{}" is not valid'.format(model_tagset, word))
+
+    print('All tests PASSED.')
+
+
+def normalize_word(s):
+    if len(s) > 2 and s[0] == "'" and s[-1] == "'":
+        s = s[1:-1]
+
+    return s.replace(' - ', '-').replace('ё', 'е').strip().lower()
+
+
+ignore_tags = set('ПАДЕЖВАЛ:РОД МОДАЛЬНЫЙ:0 ПЕРЕЧИСЛИМОСТЬ:ДА ПЕРЕХОДНОСТЬ:ПЕРЕХОДНЫЙ ПЕРЕХОДНОСТЬ:НЕПЕРЕХОДНЫЙ ПАДЕЖВАЛ:ТВОР ПАДЕЖВАЛ:ИМ ПАДЕЖВАЛ:ДАТ ПАДЕЖВАЛ:ВИН СГД_ВРЕМЯ:Начать ВОЗВРАТНОСТЬ:0 ВОЗВРАТНОСТЬ:1'.split())
+
+
+def clean_tagset(tagset):
+    return ' '.join(t for t in tagset.split() if t not in ignore_tags).replace(':', '=')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Сборка грамматического словаря')
+    parser.add_argument('--src', type=str, default='../data/word2tags.dat', help='Source grammatical dictionary file path')
+    parser.add_argument('--output', type=str, default='../output/ruword2tags.dat', help='Result dictionary file path')
+    parser.add_argument('--words', type=str, help='List of known words (all dictionary words are included by default)')
+
+    args = parser.parse_args()
+    knownwords_file = args.words
+    word2tags_path = args.src
+    output_file = args.output
+
+    # Build the dictionary from the source data
+
+    known_words = None
+    if knownwords_file is not None:
+        # Load from the given file the list of words that will go into the final model.
+        print('Загружаем список слов для сборки кастомного словаря из {}'.format(knownwords_file))
+        known_words = set()
+        with io.open(knownwords_file, 'r', encoding='utf-8') as rdr:
+            for line in rdr:
+                word = line.replace(chr(65279), '').strip()
+                known_words.add(word.lower())
+        print('Загружено {} слов из {}'.format(len(known_words), knownwords_file))
+
+    word2tagsets = dict()
+    tagset2index = dict()
+    nb_words = 0
+    filter_negative_scores = True
+    print('Loading dictionary from {}'.format(word2tags_path))
+
+    # In the first pass over the word forms, collect the forms that will be ignored because of an assigned
+    # frequency < 0. If all recognition variants of a word have an assigned frequency < 0, such forms are
+    # not filtered out.
+    wordform2max_score = dict()
+    with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
+        for line in rdr:
+            tx = line.replace(chr(65279), '').strip().split('\t')
+            if len(tx) == 5:
+                score = int(tx[4])
+                word = normalize_word(tx[0])
+                wordform2max_score[word] = max(score, wordform2max_score.get(word, -1000000))
+
+    # Main, second pass.
+    with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
+        for line in rdr:
+            tx = line.replace(chr(65279), '').strip().split('\t')
+            if len(tx) == 5:
+                word = normalize_word(tx[0])
+                if filter_negative_scores and wordform2max_score[word] >= 0 and int(tx[4]) < 0:
+                    # skip forms that are marked as rare or ungrammatical (frequency < 0)
+                    # and for which alternatives with frequency >= 0 exist.
+                    continue
+
+                if known_words is None or word in known_words:
+                    pos = tx[1]
+                    lemma = normalize_word(tx[2])
+                    tags = clean_tagset(tx[3]) if len(tx) == 5 else u''
+
+                    tagset = (pos + ' ' + tags).strip()
+
+                    if tagset not in tagset2index:
+                        tagset2index[tagset] = len(tagset2index)
+
+                    itagset = tagset2index[tagset]
+
+                    if word not in word2tagsets:
+                        word2tagsets[word] = [itagset]
+                    else:
+                        word2tagsets[word].append(itagset)
+
+                    nb_words += 1
+
+    print('Number of wordentries={}'.format(nb_words))
+    print('Number of tagsets={}'.format(len(tagset2index)))
+
+    for word in u'а и у с к'.split():
+        assert(word in word2tagsets)
+
+    ending_lens = [3, 4, 5]
+    processed_words = set()
+    ending2tagsets = dict()
+    all_ending2tagsets = dict()
+
+    for ending_len in ending_lens:
+        print('Start processing ending_len={}'.format(ending_len))
+        e2tagsets = dict()
+        for word, tagsets in word2tagsets.items():
+            if word not in processed_words and len(word) > ending_len:
+                ending = word[-ending_len:]
+                if ending not in e2tagsets:
+                    e2tagsets[ending] = set(tagsets)
+                else:
+                    e2tagsets[ending].update(tagsets)
+
+        all_ending2tagsets.update(e2tagsets)
+        print('Number of distinct endings={}'.format(len(e2tagsets)))
+
+        # Remove endings that produce tag lists with at least one error
+        bad_endings = set()
+        for word, word_tagsets in word2tagsets.items():
+            if word not in processed_words and len(word) > ending_len:
+                ending = word[-ending_len:]
+                ending_tagsets = e2tagsets[ending]
+                if set(word_tagsets) != ending_tagsets:
+                    bad_endings.add(ending)
+
+        print('Number of bad endings={}'.format(len(bad_endings)))
+
+        e2tagsets = dict(filter(lambda z: z[0] not in bad_endings, e2tagsets.items()))
+
+        # Now mark the words that match the remaining good endings.
+        nb_matched_words = 0
+        for word in word2tagsets.keys():
+            if len(word) > ending_len:
+                ending = word[-ending_len:]
|
314 |
+
if ending in e2tagsets:
|
315 |
+
processed_words.add(word)
|
316 |
+
nb_matched_words += 1
|
317 |
+
|
318 |
+
print('nb_matched_words={}'.format(nb_matched_words))
|
319 |
+
|
320 |
+
# Переносим оставшиеся хорошие ��кончания в основной список
|
321 |
+
ending2tagsets.update(e2tagsets)
|
322 |
+
|
323 |
+
print('Number of good endings={}'.format(len(ending2tagsets)))
|
324 |
+
print('Number of all endings={}'.format(len(all_ending2tagsets)))
|
325 |
+
|
326 |
+
print('Building TRIE for {} words...'.format(len(word2tagsets)))
|
327 |
+
trie_words = []
|
328 |
+
for word, word_tagsets in word2tagsets.items():
|
329 |
+
if word not in processed_words:
|
330 |
+
# Слово не было обработано окончаниями.
|
331 |
+
for itagset in word_tagsets:
|
332 |
+
trie_words.append((word, itagset))
|
333 |
+
|
334 |
+
trie_root = create_trie_node('')
|
335 |
+
for word, itagset in trie_words:
|
336 |
+
add_to_trie_node(trie_root, word, itagset)
|
337 |
+
|
338 |
+
print('Number of words in TRIE={}'.format(len(trie_words)))
|
339 |
+
|
340 |
+
index2tagset = dict((i, t) for (t, i) in tagset2index.items())
|
341 |
+
|
342 |
+
trie_tagsets = dict()
|
343 |
+
trie_root = trie_constructed(trie_root, trie_tagsets)
|
344 |
+
|
345 |
+
db_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags.db')
|
346 |
+
print('Writing "{}"...'.format(db_filepath))
|
347 |
+
with sqlite3.connect(db_filepath) as cnx:
|
348 |
+
cursor = cnx.cursor()
|
349 |
+
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='word_tagsets'")
|
350 |
+
if not cursor.fetchone():
|
351 |
+
cnx.execute('CREATE TABLE word_tagsets(word TEXT NOT NULL PRIMARY KEY, id_tagsets INT not null)')
|
352 |
+
else:
|
353 |
+
cnx.execute('DELETE FROM word_tagsets')
|
354 |
+
|
355 |
+
for word, word_tagsets in word2tagsets.items():
|
356 |
+
if word not in processed_words:
|
357 |
+
tagsets2 = tuple(sorted(word_tagsets))
|
358 |
+
id_tagsets = trie_tagsets[tagsets2]
|
359 |
+
cursor.execute("INSERT INTO word_tagsets(word, id_tagsets) VALUES(:word, :tagsets)",
|
360 |
+
{'word': word, 'tagsets': id_tagsets})
|
361 |
+
|
362 |
+
cnx.commit()
|
363 |
+
|
364 |
+
lexicon_data = {'ending_lens': ending_lens,
|
365 |
+
'index2tagset': index2tagset,
|
366 |
+
'ending2tagsets': ending2tagsets,
|
367 |
+
'all_ending2tagsets': all_ending2tagsets,
|
368 |
+
'id2tagsets': dict((id, tagsets) for (tagsets, id) in trie_tagsets.items())
|
369 |
+
}
|
370 |
+
|
371 |
+
print('Writing "{}"...'.format(output_file))
|
372 |
+
with open(output_file, 'wb') as f:
|
373 |
+
pickle.dump(lexicon_data, f, protocol=2)
|
374 |
+
|
375 |
+
trie_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags_trie.dat')
|
376 |
+
print('Writing "{}"...'.format(trie_filepath))
|
377 |
+
with gzip.open(trie_filepath, 'wb') as f:
|
378 |
+
pickle.dump(trie_root, f)
|
379 |
+
|
380 |
+
#print('Сохранен файл словаря размером {:d} Мб'.format(int(os.path.getsize(output_file)/1000000)))
|
381 |
+
print('All data stored.')
|
382 |
+
|
383 |
+
# Теперь запускаем проверки для построенного словаря
|
384 |
+
run_tests(output_file)
|
385 |
+
|
386 |
+
word2tags = RuWord2Tags()
|
387 |
+
word2tags.load(output_file)
|
388 |
+
|
389 |
+
for word in u'кошки ккошки на'.split():
|
390 |
+
for i, tagset in enumerate(word2tags[word]):
|
391 |
+
print(u'{}[{}] => {}'.format(word, i, tagset))
|
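The tail of ruword2tags.py above packs the dictionary three ways: unambiguous word endings go into ruword2tags.dat, the remaining words into the sqlite table ruword2tags.db, and a gzipped trie into ruword2tags_trie.dat; the script then reloads everything through RuWord2Tags as a smoke test. A minimal usage sketch along the same lines; the command line simply repeats the argparse defaults above, and the assumption that load() finds the companion .db/_trie.dat files next to the .dat path is not verified here:

# Rebuild the dictionary with the defaults shown above:
#   python ruword2tags.py --src ../data/word2tags.dat --output ../output/ruword2tags.dat
from ruword2tags import RuWord2Tags

word2tags = RuWord2Tags()
word2tags.load('../output/ruword2tags.dat')  # companion ruword2tags.db / ruword2tags_trie.dat assumed to sit nearby

for i, tagset in enumerate(word2tags[u'кошки']):
    print(u'кошки[{}] => {}'.format(i, tagset))  # e.g. 'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН'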
nn/nn_accent/big.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:47e69d9ae19f2a82e21b1c70f6a4bbfb1abc5759e98b2e67d009c5e9d7af18c9
size 2285217
nn/nn_accent/config.json
ADDED
@@ -0,0 +1,37 @@
{
  "_name_or_path": "onnx_out",
  "architectures": [
    "RoFormerForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.2,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 128,
  "id2label": {
    "0": "NO",
    "1": "STRESS_PRIMARY",
    "2": "STRESS_SECONDARY"
  },
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "label2id": {
    "NO": 0,
    "STRESS_PRIMARY": 1,
    "STRESS_SECONDARY": 2
  },
  "layer_norm_eps": 1e-12,
  "max_length": 40,
  "max_position_embeddings": 60,
  "max_relative_positions": 60,
  "model_type": "roformer",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "relative_attention": true,
  "rotary_value": false,
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 45
}
nn/nn_accent/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e393144e45626f6f1062a0784ef06f921b97321a8e7b87ac2a09a892286500a
size 803402
nn/nn_accent/ort_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "one_external_file": true,
  "opset": null,
  "optimization": {},
  "optimum_version": "1.8.5",
  "quantization": {
    "activations_dtype": "QUInt8",
    "activations_symmetric": false,
    "format": "QOperator",
    "is_static": false,
    "mode": "IntegerOps",
    "nodes_to_exclude": [],
    "nodes_to_quantize": [],
    "operators_to_quantize": [
      "MatMul",
      "Add"
    ],
    "per_channel": false,
    "qdq_add_pair_to_weight": false,
    "qdq_dedicated_pair": false,
    "qdq_op_type_per_channel_support_to_axis": {
      "MatMul": 1
    },
    "reduce_range": false,
    "weights_dtype": "QInt8",
    "weights_symmetric": true
  },
  "transformers_version": "4.29.2",
  "use_external_data_format": false
}
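ort_config.json records how model.onnx was shrunk: dynamic (is_static: false) QOperator quantization of MatMul/Add nodes with int8 weights and uint8 activations, produced with optimum 1.8.5. A hedged sketch of how such a file is typically generated with optimum's ORTQuantizer; treating big.onnx as the fp32 export and the avx2 preset as the matching configuration are assumptions, not something stated in this upload:

# Sketch only: dynamic int8 quantization roughly matching the settings recorded above.
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)        # dynamic, per-tensor
quantizer = ORTQuantizer.from_pretrained('nn/nn_accent', file_name='big.onnx')   # assumed fp32 source model
quantizer.quantize(save_dir='nn/nn_accent', quantization_config=qconfig)         # writes quantized model + ort_config.json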
nn/nn_accent/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "[bos]",
  "eos_token": "[eos]",
  "pad_token": "[pad]",
  "unk_token": "[unk]"
}
nn/nn_accent/tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "bos_token": "[bos]",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "[eos]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[pad]",
  "tokenizer_class": "CharTokenizer",
  "unk_token": "[unk]"
}
nn/nn_accent/vocab.txt
ADDED
@@ -0,0 +1,45 @@
[pad]
[unk]
[bos]
[eos]
'
-
.
?
`
c
e
́
а
б
в
г
д
е
ж
з
и
й
к
л
м
н
о
п
р
с
т
у
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
ё
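Taken together, config.json, tokenizer_config.json and vocab.txt describe a character-level RoFormer token classifier: every letter of a word is one token, and each position is labelled NO / STRESS_PRIMARY / STRESS_SECONDARY. A minimal sketch of querying the quantized model.onnx with onnxruntime; the [bos]/[eos] framing and the input tensor names (input_ids, attention_mask, token_type_ids) are assumptions about the exported graph, not facts taken from this upload:

import numpy as np
import onnxruntime as ort

# char -> id map from vocab.txt (one symbol per line, ids start at 0)
with open('nn/nn_accent/vocab.txt', encoding='utf-8') as f:
    vocab = {line.rstrip('\n'): i for i, line in enumerate(f)}

word = 'кошки'
ids = [vocab['[bos]']] + [vocab.get(c, vocab['[unk]']) for c in word.lower()] + [vocab['[eos]']]

sess = ort.InferenceSession('nn/nn_accent/model.onnx')
feeds = {
    'input_ids': np.array([ids], dtype=np.int64),
    'attention_mask': np.ones((1, len(ids)), dtype=np.int64),
    'token_type_ids': np.zeros((1, len(ids)), dtype=np.int64),
}
logits = sess.run(None, feeds)[0][0]        # (seq_len, 3), labels per config.json id2label
stress_pos = int(logits[1:-1, 1].argmax())  # most likely STRESS_PRIMARY position among the letters
print(word, '-> stressed letter index', stress_pos)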
nn/nn_omograph/big_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 120139,
  "<w>": 120138
}
nn/nn_omograph/big_poetry/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "rubert_base/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 120140
}
nn/nn_omograph/big_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7d1d58e5ad908f4187d3c44f640106b721e293ec954c9c4603abc25ba5f7e8a
size 713508364
nn/nn_omograph/big_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/big_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/big_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
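big_poetry is rubert-base fine-tuned as a sequence classifier for homograph (omograph) resolution, with two extra markers <w> and </w> appended to the vocabulary (added_tokens.json). A plausible reading is that the target homograph is wrapped in those markers before classification; that convention, the ONNX input names, and the meaning of the output classes are assumptions here rather than documented facts. A minimal sketch with transformers + onnxruntime:

import onnxruntime as ort
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('nn/nn_omograph/big_poetry')
sess = ort.InferenceSession('nn/nn_omograph/big_poetry/model.onnx')

text = 'Я повесил на дверь новый <w>замок</w>.'   # hypothetical input with the marked homograph
enc = tok(text, return_tensors='np')
feeds = {k: enc[k].astype('int64')               # assumed input names of the exported graph
         for k in ('input_ids', 'attention_mask', 'token_type_ids')}
logits = sess.run(None, feeds)[0]
print(logits.argmax(-1))  # index of the predicted variant; the label inventory is not part of this upload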
nn/nn_omograph/big_poetry/vocab.txt
ADDED
The diff for this file is too large to render.
nn/nn_omograph/medium_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 64001,
  "<w>": 64000
}
nn/nn_omograph/medium_poetry/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "SRUElectra-medium/checkpoint-4500000/",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 576,
  "generator_size": "0.25",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 576,
  "initializer_range": 0.02,
  "intermediate_size": 2304,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 9,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64002
}
nn/nn_omograph/medium_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:689752e4bff9eb0b8837482d9ea724f72356aab19822c2e4ae3de6b5a2fc08b1
size 341725861
nn/nn_omograph/medium_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/medium_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/medium_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "ElectraTokenizer",
  "unk_token": "[UNK]"
}
nn/nn_omograph/medium_poetry/vocab.txt
ADDED
The diff for this file is too large to render.
nn/nn_omograph/small_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 30523,
  "<w>": 30522
}
nn/nn_omograph/small_poetry/config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "output/checkpoint-440000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 264,
  "dropout": 0.1,
  "hidden_dim": 792,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 3,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.29.2",
  "vocab_size": 30524
}
nn/nn_omograph/small_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcea1b8d8c164276d2e593d53261ca3c21d6fc9fed4f04abb8f69e2b95ba842d
size 41532079
nn/nn_omograph/small_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/small_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/small_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "DistilBertTokenizer",
  "unk_token": "[UNK]"
}
nn/nn_omograph/small_poetry/vocab.txt
ADDED
Binary file (382 kB).
nn/nn_omograph/turbo/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 50257,
  "<w>": 50256
}
nn/nn_omograph/turbo/config.json
ADDED
@@ -0,0 +1,28 @@
{
  "_name_or_path": "rudeberta_distilled/checkpoint-220000/",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.28.1",
  "type_vocab_size": 0,
  "vocab_size": 50258
}
nn/nn_omograph/turbo/merges.txt
ADDED
The diff for this file is too large to render.