Upload 66 files
This view is limited to 50 files because it contains too many changes.
- .gitattributes +3 -0
- dictionary/accents.json.gz +3 -0
- dictionary/accents_nn.json.gz +3 -0
- dictionary/omographs.json.gz +3 -0
- dictionary/rule_engine/accents.json +0 -0
- dictionary/rule_engine/forms.json +85 -0
- dictionary/yo_homographs.json.gz +3 -0
- dictionary/yo_omographs.json.gz +3 -0
- dictionary/yo_words.json.gz +3 -0
- koziev/rulemma/rulemma.dat +3 -0
- koziev/rulemma/rulemma.py +237 -0
- koziev/rupostagger/__init__.py +3 -0
- koziev/rupostagger/database/ruword2tags.db +3 -0
- koziev/rupostagger/rupostagger.config +11 -0
- koziev/rupostagger/rupostagger.model +3 -0
- koziev/rupostagger/rupostagger.py +173 -0
- koziev/rupostagger/rusyllab.py +589 -0
- koziev/rupostagger/ruword2tags.dat +3 -0
- koziev/rupostagger/ruword2tags.py +391 -0
- nn/nn_accent/big.onnx +3 -0
- nn/nn_accent/config.json +37 -0
- nn/nn_accent/model.onnx +3 -0
- nn/nn_accent/ort_config.json +30 -0
- nn/nn_accent/special_tokens_map.json +6 -0
- nn/nn_accent/tokenizer_config.json +10 -0
- nn/nn_accent/vocab.txt +45 -0
- nn/nn_omograph/big_poetry/added_tokens.json +4 -0
- nn/nn_omograph/big_poetry/config.json +31 -0
- nn/nn_omograph/big_poetry/model.onnx +3 -0
- nn/nn_omograph/big_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/big_poetry/tokenizer.json +0 -0
- nn/nn_omograph/big_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/big_poetry/vocab.txt +0 -0
- nn/nn_omograph/medium_poetry/added_tokens.json +4 -0
- nn/nn_omograph/medium_poetry/config.json +31 -0
- nn/nn_omograph/medium_poetry/model.onnx +3 -0
- nn/nn_omograph/medium_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/medium_poetry/tokenizer.json +0 -0
- nn/nn_omograph/medium_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/medium_poetry/vocab.txt +0 -0
- nn/nn_omograph/small_poetry/added_tokens.json +4 -0
- nn/nn_omograph/small_poetry/config.json +23 -0
- nn/nn_omograph/small_poetry/model.onnx +3 -0
- nn/nn_omograph/small_poetry/special_tokens_map.json +7 -0
- nn/nn_omograph/small_poetry/tokenizer.json +0 -0
- nn/nn_omograph/small_poetry/tokenizer_config.json +15 -0
- nn/nn_omograph/small_poetry/vocab.txt +0 -0
- nn/nn_omograph/turbo/added_tokens.json +4 -0
- nn/nn_omograph/turbo/config.json +28 -0
- nn/nn_omograph/turbo/merges.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+koziev/rulemma/rulemma.dat filter=lfs diff=lfs merge=lfs -text
+koziev/rupostagger/database/ruword2tags.db filter=lfs diff=lfs merge=lfs -text
+koziev/rupostagger/ruword2tags.dat filter=lfs diff=lfs merge=lfs -text
dictionary/accents.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa460ebba90de00fbbf3d41d121961f605b98667e45efb7920f127473b15515e
+size 20954156
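The *.json.gz entries in this commit are Git LFS pointer files: only the version line, SHA-256 object id and byte size are stored in the repository, while the gzipped JSON payload itself lives on the LFS server. Once the real objects are pulled, a dictionary like this can be read with the standard library. The sketch below is a minimal illustration, not part of the uploaded code, and it assumes the archive holds a single UTF-8 JSON document (the key layout of accents.json is not shown in this commit).

import gzip
import json

def load_gz_json(path):
    # Assumes the LFS object has been fetched and is one UTF-8 JSON document.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return json.load(f)

accents = load_gz_json('dictionary/accents.json.gz')
print(type(accents), len(accents))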
dictionary/accents_nn.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8395664000b80c1afe09bfea3650945b0933482b8e3dee5bb9d429eb18c44935
+size 845996
dictionary/omographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04a9e81c68d65f65ba493fe0110f99e79087548c2beeec3032e2b66e28706f36
+size 219047
dictionary/rule_engine/accents.json
ADDED
The diff for this file is too large to render.
dictionary/rule_engine/forms.json
ADDED
@@ -0,0 +1,85 @@
+{
+    "diminutive": "",
+    "perfective/imperfective": "Aspect=Perf|Aspect=Imp",
+    "dative/prepositional": "Case=Dat|Case=Prep",
+    "inanimate": "Animacy=Inan",
+    "animate/inanimate": "Animacy=Anim|Animacy=Inan",
+    "dative": "Case=Dat",
+    "second-person": "Person=2",
+    "imperative": "Mood=Imp",
+    "archaic": "",
+    "nominative": "Case=Nom",
+    "locative": "Case=Loc",
+    "masculine": "Gender=Masc",
+    "female": "",
+    "canonical": "",
+    "plural": "Number=Plur",
+    "short": "Variant=Short",
+    "imperfective": "Aspect=Imp",
+    "form": "",
+    "augmentative": "",
+    "masculine/feminine": "Gender=Masc|Gender=Fem",
+    "superlative": "Degree=Sup",
+    "nominative/accusative": "Case=Nom|Case=Acc",
+    "third-person": "Person=3",
+    "nonstandard": "",
+    "genitive": "Case=Gen",
+    "feminine": "Gender=Fem",
+    "masculine/neuter": "Gender=Masc|Gender=Neut",
+    "dative/locative": "Case=Dat|Case=Loc",
+    "genitive/accusative/prepositional": "Case=Gen|Case=Acc|Case=Prep",
+    "partitive": "Case=Par",
+    "genitive/prepositional": "Case=Gen|Case=Prep",
+    "equivalent": "",
+    "endearing": "",
+    "degree": "Degree=",
+    "comparative": "Degree=Cmp",
+    "imperfective/perfective": "Aspect=Imp|Aspect=Perf",
+    "mainly": "",
+    "passive": "Voice=Pass",
+    "first-person": "Person=1",
+    "perfective": "Aspect=Perf",
+    "genitive/dative/instrumental/prepositional": "Case=Gen|Case=Dat|Case=Ins|Case=Prep",
+    "pejorative": "",
+    "accusative": "Case=Acc",
+    "spelling": "",
+    "dative/partitive": "Case=Dat|Case=Par",
+    "old-fashion": "",
+    "possessive": "Poss=Yes",
+    "dative/instrumental": "Case=Dat|Case=Ins",
+    "adverbial": "",
+    "neuter": "Gender=Neut",
+    "future": "Tense=Fut",
+    "neuter/masculine": "Gender=Neut|Gender=Masc",
+    "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+    "(singular": "Number=Sing",
+    "alternative,": "",
+    "participle": "VerbForm=Part",
+    "genitive/accusative": "Case=Gen|Case=Acc",
+    "indicative": "Mood=Ind",
+    "dative/accusative": "Case=Dat|Case=Acc",
+    "singular/plural": "Number=Sing|Number=Plur",
+    "instrumental": "Case=Ins",
+    "&": "",
+    "vocative": "Case=Voc",
+    "prepositional": "Case=Prep",
+    "active": "Voice=Act",
+    "inanimate/animate": "Animacy=Inan|Animacy=Anim",
+    "past": "Tense=Past",
+    "first/second/third-person": "Person=1|Person=2|Person=3",
+    "second-personal": "Person=2",
+    "reflexive": "Reflex=Yes",
+    "singular": "Number=Sing",
+    "accusative/genitive": "Case=Acc|Case=Gen",
+    "acronym": "",
+    "(animated)": "Animacy=Anim",
+    "euphemistic": "",
+    "genitive/dative/prepositional": "Case=Gen|Case=Dat|Case=Prep",
+    "colloquial": "",
+    "a": "",
+    "initialism": "",
+    "present": "Tense=Pres",
+    "obsolete": "",
+    "singulative": "",
+    "animate": "Animacy=Anim"
+}
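forms.json maps the form labels used by the rule engine's dictionaries (Wiktionary-style labels such as "dative", "plural", "perfective/imperfective") to Universal Dependencies feature strings, with empty values for labels that carry no UD feature. The sketch below shows one way such a mapping could be applied; the function name and the idea of splitting a mapped value on "|" into alternatives are illustrative assumptions, not part of the uploaded code.

import json

with open('dictionary/rule_engine/forms.json', 'r', encoding='utf-8') as f:
    label2feats = json.load(f)

def labels_to_features(labels):
    # Collect UD feature alternatives for each known, non-empty label.
    feats = []
    for label in labels:
        mapped = label2feats.get(label, '')
        if mapped:
            # "Case=Nom|Case=Acc" means either case is acceptable.
            feats.append(mapped.split('|'))
    return feats

print(labels_to_features(['plural', 'nominative/accusative']))
# -> [['Number=Plur'], ['Case=Nom', 'Case=Acc']]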
dictionary/yo_homographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4ee777bbbab87f9eac838f370ad92974e079d02b21903e480c54b5f0c8c60d1
+size 5747
dictionary/yo_omographs.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b91cc78dacb5a43e4d5e2e62efdbe5a57799195e5868db35282bee0d9e215a0d
+size 7949
dictionary/yo_words.json.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a19fa89a964a0691d9fe4ee384783e3934904891843d8f59a1c480d67947a82a
+size 548914
koziev/rulemma/rulemma.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf2b3ef3ff7a0aa6e4250aa4e9c8ed568e25f825deebdb12dee1b46b785ba9fc
+size 16703198
koziev/rulemma/rulemma.py
ADDED
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+"""
+Lemmatizer for R&D prototyping of NLP tasks in Python
+25.03.2020 added 'ё' -> 'е' normalization in get_lemma2
+05.04.2020 added decoding for the parts of speech CONJ, PART and PUNCT
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import pickle
+import pathlib
+import gzip
+
+
+def decode_pos(pos):
+    if pos in [u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ']:
+        return u'ГЛАГОЛ'
+    else:
+        return pos
+
+
+class Lemmatizer(object):
+    def __init__(self):
+        pass
+
+    def load(self, dict_path=None):
+        """ Load the lemmatization model built by the separate builder.py script """
+        dict_filename = 'rulemma.dat'
+        if dict_path is None:
+            module_folder = str(pathlib.Path(__file__).resolve().parent)
+            p = os.path.join(module_folder, '../tmp', dict_filename)
+            if not os.path.exists(p):
+                p = os.path.join(module_folder, dict_filename)
+        else:
+            p = dict_path
+
+        with gzip.open(p, 'r') as f:
+            self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f)
+
+    def get_lemma(self, word):
+        if word in self.forms:
+            return self.forms[word]
+        elif word in self.forms2:
+            return self.forms2[word][0]
+        elif word in self.special_lemmas:
+            return self.special_lemmas[word]
+        else:
+            return word
+
+    def decode_pos_tags(self, pos_tags):
+        stags1 = []
+        part_of_speech = u'unk'
+        short_tag_index = -1
+        for tag in pos_tags.split('|'):
+            if tag == 'NOUN':
+                part_of_speech = u'СУЩЕСТВИТЕЛЬНОЕ'
+            elif tag == 'VERB':
+                part_of_speech = u'ГЛАГОЛ'
+            elif tag == 'ADJ':
+                part_of_speech = u'ПРИЛАГАТЕЛЬНОЕ'
+                stags1.append((u'КРАТКИЙ', u'0'))
+                short_tag_index = 0
+            elif tag == 'ADV':
+                part_of_speech = u'НАРЕЧИЕ'
+            elif tag == 'PRON':
+                part_of_speech = u'МЕСТОИМЕНИЕ'
+            elif tag == 'ADP':
+                part_of_speech = u'ПРЕДЛОГ'
+            elif tag == 'CONJ':
+                part_of_speech = u'СОЮЗ'
+            elif tag == 'PART':
+                part_of_speech = u'ЧАСТИЦА'
+            elif tag == 'PUNCT':
+                part_of_speech = u'ПУНКТУАТОР'
+            elif '=' in tag:
+                if part_of_speech == u'СУЩЕСТВИТЕЛЬНОЕ':
+                    if tag == u'Case=Nom':
+                        stags1.append((u'ПАДЕЖ', u'ИМ'))
+                    elif tag == u'Case=Acc':
+                        stags1.append((u'ПАДЕЖ', u'ВИН'))
+                    elif tag == u'Case=Dat':
+                        stags1.append((u'ПАДЕЖ', u'ДАТ'))
+                    elif tag == u'Case=Ins':
+                        stags1.append((u'ПАДЕЖ', u'ТВОР'))
+                    elif tag == u'Case=Prep':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
+                    elif tag == u'Case=Loc':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))  # 03-02-2020 u'МЕСТ'
+                    elif tag == u'Case=Gen':
+                        stags1.append((u'ПАДЕЖ', u'РОД'))
+                    elif tag == u'Case=Voc':
+                        stags1.append((u'ПАДЕЖ', u'ЗВАТ'))
+                    elif tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    else:
+                        print(u'неизвестный тэг "{}"'.format(tag))
+                        raise NotImplementedError()
+                elif part_of_speech == u'ПРИЛАГАТЕЛЬНОЕ':
+                    if tag == u'Case=Nom':
+                        stags1.append((u'ПАДЕЖ', u'ИМ'))
+                    elif tag == u'Case=Acc':
+                        stags1.append((u'ПАДЕЖ', u'ВИН'))
+                    elif tag == u'Case=Dat':
+                        stags1.append((u'ПАДЕЖ', u'ДАТ'))
+                    elif tag == u'Case=Ins':
+                        stags1.append((u'ПАДЕЖ', u'ТВОР'))
+                    elif tag == u'Case=Prep':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))
+                    elif tag == u'Case=Loc':
+                        stags1.append((u'ПАДЕЖ', u'ПРЕДЛ'))  # 03-02-2020 u'МЕСТ'
+                    elif tag == u'Case=Gen':
+                        stags1.append((u'ПАДЕЖ', u'РОД'))
+                    elif tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    elif tag == u'Degree=Cmp':
+                        stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
+                    elif tag == u'Degree=Pos':
+                        stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
+                    elif tag in (u'Variant=Short', u'Variant=Brev'):
+                        stags1[short_tag_index] = (u'КРАТКИЙ', u'1')
+                    else:
+                        print(u'неизвестный тэг "{}"'.format(tag))
+                        raise NotImplementedError()
+                elif part_of_speech == u'ГЛАГОЛ':
+                    if tag == u'Number=Sing':
+                        stags1.append((u'ЧИСЛО', u'ЕД'))
+                    elif tag == u'Number=Plur':
+                        stags1.append((u'ЧИСЛО', u'МН'))
+                    elif tag == u'Gender=Masc':
+                        stags1.append((u'РОД', u'МУЖ'))
+                    elif tag == u'Gender=Fem':
+                        stags1.append((u'РОД', u'ЖЕН'))
+                    elif tag == u'Gender=Neut':
+                        stags1.append((u'РОД', u'СР'))
+                    elif tag == u'Mood=Ind':
+                        stags1.append((u'НАКЛОНЕНИЕ', u'ИЗЪЯВ'))
+                    elif tag == u'Mood=Imp':
+                        stags1.append((u'НАКЛОНЕНИЕ', u'ПОБУД'))
+                    elif tag == u'Tense=Past':
+                        stags1.append((u'ВРЕМЯ', u'ПРОШЕДШЕЕ'))
+                    elif tag == u'Tense=Fut':
+                        stags1.append((u'ВРЕМЯ', u'БУДУЩЕЕ'))
+                    elif tag == u'Tense=Notpast':
+                        stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
+                    elif tag == u'Tense=Pres':
+                        stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ'))
+                    elif tag == u'Person=1':
+                        stags1.append((u'ЛИЦО', u'1'))
+                    elif tag == u'Person=2':
+                        stags1.append((u'ЛИЦО', u'2'))
+                    elif tag == u'Person=3':
+                        stags1.append((u'ЛИЦО', u'3'))
+                    elif tag == u'VerbForm=Fin':
+                        pass
+                    elif tag == u'VerbForm=Inf':
+                        pass
+                    elif tag == u'VerbForm=Conv':
+                        pass
+                    else:
+                        msg = u'неизвестный тэг "{}"'.format(tag)
+                        raise RuntimeError(msg)
+                elif part_of_speech == u'НАРЕЧИЕ':
+                    if tag == u'Degree=Pos':
+                        stags1.append((u'СТЕПЕНЬ', u'АТРИБ'))
+                    elif tag == u'Degree=Cmp':
+                        stags1.append((u'СТЕПЕНЬ', u'СРАВН'))
+                    else:
+                        raise NotImplementedError()
+                else:
+                    pass
+
+        return part_of_speech, stags1
+
+    def get_lemma2(self, word, pos_tags):
+        part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags)
+
+        nword = word.lower().replace('ё', 'е')
+
+        if nword in self.special_lemmas:
+            return self.special_lemmas[nword], part_of_speech, decoded_tags
+
+        if nword in self.forms:
+            lemma = self.forms[nword]
+            return lemma, part_of_speech, decoded_tags
+        elif nword in self.forms2:
+            if part_of_speech == 'СУЩЕСТВИТЕЛЬНОЕ':
+                # For nouns, take the case into account.
+                required_case = None
+                for tag in decoded_tags:
+                    if tag[0] == 'ПАДЕЖ':
+                        required_case = tag[1]
+                        break
+
+                for lemma, lemma_part_of_speech, tag in self.forms2[nword]:
+                    if lemma_part_of_speech == part_of_speech and tag == required_case:
+                        return lemma, part_of_speech, decoded_tags
+            else:
+                for lemma, lemma_part_of_speech, tags in self.forms2[nword]:
+                    if lemma_part_of_speech == part_of_speech:
+                        return lemma, part_of_speech, decoded_tags
+        elif len(word) > 4:
+            # use the lemmatization model for OOV words
+            ending = nword[-4:]
+            key = ending + u'|' + part_of_speech
+            if key in self.key2transducer:
+                transducer = self.key2transducer[key]
+                if transducer[0] > 0:
+                    lemma = word[:-transducer[0]] + transducer[1]
+                else:
+                    lemma = word + transducer[1]
+
+                return lemma.lower(), part_of_speech, decoded_tags
+
+        # fallback: return the original word in lower case as the lemma
+        return nword, part_of_speech, decoded_tags
+
+    def lemmatize(self, tagged_words):
+        """For rupostagger output, add the lemma and the extracted part-of-speech code"""
+        return [(word, tags,)+tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]
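Lemmatizer.lemmatize() is written to consume rupostagger output: a list of (word, tags) pairs where tags is a '|'-joined string of a POS label plus Case=/Number=/... features, and each result row is the 5-tuple (word, tags, lemma, part_of_speech, decoded_tags). A minimal usage sketch follows; the import paths assume the koziev folder is importable as a package and that rulemma.dat and the tagger data have been pulled from LFS, so adjust them to your setup. The example sentence and whitespace tokenization are only illustrative.

from koziev.rupostagger import RuPosTagger        # assumed import path
from koziev.rulemma.rulemma import Lemmatizer     # assumed import path

tagger = RuPosTagger()
tagger.load()            # reads rupostagger.config, rupostagger.model and the ruword2tags data

lemmatizer = Lemmatizer()
lemmatizer.load()        # reads rulemma.dat next to the module

tokens = u'Кошки спят'.split()
tagged = list(tagger.tag(tokens))                 # [(word, 'NOUN|Case=Nom|...'), ...]
for word, tags, lemma, pos, feats in lemmatizer.lemmatize(tagged):
    print(word, lemma, pos, feats)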
koziev/rupostagger/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+from .rupostagger import RuPosTagger
+from .rupostagger import run_tests
koziev/rupostagger/database/ruword2tags.db
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a06848e656bef642aafb4440c03554fa78f2f32dde92ea66f3f86ce9977b167e
+size 168816640
koziev/rupostagger/rupostagger.config
ADDED
@@ -0,0 +1,11 @@
+{
+    "w2v_filename": "w2v.CBOW=1_WIN=5_DIM=64.bin",
+    "wc2v_filename": "wordchar2vector.dat",
+    "winspan": 3,
+    "use_w2v": false,
+    "use_gren": true,
+    "use_syllabs": false,
+    "use_shingles": false,
+    "ending_len": 0,
+    "model_filename": "rupostagger.model"
+}
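Read against RuPosTagger.load() in rupostagger.py below, these config fields act as feature switches: winspan is the half-width of the token window used when building CRF features, use_gren enables grammatical-dictionary features from ruword2tags, use_syllabs enables syllable features from rusyllab, and ending_len controls word-ending features (0 disables them). use_w2v is read but the word2vec features themselves are not built in the code shown, and use_shingles and the w2v/wc2v/model filename entries are not referenced by this loader (the model path is hard-coded to rupostagger.model). This reading is inferred from the loader code in this commit, not from separate documentation.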
koziev/rupostagger/rupostagger.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21b7b0bfd7427b5fdc1604052176db8aa3b139b3ce03be440cfce48536f8e5ef
+size 2417464
koziev/rupostagger/rupostagger.py
ADDED
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+"""
+Part-of-speech tagging model for Russian-language texts (project https://github.com/Koziev/rupostagger)
+03.08.2019 small bug with normalization (replacing "ё" with "е") before lookup in the grammatical dictionary
+"""
+
+from __future__ import print_function
+from __future__ import division  # for python2 compatibility
+
+import os
+import json
+import pathlib
+import re
+
+import pycrfsuite
+from .ruword2tags import RuWord2Tags
+from .rusyllab import split_word
+
+
+BEG_TOKEN = '<beg>'
+END_TOKEN = '<end>'
+
+token2tag = {BEG_TOKEN: BEG_TOKEN, END_TOKEN: END_TOKEN}
+
+
+def is_num(token):
+    return re.match('^[0-9]+$', token)
+
+
+class RuPosTagger(object):
+    def __init__(self):
+        self.winspan = -1
+        self.use_w2v = -1
+        self.use_syllabs = -1
+        self.ending_len = -1
+        self.word2tags = None
+
+    def load(self, word2tags_path=None):
+        module_folder = str(pathlib.Path(__file__).resolve().parent)
+        data_folder = os.path.join(module_folder, '../tmp')
+
+        config_path = os.path.join(data_folder, 'rupostagger.config')
+        if not os.path.exists(config_path):
+            data_folder = module_folder
+            config_path = os.path.join(data_folder, 'rupostagger.config')
+
+        #print('DEBUG@47 module_folder={}'.format(module_folder))
+        #print('DEBUG@48 data_folder={}'.format(data_folder))
+
+        with open(config_path, 'r') as rdr:
+            self.config = json.load(rdr)
+            self.winspan = self.config['winspan']
+            self.use_gren = self.config['use_gren']
+            self.use_w2v = self.config['use_w2v']
+            self.use_syllabs = self.config['use_syllabs']
+            self.ending_len = self.config['ending_len']
+
+        self.word2tags = RuWord2Tags()
+        self.word2tags.load(word2tags_path)
+
+        model_path = os.path.join(data_folder, 'rupostagger.model')
+        self.tagger = pycrfsuite.Tagger()
+        self.tagger.open(model_path)
+
+    @staticmethod
+    def __normalize_word(word):
+        return word.replace(' - ', '-').replace(u'ё', u'е').lower()
+
+    def get_word_features(self, word, prefix):
+        assert(len(word) > 0)
+        features = []
+        if word in token2tag:
+            features.append((u'tag[{}]={}'.format(prefix, token2tag[word]), 1.0))
+        elif is_num(word):
+            features.append((u'tag[{}]=<num> tag[{}]=<num_{}>'.format(prefix, prefix, word[-1]), 1.0))
+        elif len(word) == 1 and word[0] in u'‼≠™®•·[¡+<>`~;.,‚?!-…№”“„{}|‹›/\'"–—_:«»*]()‘’≈':
+            features.append((u'tag[{}]=punct_{}'.format(prefix, ord(word[0])), 1.0))
+        else:
+            uword = self.__normalize_word(word)
+            first_char = word[0]
+            if first_char in u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+                features.append((u'word[{}]=<latin>'.format(prefix), 1.0))
+            else:
+                if first_char in u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ':
+                    features.append((u'word[{}]=<upper1>'.format(prefix), 1.0))
+
+            if self.ending_len > 0:
+                ending = '~' + uword[-self.ending_len:] if len(uword) > self.ending_len else uword
+                features.append((u'ending[{}]={}'.format(prefix, ending), 1.0))
+
+            if self.use_syllabs and first_char.lower() in u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя':
+                syllabs = split_word(uword)
+                if len(syllabs) > 0:
+                    if len(syllabs) == 1:
+                        features.append((u'slb[{}]={}'.format(prefix, syllabs[0] + '~'), 1.0))
+                    else:
+                        features.append((u'slb[{}]={}'.format(prefix, syllabs[0]+'~'), 1.0))
+                        for s in syllabs[1:-1]:
+                            features.append((u'slb[{}]={}'.format(prefix, '~'+s+'~'), 1.0))
+                        features.append((u'slb[{}]={}'.format(prefix, '~'+syllabs[-1]), 1.0))
+
+            if self.use_gren:
+                tags = set()
+                for tagset in self.word2tags[uword]:
+                    tags.update(tagset.split(' '))
+
+                for tag in tags:
+                    features.append((u'tag[{}]={}'.format(prefix, tag), 1.0))
+
+        return features
+
+    def vectorize_sample(self, words):
+        lines2 = []
+        nb_words = len(words)
+        for iword, word in enumerate(words):
+            word_features = dict()
+            for j in range(-self.winspan, self.winspan + 1):
+                iword2 = iword + j
+                if iword2 < 0:
+                    features = [('word[{}]=<beg>'.format(j), 1.0)]
+                elif iword2 >= nb_words:
+                    features = [('word[{}]=<end>'.format(j), 1.0)]
+                else:
+                    features = self.get_word_features(words[iword2], str(j))
+                word_features.update(features)
+
+            lines2.append(word_features)
+
+        return lines2
+
+    def tag(self, words):
+        #X = self.vectorize_sample([BEG_TOKEN]+words+[END_TOKEN])
+        X = self.vectorize_sample(words)
+        y_pred = self.tagger.tag(X)
+        #return zip(words, y_pred[1: -1])
+        return zip(words, y_pred)
+
+
+def test1(tagger, phrase, required_labels):
+    pred_labels = list(tagger.tag(phrase.split()))
+    assert(len(required_labels.split()) == len(pred_labels))
+    for required_label, (word, pred_label) in zip(required_labels.split(), pred_labels):
+        for tag in required_label.split('|'):
+            if tag not in pred_label:
+                print(u'Error: phrase={} word={} required_label={} pred_label={}'.format(phrase, word, required_label, pred_label))
+                return False
+
+    return True
+
+
+def run_tests():
+    tagger = RuPosTagger()
+    tagger.load()
+
+    for phrase, required_labels in [(u'Кошки спят', u'NOUN|Number=Plur|Case=Nom VERB|Mood=Ind|Number=Plur|Person=3|Tense=Notpast|VerbForm=Fin'),
+                                    (u'Я рою колодец', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Я мою окно', u'PRON VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Ира мыла окно', u'NOUN|Case=Nom VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'Возьми мою пилу', u'VERB ADJ|Case=Acc NOUN|Case=Acc'),
+                                    (u'рой колодец', u'VERB NOUN|Number=Sing|Case=Acc'),
+                                    (u'У меня живёт черепаха', u'ADP PRON VERB NOUN'),
+                                    (u'какую еду ты любишь ?', u'ADJ NOUN PRON VERB PUNCT')
+                                    ]:
+        if not test1(tagger, phrase, required_labels):
+            print('Tests FAILED')
+            return
+
+    print('Tests PASSED OK')
+
+
+if __name__ == '__main__':
+    run_tests()
+
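run_tests() above doubles as a usage example: RuPosTagger.load() locates rupostagger.config, rupostagger.model and the ruword2tags data relative to the module, and tag() takes a pre-tokenized list of words and yields (word, label) pairs whose labels are '|'-joined strings such as the NOUN|Number=Plur|Case=Nom expected for the first token of the test sentence. A minimal sketch, assuming the package is importable as koziev.rupostagger and the LFS data files have been fetched (tokenization by str.split() is the caller's responsibility):

from koziev.rupostagger import RuPosTagger   # assumed import path

tagger = RuPosTagger()
tagger.load()
for word, label in tagger.tag(u'Кошки спят'.split()):
    print(word, label)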
koziev/rupostagger/rusyllab.py
ADDED
@@ -0,0 +1,589 @@
+# -*- coding: utf-8 -*-
+
+# autogenerated 2019-01-19 10:52:09.746954
+
+
+def V(c):
+    return c in u"АЕЁИОУЫЭЮЯаеёиоуыэюя"
+
+
+def C(c):
+    return c in u"БВГДЖЗКЛМНПРСТФХЦЧШЩбвгджзклмнпрстфхцчшщ"
+
+
+def S(c):
+    return c in u"Йй"
+
+
+def M(c):
+    return c in u"ЪЬъь"
+
+
+def BEG(c):
+    return c == u"["
+
+
+def END(c):
+    return c == u"]"
+
+
+def split(s):
+    cur_pos = 0
+    items = list(u"[" + s + u"]")
+    while cur_pos < len(items):
+        input_context = items[cur_pos:]
+        res = apply1(input_context)
+        if res is None:
+            cur_pos += 1
+        else:
+            items = items[:cur_pos] + res[0] + input_context[res[1]:]
+            cur_pos += res[2]
+    return items[1:-1]
+
+
+def apply1(s):
+    if C(s[0]):
+        if V(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_1
+
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_5
+
+                    if C(s[4]):
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_11
+
+                            if not END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_12
+
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_36
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_120
+
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_330
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_52
+
+                    if M(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_76
+
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_250
+
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_260
+
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_6
+
+                if M(s[3]):
+                    if C(s[4]):
+                        if not END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_13
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_39
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_350
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_14
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_20
+
+
+
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_7
+
+            if S(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_8
+
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_9
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_280
+
+                    if M(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_400
+
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_10
+
+                return ([s[0]+s[1]+s[2]], 3, 1) # SYLLABER_64
+
+            if V(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_31
+
+
+        if C(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    if C(s[4]):
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_2
+
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_310
+
+
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_3
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_4
+
+                        if M(s[5]):
+                            if C(s[6]):
+                                if M(s[7]):
+                                    if END(s[8]):
+                                        return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6]+s[7], s[8]], 9, 1) # SYLLABER_300
+
+
+
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_200
+
+
+                    if S(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_54
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_68
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_170
+
+                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_210
+
+                if C(s[3]):
+                    if V(s[4]):
+                        if S(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1) # SYLLABER_220
+
+                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1) # SYLLABER_98
+
+
+
+            if V(s[2]):
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_15
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_370
+
+
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_80
+
+                        if M(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1) # SYLLABER_340
+
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_390
+
+
+
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_470
+
+
+                    if M(s[4]):
+                        if not C(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_21
+
+                        if C(s[5]):
+                            if V(s[6]):
+                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1) # SYLLABER_48
+
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6], s[7]], 8, 1) # SYLLABER_240
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_62
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_230
+
+
+                if V(s[3]):
+                    if C(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_17
+
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_82
+
+                if S(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_33
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_92
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1) # SYLLABER_450
+
+
+
+
+                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1) # SYLLABER_190
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_66
+
+
+            if M(s[2]):
+                if V(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_410
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_480
+
+
+
+
+
+        if M(s[1]):
+            if V(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_16
+
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_19
+
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1) # SYLLABER_290
+
+                        if C(s[5]):
+                            if C(s[6]):
+                                if V(s[7]):
+                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1) # SYLLABER_430
+
+
+
+
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_22
+
+
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_94
+
+
+            if C(s[2]):
+                if V(s[3]):
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1) # SYLLABER_320
+
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_360
+
+
+
+
+
+
+    if V(s[0]):
+        if C(s[1]):
+            if C(s[2]):
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_18
+
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_28
+
+                if C(s[3]):
+                    if V(s[4]):
+                        if C(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_96
+
+                        return ([s[0]+s[1], s[2], s[3], s[4]], 5, 1) # SYLLABER_50
+
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_460
+
+
+
+                if M(s[3]):
+                    if END(s[4]):
+                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1) # SYLLABER_72
+
+
+
+            if V(s[2]):
+                return ([s[0], s[1], s[2]], 3, 1) # SYLLABER_35
+
+            if M(s[2]):
+                if END(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_40
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1) # SYLLABER_42
+
+
+                    if V(s[4]):
+                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1) # SYLLABER_84
+
+
+                if V(s[3]):
+                    return ([s[0]+s[1]+s[2], s[3]], 4, 1) # SYLLABER_78
+
+
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_44
+
+            return ([s[0]+s[1]], 2, 1) # SYLLABER_56
+
+        if END(s[1]):
+            return ([s[0], s[1]], 2, 1) # SYLLABER_30
+
+        if V(s[1]):
+            return ([s[0], s[1]], 2, 1) # SYLLABER_34
+
+        if S(s[1]):
+            if END(s[2]):
+                return ([s[0]+s[1], s[2]], 3, 1) # SYLLABER_46
+
+            if C(s[2]):
+                if V(s[3]):
+                    return ([s[0]+s[1], s[2], s[3]], 4, 1) # SYLLABER_180
+
+
+
+
+
+    if BEG(s[0]):
+        if C(s[1]):
+            if C(s[2]):
+                if V(s[3]):
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_23
+
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_60
+
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_74
+
+
+
+
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_24
+
+
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_27
+
+
+                if END(s[3]):
+                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_70
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if V(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_88
+
+
+
+
+                    if V(s[4]):
+                        if C(s[5]):
+                            if M(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_90
+
+
+
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_140
+
+
+
+
+            if V(s[2]):
+                if C(s[3]):
+                    if C(s[4]):
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_26
+
+
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_37
+
+
+                    if M(s[4]):
+                        if C(s[5]):
+                            if C(s[6]):
+                                if END(s[7]):
+                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2) # SYLLABER_440
+
+
+
+
+
+                if S(s[3]):
+                    if C(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_160
+
+
+
+
+            if END(s[2]):
+                return ([s[0], s[1], s[2]], 3, 2) # SYLLABER_32
+
+            if M(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_58
+
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_100
+
+                            if V(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 2) # SYLLABER_420
+
+
+
+
+                if V(s[3]):
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_86
+
+                    if S(s[4]):
+                        if END(s[5]):
+                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2) # SYLLABER_110
+
+
+                    if C(s[4]):
+                        if M(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_150
+
+
+
+
+
+
+        if V(s[1]):
+            if C(s[2]):
+                if M(s[3]):
+                    if END(s[4]):
+                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2) # SYLLABER_25
+
+
+                if END(s[3]):
+                    return ([s[0], s[1]+s[2], s[3]], 4, 2) # SYLLABER_29
+
+                if C(s[3]):
+                    if C(s[4]):
+                        if C(s[5]):
+                            if END(s[6]):
+                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2) # SYLLABER_130
+
+
+
+
+
+
+        if S(s[1]):
+            if V(s[2]):
+                if C(s[3]):
+                    if V(s[4]):
+                        return ([s[0], s[1]+s[2], s[3], s[4]], 5, 2) # SYLLABER_380
+
+
+
+
+
+
+if __name__ == "__main__":
+    sx = split(u"спросил")
+    print(u"|".join(sx))
+
+def split_word(word):
+    """
+    Split a single word into syllables
+    :param word: unicode string representing Russian word
+    :return: list of unicode strings for syllables
+    """
+    return split(word)
+
+
+def split_words(words):
+    """
+    Split the words in a list into a contiguous list of syllables and word separators (single space chars)
+    :param words: list of words (unicode strings)
+    :return: list of tokens - syllables and spaces
+    """
+    tokens = []
+    for word in words:
+        sx = split(word)
+        if len(tokens) > 0:
+            tokens.append(u' ')
+        tokens.extend(sx)
+    return tokens
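rusyllab.split() wraps the input word in '[' and ']' sentinel characters and repeatedly applies the autogenerated rule function apply1() to merge characters into syllables; split_word() and split_words() are thin wrappers over it, and the __main__ block above shows the intended call pattern. A minimal usage sketch (the exact syllable boundaries depend on the generated rules, so outputs are not reproduced here; the import path assumes the koziev folder is importable as a package):

from koziev.rupostagger.rusyllab import split_word, split_words   # assumed import path

print(split_word(u'спросил'))               # list of syllables for one word
print(split_words(u'кошки спят'.split()))   # syllables for several words, separated by ' ' tokens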
koziev/rupostagger/ruword2tags.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde47b5f1d48ff899887ac07812dcabd2966e48e84646f3065bfd06627c2af58
+size 9683765
koziev/rupostagger/ruword2tags.py
ADDED
@@ -0,0 +1,391 @@
+# -*- coding: utf-8 -*-
+"""
+19.04.2019 - when parsing the Solarix dictionary database, word forms with
+a negative score (unusable words) are skipped.
+
+26-10-2019 - switched to storing part of the dictionary database in SQLite3
+
+17-06-2020 refs #1 an error occurred when running from several threads, added check_same_thread=False
+
+13.06.2022 if the ruword2tags.db database file is missing, download it and keep it in the user's home directory
+"""
+
+import gzip
+import pathlib
+import os
+import pickle
+import io
+import argparse
+import sqlite3
+import threading
+
+
+def create_trie_node(char):
+    return char, [], dict()
+
+
+def add_to_trie_node(node, next_chars, tagset_index):
+    if len(next_chars) == 0:
+        node[1].append(tagset_index)
+    else:
+        next_char = next_chars[0]
+        if next_char not in node[2]:
+            node[2][next_char] = create_trie_node(next_char)
+
+        add_to_trie_node(node[2][next_char], next_chars[1:], tagset_index)
+
+
+def find_tagsets_in_trie_node(node, word):
+    if word:
+        found_tagsets = []
+        next_char = word[0]
+        if next_char in node[2]:
+            found_tagsets.extend(find_tagsets_in_trie_node(node[2][next_char], word[1:]))
+        return found_tagsets
+    else:
+        return node[1]
+
+
+def trie_constructed(trie_node, tagset2id):
+    tagset = tuple(sorted(trie_node[1]))
+    if tagset in tagset2id:
+        id_tagsets = tagset2id[tagset]
+    else:
+        id_tagsets = len(tagset2id) + 1
+        tagset2id[tagset] = id_tagsets
+
+    new_children = dict()
+    for next_char, child in trie_node[2].items():
+        new_children[next_char] = trie_constructed(child, tagset2id)
+
+    return (trie_node[0], id_tagsets, new_children)
+
+
+
+class RuWord2Tags:
+    dict_filename = 'ruword2tags.dat'
+
+    def __init__(self):
+        self.ending_len = None
+        self.index2tagset = None
+        self.ending2tagsets = None
+        self.trie_root = None
+        self.all_ending2tagsets = None
+        self.trie_tagsets = None
+        self.db_filepath = None
+        self.cnx = None
+        self.lock = threading.Lock()
+        self.word2tagsets_cache = dict()
+
+    def load(self, dict_path=None):
+        module_folder = str(pathlib.Path(__file__).resolve().parent)
+        self.db_filepath = os.path.join(module_folder, 'database', 'ruword2tags.db')
+        try:
+            # 17-06-2020 refs #1 an error occurred when running from several threads, added check_same_thread=False
+            self.cnx = sqlite3.connect(self.db_filepath, check_same_thread=False)
+        except Exception as ex:
+            msg = u'Could not open db file "{}", error: {}'.format(self.db_filepath, ex)
+            raise RuntimeError(msg)
+
+        self.cnx.isolation_level = None
+        self.cur = self.cnx.cursor()
+
+        with open(os.path.join(module_folder,"ruword2tags.dat"), 'rb') as f:
+            data = pickle.load(f)
+            self.ending_lens = data['ending_lens']
+            self.index2tagset = data['index2tagset']
+            self.ending2tagsets = data['ending2tagsets']
+            self.all_ending2tagsets = data['all_ending2tagsets']
+            self.id2tagsets = data['id2tagsets']
+
+        if False:
+            trie_filepath = os.path.join(os.path.dirname(p), 'ruword2tags_trie.dat')
+            with gzip.open(trie_filepath, 'r') as f:
+                self.trie_root = pickle.load(f)
+
+
+    def __getitem__(self, word):
+        hit = False
+        for ending_len in self.ending_lens:
+            ending = word[-ending_len:] if len(word) > ending_len else u''
+            if ending in self.ending2tagsets:
+                for itagset in self.ending2tagsets[ending]:
+                    yield self.index2tagset[itagset]
+                hit = True
+                break
+
+        if not hit:
+            #for itagset in find_tagsets_in_trie_node(self.trie_root, word):
+            #    hit = True
+            #    yield self.index2tagset[itagset]
+
+            if word in self.word2tagsets_cache:
+                id_tagsets = self.word2tagsets_cache[word]
+                for itagset in self.id2tagsets[id_tagsets]:
+                    yield self.index2tagset[itagset]
+                hit = True
+            else:
+                with self.lock:  # for multi-threaded use in the chatbot
+                    for r in self.cur.execute('SELECT id_tagsets FROM word_tagsets WHERE word=:word', {'word': word}):
+                        id_tagsets = int(r[0])
+                        self.word2tagsets_cache[word] = id_tagsets
+                        for itagset in self.id2tagsets[id_tagsets]:
+                            yield self.index2tagset[itagset]
+                        hit = True
+
+        if not hit:
+            for ending_len in reversed(self.ending_lens):
+                ending = word[-ending_len:] if len(word) > ending_len else u''
+                if ending in self.all_ending2tagsets:
+                    for itagset in self.all_ending2tagsets[ending]:
+                        yield self.index2tagset[itagset]
+                    hit = True
+                    break
+
+
+def run_tests(dict_path=None):
+    print('Start testing...')
+    word2tags = RuWord2Tags()
+    word2tags.load(dict_path)
+
+    cases = [(u'очень', [u'НАРЕЧИЕ СТЕПЕНЬ=АТРИБ ТИП_МОДИФ=ГЛАГ ТИП_МОДИФ=НАРЕЧ ТИП_МОДИФ=ПРИЛ']),
+             (u'поскорее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
+             (u'поскорей', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН ТИП_МОДИФ=ГЛАГ']),
+             (u'сильнее', [u'НАРЕЧИЕ СТЕПЕНЬ=СРАВН', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 СТЕПЕНЬ=СРАВН']),
+             (u'синее', [u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ВИН РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД', u'ПРИЛАГАТЕЛЬНОЕ КРАТКИЙ=0 ПАДЕЖ=ИМ РОД=СР СТЕПЕНЬ=АТРИБ ЧИСЛО=ЕД']),
+             (u'трахее', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ДАТ РОД=ЖЕН ЧИСЛО=ЕД', u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ПРЕДЛ РОД=ЖЕН ЧИСЛО=ЕД']),
+             (u'полдня', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ИМ ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
+                          u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=ВИН ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД',
+                          u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=НЕОДУШ ПАДЕЖ=РОД ПЕРЕЧИСЛИМОСТЬ=НЕТ РОД=МУЖ ЧИСЛО=ЕД'
+                          ]),
+             (u'а', [u'СОЮЗ', u'ЧАСТИЦА']),
+             (u'кошки', [u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН',
+                         u'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=РОД РОД=ЖЕН ЧИСЛО=ЕД']),
+             (u'на', [#u'ГЛАГОЛ ВИД=НЕСОВЕРШ ЛИЦО=2 НАКЛОНЕНИЕ=ПОБУД ТИП_ГЛАГОЛА=СТАТИЧ ЧИСЛО=ЕД',
+                      u'ПРЕДЛОГ ПАДЕЖ=ВИН ПАДЕЖ=МЕСТ ПАДЕЖ=ПРЕДЛ',
+                      #u'ЧАСТИЦА'
+                      ]),
+             (u'заводим', [u'ГЛАГОЛ ВИД=НЕСОВЕРШ ВРЕМЯ=НАСТОЯЩЕЕ ЛИЦО=1 НАКЛОНЕНИЕ=ИЗЪЯВ ПАДЕЖ=ВИН ПАДЕЖ=РОД ПАДЕЖ=ТВОР ЧИСЛО=МН'])
+             ]
+
+    for word, required_tagsets in cases:
+        model_tagsets = list(word2tags[word])
+        if len(model_tagsets) != len(required_tagsets):
+            #for tagset in model_tagsets:
+            #    print(u'DEBUG@112 word={} tagset={}'.format(word, tagset))
+            raise AssertionError(u'word="{}": {} tagset(s) required, {} found'.format(word, len(required_tagsets), len(model_tagsets)))
+
+        for model_tagset in model_tagsets:
+            if model_tagset not in required_tagsets:
+                raise AssertionError(u'Predicted tagset "{}" for word "{}" is not valid'.format(model_tagset, word))
+
+    print('All tests PASSED.')
+
+
+def normalize_word(s):
+    if len(s) > 2 and s[0] == "'" and s[-1] == "'":
+        s = s[1:-1]
+
+    return s.replace(' - ', '-').replace('ё', 'е').strip().lower()
+
+
+ignore_tags = set('ПАДЕЖВАЛ:РОД МОДАЛЬНЫЙ:0 ПЕРЕЧИСЛИМОСТЬ:ДА ПЕРЕХОДНОСТЬ:ПЕРЕХОДНЫЙ ПЕРЕХОДНОСТЬ:НЕПЕРЕХОДНЫЙ ПАДЕЖВАЛ:ТВОР ПАДЕЖВАЛ:ИМ ПАДЕЖВАЛ:ДАТ ПАДЕЖВАЛ:ВИН СГД_ВРЕМЯ:Начать ВОЗВРАТНОСТЬ:0 ВОЗВРАТНОСТЬ:1'.split())
+
+
+def clean_tagset(tagset):
+    return ' '.join(t for t in tagset.split() if t not in ignore_tags).replace(':', '=')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Сборка грамматического словаря')
+    parser.add_argument('--src', type=str, default='../data/word2tags.dat', help='Source grammatical dictionary file path')
+    parser.add_argument('--output', type=str, default='../output/ruword2tags.dat', help='Result dictionary file path')
+    parser.add_argument('--words', type=str, help='List of known words (all dictionary words are included by default)')
+
+    args = parser.parse_args()
+    knownwords_file = args.words
+    word2tags_path = args.src
+    output_file = args.output
+
+    # Build the dictionary from the source data
+
+    known_words = None
+    if knownwords_file is not None:
+        # Load from the given file the list of words that will go into the final model.
+        print('Загружаем список слов для сборки кастомного словаря из {}'.format(knownwords_file))
+        known_words = set()
+        with io.open(knownwords_file, 'r', encoding='utf-8') as rdr:
+            for line in rdr:
+                word = line.replace(chr(65279), '').strip()
+                known_words.add(word.lower())
+        print('Загружено {} слов из {}'.format(len(known_words), knownwords_file))
+
+    word2tagsets = dict()
+    tagset2index = dict()
+    nb_words = 0
+    filter_negative_scores = True
+    print('Loading dictionary from {}'.format(word2tags_path))
+
+    # In the first pass over the word forms, collect the forms that will be ignored because of an assigned
+    # frequency < 0. If all recognition variants of a word have an assigned frequency < 0, such forms are
+    # not filtered out.
+    wordform2max_score = dict()
+    with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
+        for line in rdr:
+            tx = line.replace(chr(65279), '').strip().split('\t')
+            if len(tx) == 5:
+                score = int(tx[4])
+                word = normalize_word(tx[0])
+                wordform2max_score[word] = max(score, wordform2max_score.get(word, -1000000))
+
+    # Main, second pass.
+    with io.open(word2tags_path, 'r', encoding='utf-8') as rdr:
+        for line in rdr:
+            tx = line.replace(chr(65279), '').strip().split('\t')
+            if len(tx) == 5:
+                word = normalize_word(tx[0])
+                if filter_negative_scores and wordform2max_score[word] >= 0 and int(tx[4]) < 0:
+                    # skip forms that are marked as rare or ungrammatical (frequency < 0)
+                    # and for which alternatives with frequency >= 0 exist.
+                    continue
+
+                if known_words is None or word in known_words:
+                    pos = tx[1]
+                    lemma = normalize_word(tx[2])
+                    tags = clean_tagset(tx[3]) if len(tx) == 5 else u''
+
+                    tagset = (pos + ' ' + tags).strip()
+
+                    if tagset not in tagset2index:
+                        tagset2index[tagset] = len(tagset2index)
+
+                    itagset = tagset2index[tagset]
+
+                    if word not in word2tagsets:
+                        word2tagsets[word] = [itagset]
+                    else:
+                        word2tagsets[word].append(itagset)
+
+                    nb_words += 1
+
+    print('Number of wordentries={}'.format(nb_words))
+    print('Number of tagsets={}'.format(len(tagset2index)))
+
+    for word in u'а и у с к'.split():
+        assert(word in word2tagsets)
+
+    ending_lens = [3, 4, 5]
+    processed_words = set()
+    ending2tagsets = dict()
+    all_ending2tagsets = dict()
+
+    for ending_len in ending_lens:
+        print('Start processing ending_len={}'.format(ending_len))
+        e2tagsets = dict()
+        for word, tagsets in word2tagsets.items():
+            if word not in processed_words and len(word) > ending_len:
+                ending = word[-ending_len:]
+                if ending not in e2tagsets:
+                    e2tagsets[ending] = set(tagsets)
+                else:
+                    e2tagsets[ending].update(tagsets)
+
+        all_ending2tagsets.update(e2tagsets)
+        print('Number of distinct endings={}'.format(len(e2tagsets)))
+
+        # Remove endings that produce tag lists with at least one error
+        bad_endings = set()
+        for word, word_tagsets in word2tagsets.items():
+            if word not in processed_words and len(word) > ending_len:
+                ending = word[-ending_len:]
+                ending_tagsets = e2tagsets[ending]
+                if set(word_tagsets) != ending_tagsets:
+                    bad_endings.add(ending)
+
+        print('Number of bad endings={}'.format(len(bad_endings)))
+
+        e2tagsets = dict(filter(lambda z: z[0] not in bad_endings, e2tagsets.items()))
+
+        # Now mark the words that match the remaining good endings.
+        nb_matched_words = 0
+        for word in word2tagsets.keys():
+            if len(word) > ending_len:
+                ending = word[-ending_len:]
|
314 |
+
if ending in e2tagsets:
|
315 |
+
processed_words.add(word)
|
316 |
+
nb_matched_words += 1
|
317 |
+
|
318 |
+
print('nb_matched_words={}'.format(nb_matched_words))
|
319 |
+
|
320 |
+
# Переносим оставшиеся хорошие ��кончания в основной список
|
321 |
+
ending2tagsets.update(e2tagsets)
|
322 |
+
|
323 |
+
print('Number of good endings={}'.format(len(ending2tagsets)))
|
324 |
+
print('Number of all endings={}'.format(len(all_ending2tagsets)))
|
325 |
+
|
326 |
+
print('Building TRIE for {} words...'.format(len(word2tagsets)))
|
327 |
+
trie_words = []
|
328 |
+
for word, word_tagsets in word2tagsets.items():
|
329 |
+
if word not in processed_words:
|
330 |
+
# Слово не было обработано окончаниями.
|
331 |
+
for itagset in word_tagsets:
|
332 |
+
trie_words.append((word, itagset))
|
333 |
+
|
334 |
+
trie_root = create_trie_node('')
|
335 |
+
for word, itagset in trie_words:
|
336 |
+
add_to_trie_node(trie_root, word, itagset)
|
337 |
+
|
338 |
+
print('Number of words in TRIE={}'.format(len(trie_words)))
|
339 |
+
|
340 |
+
index2tagset = dict((i, t) for (t, i) in tagset2index.items())
|
341 |
+
|
342 |
+
trie_tagsets = dict()
|
343 |
+
trie_root = trie_constructed(trie_root, trie_tagsets)
|
344 |
+
|
345 |
+
db_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags.db')
|
346 |
+
print('Writing "{}"...'.format(db_filepath))
|
347 |
+
with sqlite3.connect(db_filepath) as cnx:
|
348 |
+
cursor = cnx.cursor()
|
349 |
+
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='word_tagsets'")
|
350 |
+
if not cursor.fetchone():
|
351 |
+
cnx.execute('CREATE TABLE word_tagsets(word TEXT NOT NULL PRIMARY KEY, id_tagsets INT not null)')
|
352 |
+
else:
|
353 |
+
cnx.execute('DELETE FROM word_tagsets')
|
354 |
+
|
355 |
+
for word, word_tagsets in word2tagsets.items():
|
356 |
+
if word not in processed_words:
|
357 |
+
tagsets2 = tuple(sorted(word_tagsets))
|
358 |
+
id_tagsets = trie_tagsets[tagsets2]
|
359 |
+
cursor.execute("INSERT INTO word_tagsets(word, id_tagsets) VALUES(:word, :tagsets)",
|
360 |
+
{'word': word, 'tagsets': id_tagsets})
|
361 |
+
|
362 |
+
cnx.commit()
|
363 |
+
|
364 |
+
lexicon_data = {'ending_lens': ending_lens,
|
365 |
+
'index2tagset': index2tagset,
|
366 |
+
'ending2tagsets': ending2tagsets,
|
367 |
+
'all_ending2tagsets': all_ending2tagsets,
|
368 |
+
'id2tagsets': dict((id, tagsets) for (tagsets, id) in trie_tagsets.items())
|
369 |
+
}
|
370 |
+
|
371 |
+
print('Writing "{}"...'.format(output_file))
|
372 |
+
with open(output_file, 'wb') as f:
|
373 |
+
pickle.dump(lexicon_data, f, protocol=2)
|
374 |
+
|
375 |
+
trie_filepath = os.path.join(os.path.dirname(output_file), 'ruword2tags_trie.dat')
|
376 |
+
print('Writing "{}"...'.format(trie_filepath))
|
377 |
+
with gzip.open(trie_filepath, 'wb') as f:
|
378 |
+
pickle.dump(trie_root, f)
|
379 |
+
|
380 |
+
#print('Сохранен файл словаря размером {:d} Мб'.format(int(os.path.getsize(output_file)/1000000)))
|
381 |
+
print('All data stored.')
|
382 |
+
|
383 |
+
# Теперь запускаем проверки для построенного словаря
|
384 |
+
run_tests(output_file)
|
385 |
+
|
386 |
+
word2tags = RuWord2Tags()
|
387 |
+
word2tags.load(output_file)
|
388 |
+
|
389 |
+
for word in u'кошки ккошки на'.split():
|
390 |
+
for i, tagset in enumerate(word2tags[word]):
|
391 |
+
print(u'{}[{}] => {}'.format(word, i, tagset))
|
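The tail of ruword2tags.py above packs the dictionary three ways: unambiguous word endings go into ruword2tags.dat, the remaining words into the sqlite table ruword2tags.db, and a gzipped trie into ruword2tags_trie.dat; the script then reloads everything through RuWord2Tags as a smoke test. A minimal usage sketch along the same lines; the command line simply repeats the argparse defaults above, and the assumption that load() finds the companion .db/_trie.dat files next to the .dat path is not verified here:

# Rebuild the dictionary with the defaults shown above:
#   python ruword2tags.py --src ../data/word2tags.dat --output ../output/ruword2tags.dat
from ruword2tags import RuWord2Tags

word2tags = RuWord2Tags()
word2tags.load('../output/ruword2tags.dat')  # companion ruword2tags.db / ruword2tags_trie.dat assumed to sit nearby

for i, tagset in enumerate(word2tags[u'кошки']):
    print(u'кошки[{}] => {}'.format(i, tagset))  # e.g. 'СУЩЕСТВИТЕЛЬНОЕ ОДУШ=ОДУШ ПАДЕЖ=ИМ РОД=ЖЕН ЧИСЛО=МН'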
nn/nn_accent/big.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:47e69d9ae19f2a82e21b1c70f6a4bbfb1abc5759e98b2e67d009c5e9d7af18c9
size 2285217
nn/nn_accent/config.json
ADDED
@@ -0,0 +1,37 @@
{
  "_name_or_path": "onnx_out",
  "architectures": [
    "RoFormerForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.2,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 128,
  "id2label": {
    "0": "NO",
    "1": "STRESS_PRIMARY",
    "2": "STRESS_SECONDARY"
  },
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "label2id": {
    "NO": 0,
    "STRESS_PRIMARY": 1,
    "STRESS_SECONDARY": 2
  },
  "layer_norm_eps": 1e-12,
  "max_length": 40,
  "max_position_embeddings": 60,
  "max_relative_positions": 60,
  "model_type": "roformer",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "relative_attention": true,
  "rotary_value": false,
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 45
}
nn/nn_accent/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e393144e45626f6f1062a0784ef06f921b97321a8e7b87ac2a09a892286500a
size 803402
nn/nn_accent/ort_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "one_external_file": true,
  "opset": null,
  "optimization": {},
  "optimum_version": "1.8.5",
  "quantization": {
    "activations_dtype": "QUInt8",
    "activations_symmetric": false,
    "format": "QOperator",
    "is_static": false,
    "mode": "IntegerOps",
    "nodes_to_exclude": [],
    "nodes_to_quantize": [],
    "operators_to_quantize": [
      "MatMul",
      "Add"
    ],
    "per_channel": false,
    "qdq_add_pair_to_weight": false,
    "qdq_dedicated_pair": false,
    "qdq_op_type_per_channel_support_to_axis": {
      "MatMul": 1
    },
    "reduce_range": false,
    "weights_dtype": "QInt8",
    "weights_symmetric": true
  },
  "transformers_version": "4.29.2",
  "use_external_data_format": false
}
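ort_config.json records how model.onnx was shrunk: dynamic (is_static: false) QOperator quantization of MatMul/Add nodes with int8 weights and uint8 activations, produced with optimum 1.8.5. A hedged sketch of how such a file is typically generated with optimum's ORTQuantizer; treating big.onnx as the fp32 export and the avx2 preset as the matching configuration are assumptions, not something stated in this upload:

# Sketch only: dynamic int8 quantization roughly matching the settings recorded above.
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)        # dynamic, per-tensor
quantizer = ORTQuantizer.from_pretrained('nn/nn_accent', file_name='big.onnx')   # assumed fp32 source model
quantizer.quantize(save_dir='nn/nn_accent', quantization_config=qconfig)         # writes quantized model + ort_config.json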
nn/nn_accent/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "[bos]",
  "eos_token": "[eos]",
  "pad_token": "[pad]",
  "unk_token": "[unk]"
}
nn/nn_accent/tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "bos_token": "[bos]",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "[eos]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[pad]",
  "tokenizer_class": "CharTokenizer",
  "unk_token": "[unk]"
}
nn/nn_accent/vocab.txt
ADDED
@@ -0,0 +1,45 @@
[pad]
[unk]
[bos]
[eos]
'
-
.
?
`
c
e
́
а
б
в
г
д
е
ж
з
и
й
к
л
м
н
о
п
р
с
т
у
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
ё
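Taken together, config.json, tokenizer_config.json and vocab.txt describe a character-level RoFormer token classifier: every letter of a word is one token, and each position is labelled NO / STRESS_PRIMARY / STRESS_SECONDARY. A minimal sketch of querying the quantized model.onnx with onnxruntime; the [bos]/[eos] framing and the input tensor names (input_ids, attention_mask, token_type_ids) are assumptions about the exported graph, not facts taken from this upload:

import numpy as np
import onnxruntime as ort

# char -> id map from vocab.txt (one symbol per line, ids start at 0)
with open('nn/nn_accent/vocab.txt', encoding='utf-8') as f:
    vocab = {line.rstrip('\n'): i for i, line in enumerate(f)}

word = 'кошки'
ids = [vocab['[bos]']] + [vocab.get(c, vocab['[unk]']) for c in word.lower()] + [vocab['[eos]']]

sess = ort.InferenceSession('nn/nn_accent/model.onnx')
feeds = {
    'input_ids': np.array([ids], dtype=np.int64),
    'attention_mask': np.ones((1, len(ids)), dtype=np.int64),
    'token_type_ids': np.zeros((1, len(ids)), dtype=np.int64),
}
logits = sess.run(None, feeds)[0][0]        # (seq_len, 3), labels per config.json id2label
stress_pos = int(logits[1:-1, 1].argmax())  # most likely STRESS_PRIMARY position among the letters
print(word, '-> stressed letter index', stress_pos)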
nn/nn_omograph/big_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 120139,
  "<w>": 120138
}
nn/nn_omograph/big_poetry/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "rubert_base/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 120140
}
nn/nn_omograph/big_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7d1d58e5ad908f4187d3c44f640106b721e293ec954c9c4603abc25ba5f7e8a
size 713508364
nn/nn_omograph/big_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/big_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/big_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
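big_poetry is rubert-base fine-tuned as a sequence classifier for homograph (omograph) resolution, with two extra markers <w> and </w> appended to the vocabulary (added_tokens.json). A plausible reading is that the target homograph is wrapped in those markers before classification; that convention, the ONNX input names, and the meaning of the output classes are assumptions here rather than documented facts. A minimal sketch with transformers + onnxruntime:

import onnxruntime as ort
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('nn/nn_omograph/big_poetry')
sess = ort.InferenceSession('nn/nn_omograph/big_poetry/model.onnx')

text = 'Я повесил на дверь новый <w>замок</w>.'   # hypothetical input with the marked homograph
enc = tok(text, return_tensors='np')
feeds = {k: enc[k].astype('int64')               # assumed input names of the exported graph
         for k in ('input_ids', 'attention_mask', 'token_type_ids')}
logits = sess.run(None, feeds)[0]
print(logits.argmax(-1))  # index of the predicted variant; the label inventory is not part of this upload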
nn/nn_omograph/big_poetry/vocab.txt
ADDED
The diff for this file is too large to render.
nn/nn_omograph/medium_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 64001,
  "<w>": 64000
}
nn/nn_omograph/medium_poetry/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "SRUElectra-medium/checkpoint-4500000/",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 576,
  "generator_size": "0.25",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 576,
  "initializer_range": 0.02,
  "intermediate_size": 2304,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 9,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64002
}
nn/nn_omograph/medium_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:689752e4bff9eb0b8837482d9ea724f72356aab19822c2e4ae3de6b5a2fc08b1
size 341725861
nn/nn_omograph/medium_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/medium_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/medium_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "ElectraTokenizer",
  "unk_token": "[UNK]"
}
nn/nn_omograph/medium_poetry/vocab.txt
ADDED
The diff for this file is too large to render.
nn/nn_omograph/small_poetry/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 30523,
  "<w>": 30522
}
nn/nn_omograph/small_poetry/config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "output/checkpoint-440000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 264,
  "dropout": 0.1,
  "hidden_dim": 792,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 3,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.29.2",
  "vocab_size": 30524
}
nn/nn_omograph/small_poetry/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcea1b8d8c164276d2e593d53261ca3c21d6fc9fed4f04abb8f69e2b95ba842d
size 41532079
nn/nn_omograph/small_poetry/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
nn/nn_omograph/small_poetry/tokenizer.json
ADDED
The diff for this file is too large to render.
nn/nn_omograph/small_poetry/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "DistilBertTokenizer",
  "unk_token": "[UNK]"
}
nn/nn_omograph/small_poetry/vocab.txt
ADDED
Binary file (382 kB).
nn/nn_omograph/turbo/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "</w>": 50257,
  "<w>": 50256
}
nn/nn_omograph/turbo/config.json
ADDED
@@ -0,0 +1,28 @@
{
  "_name_or_path": "rudeberta_distilled/checkpoint-220000/",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.28.1",
  "type_vocab_size": 0,
  "vocab_size": 50258
}
nn/nn_omograph/turbo/merges.txt
ADDED
The diff for this file is too large to render.