Sam Passaglia committed
Commit 9aba307
1 Parent(s): f73b6d4

initial commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +178 -0
  2. config/__pycache__/config.cpython-310.pyc +0 -0
  3. config/config.py +102 -0
  4. config/dbert-train-args.json +21 -0
  5. config/heteronyms.json +559 -0
  6. config/heteronyms_Sato2022.json +211 -0
  7. pyproject.toml +65 -0
  8. requirements.txt +25 -0
  9. robot_reading.png +0 -0
  10. stores/dbert/added_tokens.json +64 -0
  11. stores/dbert/config.json +634 -0
  12. stores/dbert/heteronyms.json +567 -0
  13. stores/dbert/label_encoder.json +306 -0
  14. stores/dbert/pytorch_model.bin +3 -0
  15. stores/dbert/special_tokens_map.json +7 -0
  16. stores/dbert/tokenizer_config.json +22 -0
  17. stores/dbert/training_args.bin +3 -0
  18. stores/dbert/training_performance.json +0 -0
  19. stores/dbert/vocab.txt +0 -0
  20. yomikata/__init__.py +0 -0
  21. yomikata/__pycache__/__init__.cpython-310.pyc +0 -0
  22. yomikata/__pycache__/dbert.cpython-310.pyc +0 -0
  23. yomikata/__pycache__/dictionary.cpython-310.pyc +0 -0
  24. yomikata/__pycache__/evaluate.cpython-310.pyc +0 -0
  25. yomikata/__pycache__/main.cpython-310.pyc +0 -0
  26. yomikata/__pycache__/reader.cpython-310.pyc +0 -0
  27. yomikata/__pycache__/t5.cpython-310.pyc +0 -0
  28. yomikata/__pycache__/utils.cpython-310.pyc +0 -0
  29. yomikata/dataset/__init__.py +0 -0
  30. yomikata/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  31. yomikata/dataset/__pycache__/aozora.cpython-310.pyc +0 -0
  32. yomikata/dataset/__pycache__/bccwj.cpython-310.pyc +0 -0
  33. yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc +0 -0
  34. yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc +0 -0
  35. yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc +0 -0
  36. yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc +0 -0
  37. yomikata/dataset/__pycache__/split.cpython-310.pyc +0 -0
  38. yomikata/dataset/__pycache__/sudachi.cpython-310.pyc +0 -0
  39. yomikata/dataset/__pycache__/unidic.cpython-310.pyc +0 -0
  40. yomikata/dataset/aozora.py +117 -0
  41. yomikata/dataset/bccwj.py +206 -0
  42. yomikata/dataset/kwdlc.py +109 -0
  43. yomikata/dataset/ndlbib.py +46 -0
  44. yomikata/dataset/pronunciations.py +57 -0
  45. yomikata/dataset/repair_long_vowels.py +62 -0
  46. yomikata/dataset/split.py +271 -0
  47. yomikata/dataset/sudachi.py +50 -0
  48. yomikata/dataset/unidic.py +44 -0
  49. yomikata/dbert.py +414 -0
  50. yomikata/dictionary.py +99 -0
app.py ADDED
@@ -0,0 +1,178 @@

"""app.py
streamlit demo of yomikata"""
import pandas as pd
import spacy
import streamlit as st
from speach import ttlig

from yomikata import utils
from yomikata.dictionary import Dictionary
from yomikata.utils import parse_furigana
from pathlib import Path


@st.cache_data
def add_border(html: str):
    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
    html = html.replace("\n", " ")
    return WRAPPER.format(html)


def get_random_sentence():
    from config.config import TEST_DATA_DIR

    df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
    return df.sample(1).iloc[0].sentence


@st.cache_data
def get_dbert_prediction_and_heteronym_list(text):
    from yomikata.dbert import dBert

    reader = dBert()
    return reader.furigana(text), reader.heteronyms


@st.cache_data
def get_stats():
    from config import config
    from yomikata.utils import load_dict

    stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))

    global_accuracy = stats['test']['accuracy']

    stats = stats['test']['heteronym_performance']
    heteronyms = stats.keys()

    accuracy = [stats[heteronym]['accuracy'] for heteronym in heteronyms]

    readings = [
        "、".join(
            [
                "{reading} ({correct}/{n})".format(
                    reading=reading,
                    correct=stats[heteronym]['readings'][reading]['found'][reading],
                    n=stats[heteronym]['readings'][reading]['n'],
                )
                for reading in stats[heteronym]['readings'].keys()
                if (
                    stats[heteronym]['readings'][reading]['found'][reading] != 0
                    or reading != '<OTHER>'
                )
            ]
        )
        for heteronym in heteronyms
    ]

    df = pd.DataFrame({'heteronym': heteronyms, 'accuracy': accuracy, 'readings': readings})

    df = df[df['readings'].str.contains('、')]

    df['readings'] = df['readings'].str.replace('<OTHER>', 'Other')

    df = df.rename(columns={'readings': 'readings (test corr./total)'})

    df = df.sort_values('accuracy', ascending=False, ignore_index=True)

    df.index += 1

    return global_accuracy, df


@st.cache_data
def furigana_to_spacy(text_with_furigana):
    tokens = parse_furigana(text_with_furigana)
    ents = []
    output_text = ""
    heteronym_count = 0
    for token in tokens.groups:
        if isinstance(token, ttlig.RubyFrag):
            if heteronym_count != 0:
                output_text += ", "

            ents.append(
                {
                    "start": len(output_text),
                    "end": len(output_text) + len(token.text),
                    "label": token.furi,
                }
            )

            output_text += token.text
            heteronym_count += 1
        else:
            pass
    return {
        "text": output_text,
        "ents": ents,
        "title": None,
    }


st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")

# Input text box
st.markdown("Input a Japanese sentence:")

if "default_sentence" not in st.session_state:
    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"

input_text = st.text_area(
    "Input a Japanese sentence:",
    utils.remove_furigana(st.session_state.default_sentence),
    label_visibility="collapsed",
)

# Yomikata prediction
dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)

# spacy-style output for the predictions
colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
spacy_dict = furigana_to_spacy(dbert_prediction)
label_colors = {
    reading: colors[i % len(colors)]
    for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
}
html = spacy.displacy.render(
    spacy_dict, style="ent", manual=True, options={"colors": label_colors}
)

if len(spacy_dict["ents"]) > 0:
    st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
    st.write(
        f"{add_border(html)}",
        unsafe_allow_html=True,
    )
else:
    st.markdown("**Yomikata** found no heteronyms in the input text.")

# Dictionary + Yomikata prediction
st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
dictionary = st.radio(
    "It can be coupled with a dictionary",
    ("sudachi", "unidic", "ipadic", "juman"),
    horizontal=True,
    label_visibility="collapsed",
)

dictreader = Dictionary(dictionary)
dictionary_prediction = dictreader.furigana(dbert_prediction)
html = parse_furigana(dictionary_prediction).to_html()
st.write(
    f"{add_border(html)}",
    unsafe_allow_html=True,
)

# Dictionary alone prediction
if len(spacy_dict["ents"]) > 0:
    dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
    html = parse_furigana(dictionary_prediction).to_html()
    st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
    st.write(
        f"{add_border(html)}",
        unsafe_allow_html=True,
    )

# Randomize button
if st.button("🎲 Randomize the input sentence"):
    st.session_state.default_sentence = get_random_sentence()
    st.experimental_rerun()

# Stats section
global_accuracy, stats_df = get_stats()

st.subheader(f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}")

st.dataframe(stats_df)

# Hide the footer
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
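The objects the demo builds can also be driven directly from Python, outside Streamlit. The sketch below sticks to the calls app.py itself makes (dBert(), reader.furigana(), Dictionary(...).furigana()); the example sentence and the print statements are illustrative assumptions, not part of this commit.

    # Minimal programmatic use of the same objects app.py builds (sketch).
    from yomikata.dbert import dBert
    from yomikata.dictionary import Dictionary

    reader = dBert()  # loads the fine-tuned BERT heteronym disambiguator from stores/dbert
    text = "そこの角を曲がってください。"  # assumed example input
    disambiguated = reader.furigana(text)          # furigana only on recognized heteronyms
    full = Dictionary("sudachi").furigana(disambiguated)  # dictionary fills in the remaining readings
    print(disambiguated)
    print(full)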
config/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.95 kB).
 
config/config.py ADDED
@@ -0,0 +1,102 @@

# config.py

import json
import logging.config
import sys
from pathlib import Path

import mlflow
from rich.logging import RichHandler

# Base and Config Directories
BASE_DIR = Path(__file__).parent.parent.absolute()
CONFIG_DIR = Path(BASE_DIR, "config")

# Data Directories
RAW_DATA_DIR = Path(BASE_DIR, "raw_data")
SENTENCE_DATA_DIR = Path(BASE_DIR, "sentence_data")
TRAIN_DATA_DIR = Path(SENTENCE_DATA_DIR, "train")
VAL_DATA_DIR = Path(SENTENCE_DATA_DIR, "val")
TEST_DATA_DIR = Path(SENTENCE_DATA_DIR, "test")
READING_DATA_DIR = Path(BASE_DIR, "reading_data")

# Logs Directory
LOGS_DIR = Path(BASE_DIR, "logs")

# Model Storage Directory
STORES_DIR = Path(BASE_DIR, "stores")
RUN_REGISTRY = Path(STORES_DIR, "runs")

# Create dirs
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
SENTENCE_DATA_DIR.mkdir(parents=True, exist_ok=True)
TRAIN_DATA_DIR.mkdir(parents=True, exist_ok=True)
VAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
TEST_DATA_DIR.mkdir(parents=True, exist_ok=True)
READING_DATA_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)
STORES_DIR.mkdir(parents=True, exist_ok=True)
RUN_REGISTRY.mkdir(parents=True, exist_ok=True)

# Special tokens reserved
ASCII_SPACE_TOKEN = "\U0000FFFF"  # replaces ordinary space characters before text is sent to MeCab, since MeCab treats the space as a word separator

# Seed
SEED = 1271297

# Training parameters
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
assert TRAIN_SIZE + VAL_SIZE + TEST_SIZE == 1

# Heteronym list
with open(Path(CONFIG_DIR, "heteronyms.json")) as fp:
    HETERONYMS = json.load(fp)

# MLFlow model registry
mlflow.set_tracking_uri("file://" + str(RUN_REGISTRY.absolute()))

# Logger
logging_config = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "minimal": {"format": "%(message)s"},
        "detailed": {
            "format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "stream": sys.stdout,
            "formatter": "minimal",
            "level": logging.DEBUG,
        },
        "info": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "info.log"),
            "maxBytes": 10485760,  # 10 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.INFO,
        },
        "error": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "error.log"),
            "maxBytes": 10485760,  # 10 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.ERROR,
        },
    },
    "root": {
        "handlers": ["console", "info", "error"],
        "level": logging.INFO,
        "propagate": True,
    },
}
logging.config.dictConfig(logging_config)
logger = logging.getLogger()
logger.handlers[0] = RichHandler(markup=True)
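Note that config.py does real work at import time: it creates the data, log, and store directories, reconfigures the root logger, and points MLflow's tracking URI at stores/runs. Consumers such as app.py simply import it and read the constants; a minimal usage sketch (the paths in comments are what the definitions above would resolve to, relative to the repository root):

    from config import config  # import triggers the mkdir calls, logging setup, and MLflow tracking URI

    print(config.TEST_DATA_DIR)  # <repo>/sentence_data/test
    print(config.STORES_DIR)     # <repo>/stores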
config/dbert-train-args.json ADDED
@@ -0,0 +1,21 @@

{
  "model": "dBert",
  "dataset": "optimized_strict_heteronyms",
  "experiment": "train-dBert",
  "run": "test",
  "num_train_epochs": 10,
  "evaluation_strategy": "steps",
  "eval_steps": 300,
  "logging_strategy": "steps",
  "logging_steps": 300,
  "save_strategy": "steps",
  "save_steps": 300,
  "learning_rate": 2e-5,
  "per_device_train_batch_size": 128,
  "per_device_eval_batch_size": 128,
  "load_best_model_at_end": true,
  "metric_for_best_model": "loss",
  "weight_decay": 0.01,
  "save_total_limit": 5,
  "report_to": "mlflow"
}
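The first four keys (model, dataset, experiment, run) label the yomikata training run; the remaining keys match transformers.TrainingArguments field names. The training script that consumes this file is not among the 50 files shown here, so the following is only a sketch of one plausible way to load it, assuming the non-run keys pass straight through to TrainingArguments and that output_dir is chosen by the caller:

    import json
    from transformers import TrainingArguments

    with open("config/dbert-train-args.json") as fp:
        args = json.load(fp)

    # yomikata-specific bookkeeping, not TrainingArguments fields (assumed split)
    run_info = {key: args.pop(key) for key in ("model", "dataset", "experiment", "run")}

    # output_dir is an assumption; everything else comes straight from the JSON
    training_args = TrainingArguments(output_dir=f"runs/{run_info['run']}", **args)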
config/heteronyms.json ADDED
@@ -0,0 +1,559 @@
1
+ {
2
+ "表": {
3
+ "ひょう": 3349,
4
+ "おもて": 3034,
5
+ "あらわ": 2474,
6
+ "あら": 731
7
+ },
8
+ "角": {
9
+ "かく": 4360,
10
+ "かど": 2303,
11
+ "つの": 372,
12
+ "すみ": 70
13
+ },
14
+ "大分": {
15
+ "おおいた": 3358,
16
+ "だいぶ": 797,
17
+ "だいぶん": 97
18
+ },
19
+ "国立": {
20
+ "こくりつ": 19256,
21
+ "くにたち": 246
22
+ },
23
+ "人気": {
24
+ "にんき": 7383,
25
+ "ひとけ": 149,
26
+ "じんき": 44
27
+ },
28
+ "市場": {
29
+ "しじょう": 85107,
30
+ "いちば": 781
31
+ },
32
+ "気質": {
33
+ "きしつ": 1108,
34
+ "かたぎ": 398
35
+ },
36
+ "上方": {
37
+ "かみがた": 1411,
38
+ "じょうほう": 656
39
+ },
40
+ "上手": {
41
+ "じょうず": 8065,
42
+ "うま": 706,
43
+ "かみて": 150,
44
+ "うわて": 57
45
+ },
46
+ "下手": {
47
+ "へた": 849,
48
+ "したて": 128,
49
+ "べた": 121,
50
+ "しもて": 50
51
+ },
52
+ "仮名": {
53
+ "かな": 1407,
54
+ "がな": 129,
55
+ "かめい": 115
56
+ },
57
+ "礼拝": {
58
+ "れいはい": 841,
59
+ "らいはい": 62
60
+ },
61
+ "遺言": {
62
+ "ゆいごん": 3152,
63
+ "いげん": 67,
64
+ "いごん": 57
65
+ },
66
+ "口腔": {
67
+ "こうこう": 6475,
68
+ "こうくう": 5577
69
+ },
70
+ "骨": {
71
+ "ほね": 10697,
72
+ "こつ": 5870
73
+ },
74
+ "一途": {
75
+ "いちず": 576,
76
+ "いっと": 139
77
+ },
78
+ "一言": {
79
+ "ひとこと": 2567,
80
+ "いちげん": 133,
81
+ "いちごん": 106
82
+ },
83
+ "最中": {
84
+ "さいちゅう": 520,
85
+ "さなか": 43
86
+ },
87
+ "一目": {
88
+ "ひとめ": 1596,
89
+ "いちもく": 210
90
+ },
91
+ "係": {
92
+ "かか": 14218,
93
+ "かかわ": 9804,
94
+ "がかり": 234,
95
+ "かかり": 227
96
+ },
97
+ "足跡": {
98
+ "あしあと": 2626,
99
+ "そくせき": 1862
100
+ },
101
+ "今日": {
102
+ "きょう": 17624,
103
+ "こんにち": 6772
104
+ },
105
+ "明日": {
106
+ "あす": 9824,
107
+ "あした": 6606,
108
+ "みょうにち": 66
109
+ },
110
+ "生物": {
111
+ "せいぶつ": 26088,
112
+ "いきもの": 55
113
+ },
114
+ "変化": {
115
+ "へんか": 87895,
116
+ "へんげ": 337
117
+ },
118
+ "大事": {
119
+ "だいじ": 5293,
120
+ "おおごと": 54
121
+ },
122
+ "大家": {
123
+ "たいか": 586,
124
+ "おおや": 238,
125
+ "たいけ": 79
126
+ },
127
+ "心中": {
128
+ "しんじゅう": 1541,
129
+ "しんちゅう": 250,
130
+ "しんぢゅう": 127
131
+ },
132
+ "一行": {
133
+ "いっこう": 1112,
134
+ "いちぎょう": 95
135
+ },
136
+ "一時": {
137
+ "いちじ": 2649,
138
+ "いっとき": 381,
139
+ "いちどき": 47
140
+ },
141
+ "一方": {
142
+ "いっぽう": 5327,
143
+ "ひとかた": 112,
144
+ "いちほう": 42
145
+ },
146
+ "一夜": {
147
+ "いちや": 1148,
148
+ "ひとよ": 82
149
+ },
150
+ "下野": {
151
+ "しもつけ": 530,
152
+ "げや": 104,
153
+ "しもの": 57
154
+ },
155
+ "花弁": {
156
+ "かべん": 213,
157
+ "はなびら": 58
158
+ },
159
+ "玩具": {
160
+ "がんぐ": 1354,
161
+ "おもちゃ": 238
162
+ },
163
+ "強力": {
164
+ "きょうりょく": 2319,
165
+ "ごうりき": 51
166
+ },
167
+ "金色": {
168
+ "きんいろ": 942,
169
+ "こんじき": 484
170
+ },
171
+ "経緯": {
172
+ "けいい": 7659,
173
+ "いきさつ": 56
174
+ },
175
+ "故郷": {
176
+ "こきょう": 3840,
177
+ "ふるさと": 506,
178
+ "くに": 122
179
+ },
180
+ "紅葉": {
181
+ "こうよう": 856,
182
+ "もみじ": 339
183
+ },
184
+ "根本": {
185
+ "こんぽん": 2872,
186
+ "ねもと": 262
187
+ },
188
+ "山陰": {
189
+ "さんいん": 2094,
190
+ "やまかげ": 51
191
+ },
192
+ "上下": {
193
+ "じょうげ": 1549,
194
+ "うえした": 97
195
+ },
196
+ "身体": {
197
+ "しんたい": 20301,
198
+ "からだ": 3375
199
+ },
200
+ "水面": {
201
+ "すいめん": 1387,
202
+ "みなも": 91
203
+ },
204
+ "世論": {
205
+ "よろん": 4554,
206
+ "せろん": 1934
207
+ },
208
+ "清水": {
209
+ "しみず": 4114,
210
+ "きよみず": 98
211
+ },
212
+ "大手": {
213
+ "おおて": 6695,
214
+ "おおで": 119
215
+ },
216
+ "大人": {
217
+ "おとな": 11037,
218
+ "たいじん": 113,
219
+ "うし": 59
220
+ },
221
+ "大勢": {
222
+ "おおぜい": 1290,
223
+ "たいせい": 398
224
+ },
225
+ "中間": {
226
+ "ちゅうかん": 17669,
227
+ "ちゅうげん": 144
228
+ },
229
+ "日向": {
230
+ "ひゅうが": 800,
231
+ "ひなた": 318
232
+ },
233
+ "夫婦": {
234
+ "ふうふ": 9165,
235
+ "めおと": 354
236
+ },
237
+ "牧場": {
238
+ "ぼくじょう": 1913,
239
+ "まきば": 159
240
+ },
241
+ "末期": {
242
+ "まっき": 3569,
243
+ "まつご": 78
244
+ },
245
+ "利益": {
246
+ "りえき": 13434,
247
+ "りやく": 209
248
+ },
249
+ "一味": {
250
+ "いちみ": 442,
251
+ "ひとあじ": 60
252
+ },
253
+ "魚": {
254
+ "さかな": 5857,
255
+ "うお": 1706,
256
+ "ぎょ": 413,
257
+ "ざかな": 50
258
+ },
259
+ "施行": {
260
+ "しこう": 18724,
261
+ "せこう": 70
262
+ },
263
+ "施工": {
264
+ "せこう": 25734,
265
+ "しこう": 48,
266
+ "せこ": 43
267
+ },
268
+ "転生": {
269
+ "てんせい": 911,
270
+ "てんしょう": 175
271
+ },
272
+ "博士": {
273
+ "はくし": 17017,
274
+ "はかせ": 2462
275
+ },
276
+ "眼鏡": {
277
+ "めがね": 2040,
278
+ "がんきょう": 102
279
+ },
280
+ "文字": {
281
+ "もじ": 9583,
282
+ "もんじ": 633
283
+ },
284
+ "文書": {
285
+ "ぶんしょ": 15094,
286
+ "もんじょ": 5879,
287
+ "もんしょ": 51
288
+ },
289
+ "現世": {
290
+ "げんせい": 192,
291
+ "げんせ": 125
292
+ },
293
+ "日中": {
294
+ "にっちゅう": 12478,
295
+ "にちじゅう": 117
296
+ },
297
+ "夜中": {
298
+ "よなか": 723,
299
+ "やちゅう": 106
300
+ },
301
+ "二人": {
302
+ "ふたり": 22151,
303
+ "ににん": 256
304
+ },
305
+ "見物": {
306
+ "けんぶつ": 1832,
307
+ "みもの": 61
308
+ },
309
+ "清浄": {
310
+ "せいじょう": 800,
311
+ "しょうじょう": 46
312
+ },
313
+ "谷間": {
314
+ "たにま": 1089,
315
+ "たにあい": 67
316
+ },
317
+ "追従": {
318
+ "ついじゅう": 1000,
319
+ "ついしょう": 73
320
+ },
321
+ "墓石": {
322
+ "はかいし": 323,
323
+ "ぼせき": 257
324
+ },
325
+ "漢書": {
326
+ "かんじょ": 171,
327
+ "かんしょ": 66,
328
+ "からぶみ": 47
329
+ },
330
+ "作法": {
331
+ "さほう": 3905,
332
+ "さくほう": 427
333
+ },
334
+ "半月": {
335
+ "はんつき": 388,
336
+ "はんげつ": 85
337
+ },
338
+ "黒子": {
339
+ "ほくろ": 200,
340
+ "くろこ": 183
341
+ },
342
+ "競売": {
343
+ "けいばい": 937,
344
+ "きょうばい": 332
345
+ },
346
+ "開眼": {
347
+ "かいげん": 338,
348
+ "かいがん": 144
349
+ },
350
+ "求道": {
351
+ "きゅうどう": 379,
352
+ "ぐどう": 81
353
+ },
354
+ "施業": {
355
+ "せぎょう": 602,
356
+ "しぎょう": 264
357
+ },
358
+ "借家": {
359
+ "しゃっか": 505,
360
+ "しゃくや": 394
361
+ },
362
+ "法衣": {
363
+ "ころも": 115,
364
+ "ほうえ": 87
365
+ },
366
+ "昨日": {
367
+ "きのう": 2670,
368
+ "さくじつ": 713
369
+ },
370
+ "風車": {
371
+ "ふうしゃ": 1133,
372
+ "かざぐるま": 678
373
+ },
374
+ "寒気": {
375
+ "かんき": 153,
376
+ "さむけ": 79
377
+ },
378
+ "背筋": {
379
+ "せすじ": 177,
380
+ "はいきん": 43
381
+ },
382
+ "逆手": {
383
+ "さかて": 169,
384
+ "ぎゃくて": 116
385
+ },
386
+ "生花": {
387
+ "いけばな": 283,
388
+ "せいか": 91
389
+ },
390
+ "白髪": {
391
+ "しらが": 313,
392
+ "はくはつ": 113
393
+ },
394
+ "一月": {
395
+ "ひとつき": 301,
396
+ "いちがつ": 282
397
+ },
398
+ "一寸": {
399
+ "ちょっと": 1481,
400
+ "いっすん": 111
401
+ },
402
+ "一声": {
403
+ "ひとこえ": 253,
404
+ "いっせい": 109
405
+ },
406
+ "一日": {
407
+ "いちにち": 1711,
408
+ "ついたち": 866,
409
+ "いちじつ": 41
410
+ },
411
+ "一分": {
412
+ "いちぶん": 75,
413
+ "いちぶ": 62
414
+ },
415
+ "一文": {
416
+ "いちもん": 86,
417
+ "いちぶん": 48
418
+ },
419
+ "何時": {
420
+ "いつ": 1248,
421
+ "なんじ": 159,
422
+ "なんどき": 63
423
+ },
424
+ "何分": {
425
+ "なにぶん": 379,
426
+ "なんぷん": 51
427
+ },
428
+ "気骨": {
429
+ "きこつ": 140,
430
+ "きぼね": 67
431
+ },
432
+ "銀杏": {
433
+ "いちょう": 322,
434
+ "ぎんなん": 85
435
+ },
436
+ "細々": {
437
+ "こまごま": 88,
438
+ "ほそぼそ": 67
439
+ },
440
+ "細目": {
441
+ "さいもく": 962,
442
+ "ほそめ": 123
443
+ },
444
+ "疾風": {
445
+ "しっぷう": 544,
446
+ "はやて": 94,
447
+ "かぜ": 68
448
+ },
449
+ "菖蒲": {
450
+ "しょうぶ": 165,
451
+ "あやめ": 65
452
+ },
453
+ "船底": {
454
+ "せんてい": 246,
455
+ "ふなぞこ": 80
456
+ },
457
+ "相乗": {
458
+ "そうじょう": 732,
459
+ "あいの": 89
460
+ },
461
+ "造作": {
462
+ "ぞうさ": 188,
463
+ "ぞうさく": 65
464
+ },
465
+ "頭数": {
466
+ "あたまかず": 168,
467
+ "とうすう": 119
468
+ },
469
+ "二重": {
470
+ "にじゅう": 5418,
471
+ "ふたえ": 65
472
+ },
473
+ "日暮": {
474
+ "ひぐ": 403,
475
+ "ひぐれ": 97,
476
+ "ひぐらし": 81
477
+ },
478
+ "梅雨": {
479
+ "つゆ": 471,
480
+ "ばいう": 284
481
+ },
482
+ "風穴": {
483
+ "かざあな": 300,
484
+ "ふうけつ": 68
485
+ },
486
+ "分別": {
487
+ "ふんべつ": 1280,
488
+ "ぶんべつ": 635
489
+ },
490
+ "夜話": {
491
+ "やわ": 2153,
492
+ "よばなし": 52
493
+ },
494
+ "野兎": {
495
+ "やと": 176,
496
+ "のうさぎ": 43
497
+ },
498
+ "冷水": {
499
+ "れいすい": 189,
500
+ "ひやみず": 153
501
+ },
502
+ "連中": {
503
+ "れんじゅう": 853,
504
+ "れんちゅう": 691
505
+ },
506
+ "飛沫": {
507
+ "ひまつ": 223,
508
+ "しぶき": 96
509
+ },
510
+ "翡翠": {
511
+ "ひすい": 177,
512
+ "かわせみ": 94
513
+ },
514
+ "一昨日": {
515
+ "おととい": 208,
516
+ "いっさくじつ": 71
517
+ },
518
+ "一昨年": {
519
+ "おととし": 72,
520
+ "いっさくねん": 59
521
+ },
522
+ "十八番": {
523
+ "じゅうはちばん": 212,
524
+ "おはこ": 41
525
+ },
526
+ "明後日": {
527
+ "あさって": 186,
528
+ "みょうごにち": 60
529
+ },
530
+ "石綿": {
531
+ "いしわた": 1702,
532
+ "せきめん": 360
533
+ },
534
+ "公文": {
535
+ "こうぶん": 196,
536
+ "くもん": 46
537
+ },
538
+ "読本": {
539
+ "どくほん": 12176,
540
+ "とくほん": 2414,
541
+ "よみほん": 121
542
+ },
543
+ "古本": {
544
+ "ふるほん": 550,
545
+ "こほん": 109
546
+ },
547
+ "町家": {
548
+ "まちや": 655,
549
+ "ちょうか": 216
550
+ },
551
+ "米": {
552
+ "べい": 17392,
553
+ "こめ": 9021,
554
+ "まい": 2829,
555
+ "よね": 620,
556
+ "ごめ": 164,
557
+ "めーとる": 112
558
+ }
559
+ }
config/heteronyms_Sato2022.json ADDED
@@ -0,0 +1,211 @@
1
+ {
2
+ "heteronyms_in_bert": {
3
+ "表": 2,
4
+ "角": 4,
5
+ "大分": 2,
6
+ "国立": 2,
7
+ "人気": 3,
8
+ "市場": 2,
9
+ "気質": 2,
10
+ "役所": 2,
11
+ "上方": 2,
12
+ "上手": 3,
13
+ "下手": 3,
14
+ "人事": 2,
15
+ "金星": 2,
16
+ "仮名": 2,
17
+ "内面": 2,
18
+ "礼拝": 2,
19
+ "遺言": 3,
20
+ "口腔": 2,
21
+ "後世": 2,
22
+ "骨": 2,
23
+ "一途": 2,
24
+ "一言": 3,
25
+ "最中": 3,
26
+ "一目": 2,
27
+ "係": 3,
28
+ "足跡": 2,
29
+ "今日": 2,
30
+ "明日": 3,
31
+ "生物": 3,
32
+ "変化": 2,
33
+ "大事": 2,
34
+ "水車": 2,
35
+ "一見": 2,
36
+ "一端": 2,
37
+ "大家": 3,
38
+ "心中": 2,
39
+ "書物": 2,
40
+ "一角": 2,
41
+ "一行": 3,
42
+ "一時": 3,
43
+ "一定": 2,
44
+ "一方": 2,
45
+ "一夜": 2,
46
+ "下野": 3,
47
+ "化学": 2,
48
+ "火口": 2,
49
+ "花弁": 2,
50
+ "玩具": 2,
51
+ "強力": 3,
52
+ "金色": 2,
53
+ "経緯": 2,
54
+ "故郷": 2,
55
+ "紅葉": 2,
56
+ "行方": 3,
57
+ "根本": 2,
58
+ "左右": 3,
59
+ "山陰": 2,
60
+ "十分": 2,
61
+ "上下": 5,
62
+ "身体": 2,
63
+ "水面": 2,
64
+ "世論": 2,
65
+ "清水": 3,
66
+ "大手": 2,
67
+ "大人": 4,
68
+ "大勢": 3,
69
+ "中間": 5,
70
+ "日向": 42,
71
+ "日時": 3,
72
+ "夫婦": 2,
73
+ "牧場": 2,
74
+ "末期": 2,
75
+ "利益": 2,
76
+ "工夫": 2,
77
+ "一味": 2,
78
+ "魚": 3,
79
+ "区分": 2,
80
+ "施行": 4,
81
+ "施工": 2,
82
+ "転生": 2,
83
+ "博士": 2,
84
+ "法華": 2,
85
+ "真面目": 3,
86
+ "眼鏡": 2,
87
+ "文字": 2,
88
+ "文書": 3,
89
+ "律令": 2,
90
+ "現世": 2,
91
+ "日中": 2,
92
+ "夜中": 3,
93
+ "前世": 2,
94
+ "二人": 2,
95
+ "立像": 2
96
+ },
97
+ "heteronyms_not_in_bert": {
98
+ "教化": 3,
99
+ "見物": 2,
100
+ "清浄": 2,
101
+ "谷間": 2,
102
+ "追従": 2,
103
+ "墓石": 2,
104
+ "大文字": 2,
105
+ "漢書": 2,
106
+ "作法": 2,
107
+ "兵法": 2,
108
+ "大人気": 2,
109
+ "半月": 2,
110
+ "黒子": 2,
111
+ "外面": 2,
112
+ "競売": 2,
113
+ "開眼": 2,
114
+ "求道": 2,
115
+ "血脈": 2,
116
+ "施業": 2,
117
+ "借家": 2,
118
+ "頭蓋骨": 2,
119
+ "法衣": 2,
120
+ "昨日": 2,
121
+ "氷柱": 2,
122
+ "風車": 2,
123
+ "寒気": 2,
124
+ "背筋": 2,
125
+ "逆手": 2,
126
+ "色紙": 2,
127
+ "生花": 3,
128
+ "白髪": 2,
129
+ "貼付": 2,
130
+ "一回": 2,
131
+ "一期": 2,
132
+ "一月": 3,
133
+ "一所": 2,
134
+ "一寸": 2,
135
+ "一声": 2,
136
+ "一石": 2,
137
+ "一日": 4,
138
+ "一分": 3,
139
+ "一文": 3,
140
+ "一片": 3,
141
+ "何時": 3,
142
+ "何分": 2,
143
+ "火煙": 2,
144
+ "火傷": 2,
145
+ "火床": 3,
146
+ "火先": 2,
147
+ "火筒": 2,
148
+ "芥子": 3,
149
+ "気骨": 2,
150
+ "銀杏": 3,
151
+ "元金": 2,
152
+ "五分": 2,
153
+ "後々": 2,
154
+ "後生": 2,
155
+ "御供": 4,
156
+ "細々": 3,
157
+ "細目": 2,
158
+ "三位": 2,
159
+ "疾風": 3,
160
+ "菖蒲": 2,
161
+ "世人": 2,
162
+ "世路": 2,
163
+ "船底": 2,
164
+ "早急": 2,
165
+ "相乗": 2,
166
+ "造作": 2,
167
+ "他言": 2,
168
+ "東雲": 2,
169
+ "頭数": 2,
170
+ "二重": 2,
171
+ "日供": 2,
172
+ "日次": 4,
173
+ "日暮": 3,
174
+ "日来": 3,
175
+ "梅雨": 2,
176
+ "風穴": 2,
177
+ "仏語": 3,
178
+ "分別": 2,
179
+ "面子": 2,
180
+ "木目": 2,
181
+ "目下": 2,
182
+ "夜直": 2,
183
+ "夜来": 2,
184
+ "夜話": 2,
185
+ "野兎": 2,
186
+ "野馬": 3,
187
+ "野分": 2,
188
+ "野辺": 2,
189
+ "野面": 3,
190
+ "野立": 3,
191
+ "冷水": 2,
192
+ "連中": 2,
193
+ "飛沫": 2,
194
+ "翡翠": 2,
195
+ "餃子": 2,
196
+ "一足": 2,
197
+ "意気地": 2,
198
+ "一昨日": 3,
199
+ "一昨年": 2,
200
+ "十八番": 2,
201
+ "十六夜": 2,
202
+ "明後日": 2,
203
+ "石綿": 2,
204
+ "公文": 2,
205
+ "読本": 3,
206
+ "仏国": 3,
207
+ "古本": 2,
208
+ "町家": 2,
209
+ "遊行": 2
210
+ }
211
+ }
pyproject.toml ADDED
@@ -0,0 +1,65 @@

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "yomikata"
version = "0.0.1"
authors = [{name = "Sam Passaglia"}]
description = "Japanese kanji disambiguation"
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
    "License :: OSI Approved :: MIT License"
]
dynamic = ["dependencies"]

[project.urls]
"Homepage" = "https://github.com/passaglia/yomikata"
"Demo" = "https://huggingface.co/spaces/passaglia/yomikata"
"Bug Tracker" = "https://github.com/passaglia/yomikata/issues"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[tool.setuptools]
packages = ["yomikata", "config"]

[tool.flake8]
exclude = "venv"
ignore = ["E203", "E501", "W503", "E226"]
max-line-length = 79
# E501: Line too long
# W503: Line break occurred before binary operator
# E226: Missing white space around arithmetic operator
# E203: Whitespace before ':'

# iSort
[tool.isort]
profile = "black"
line_length = 79
multi_line_output = 3
include_trailing_comma = true
virtual_env = "venv"

# Black formatting
[tool.black]
line-length = 100
include = '\.pyi?$'
exclude = '''
/(
    .eggs         # exclude a few common directories
  | .git          # in the root of the project
  | .hg
  | .mypy_cache
  | .tox
  | venv
  | _build
  | buck-out
  | build
  | dist
)/
'''
requirements.txt ADDED
@@ -0,0 +1,25 @@

numpy==1.24.0
pandas==1.5.2
pretty-errors==1.2.25
fugashi==1.2.1
ipadic==1.0.0
jumandic==1.0.0
jaconv==0.3
fugashi[unidic]  # python -m unidic download
sudachidict_full
scikit-learn==1.2.0
speach==0.1a15.post1
torch==1.13.1
transformers==4.25.1
datasets==2.7.1
pynvml==11.4.1
sentencepiece==0.1.97
typer==0.7.0
rich==12.6.0
unidic-lite
japanize_matplotlib
mlflow-skinny==2.1.1
streamlit==1.18.1
black
flake8
isort
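As the inline comment on the fugashi[unidic] line notes, the full UniDic dictionary is not bundled with the package: after installing the requirements, run python -m unidic download once before the unidic backend will work. unidic-lite, also listed, ships its own small dictionary and needs no extra download.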
robot_reading.png ADDED
stores/dbert/added_tokens.json ADDED
@@ -0,0 +1,64 @@
1
+ {
2
+ "一分": 32813,
3
+ "一声": 32824,
4
+ "一寸": 32779,
5
+ "一文": 32798,
6
+ "一日": 32791,
7
+ "一昨年": 32825,
8
+ "一昨日": 32822,
9
+ "一月": 32783,
10
+ "二重": 32782,
11
+ "何分": 32772,
12
+ "何時": 32773,
13
+ "作法": 32816,
14
+ "借家": 32819,
15
+ "公文": 32780,
16
+ "冷水": 32796,
17
+ "分別": 32827,
18
+ "十八番": 32810,
19
+ "半月": 32801,
20
+ "古本": 32805,
21
+ "墓石": 32814,
22
+ "夜話": 32806,
23
+ "大文字": 32774,
24
+ "寒気": 32804,
25
+ "施業": 32775,
26
+ "日暮": 32786,
27
+ "明後日": 32808,
28
+ "昨日": 32788,
29
+ "梅雨": 32803,
30
+ "気骨": 32777,
31
+ "求道": 32784,
32
+ "法衣": 32821,
33
+ "清浄": 32785,
34
+ "漢書": 32776,
35
+ "生花": 32811,
36
+ "町家": 32797,
37
+ "疾風": 32789,
38
+ "白髪": 32794,
39
+ "相乗": 32809,
40
+ "石綿": 32781,
41
+ "競売": 32799,
42
+ "細々": 32769,
43
+ "細目": 32815,
44
+ "翡翠": 32826,
45
+ "背筋": 32823,
46
+ "船底": 32812,
47
+ "菖蒲": 32820,
48
+ "見物": 32829,
49
+ "読本": 32795,
50
+ "谷間": 32800,
51
+ "追従": 32828,
52
+ "逆手": 32778,
53
+ "造作": 32818,
54
+ "連中": 32770,
55
+ "野兎": 32807,
56
+ "銀杏": 32768,
57
+ "開眼": 32790,
58
+ "頭数": 32792,
59
+ "頭蓋骨": 32817,
60
+ "風穴": 32802,
61
+ "風車": 32793,
62
+ "飛沫": 32787,
63
+ "黒子": 32771
64
+ }
stores/dbert/config.json ADDED
@@ -0,0 +1,634 @@
1
+ {
2
+ "_name_or_path": "cl-tohoku/bert-base-japanese-v2",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4",
17
+ "5": "LABEL_5",
18
+ "6": "LABEL_6",
19
+ "7": "LABEL_7",
20
+ "8": "LABEL_8",
21
+ "9": "LABEL_9",
22
+ "10": "LABEL_10",
23
+ "11": "LABEL_11",
24
+ "12": "LABEL_12",
25
+ "13": "LABEL_13",
26
+ "14": "LABEL_14",
27
+ "15": "LABEL_15",
28
+ "16": "LABEL_16",
29
+ "17": "LABEL_17",
30
+ "18": "LABEL_18",
31
+ "19": "LABEL_19",
32
+ "20": "LABEL_20",
33
+ "21": "LABEL_21",
34
+ "22": "LABEL_22",
35
+ "23": "LABEL_23",
36
+ "24": "LABEL_24",
37
+ "25": "LABEL_25",
38
+ "26": "LABEL_26",
39
+ "27": "LABEL_27",
40
+ "28": "LABEL_28",
41
+ "29": "LABEL_29",
42
+ "30": "LABEL_30",
43
+ "31": "LABEL_31",
44
+ "32": "LABEL_32",
45
+ "33": "LABEL_33",
46
+ "34": "LABEL_34",
47
+ "35": "LABEL_35",
48
+ "36": "LABEL_36",
49
+ "37": "LABEL_37",
50
+ "38": "LABEL_38",
51
+ "39": "LABEL_39",
52
+ "40": "LABEL_40",
53
+ "41": "LABEL_41",
54
+ "42": "LABEL_42",
55
+ "43": "LABEL_43",
56
+ "44": "LABEL_44",
57
+ "45": "LABEL_45",
58
+ "46": "LABEL_46",
59
+ "47": "LABEL_47",
60
+ "48": "LABEL_48",
61
+ "49": "LABEL_49",
62
+ "50": "LABEL_50",
63
+ "51": "LABEL_51",
64
+ "52": "LABEL_52",
65
+ "53": "LABEL_53",
66
+ "54": "LABEL_54",
67
+ "55": "LABEL_55",
68
+ "56": "LABEL_56",
69
+ "57": "LABEL_57",
70
+ "58": "LABEL_58",
71
+ "59": "LABEL_59",
72
+ "60": "LABEL_60",
73
+ "61": "LABEL_61",
74
+ "62": "LABEL_62",
75
+ "63": "LABEL_63",
76
+ "64": "LABEL_64",
77
+ "65": "LABEL_65",
78
+ "66": "LABEL_66",
79
+ "67": "LABEL_67",
80
+ "68": "LABEL_68",
81
+ "69": "LABEL_69",
82
+ "70": "LABEL_70",
83
+ "71": "LABEL_71",
84
+ "72": "LABEL_72",
85
+ "73": "LABEL_73",
86
+ "74": "LABEL_74",
87
+ "75": "LABEL_75",
88
+ "76": "LABEL_76",
89
+ "77": "LABEL_77",
90
+ "78": "LABEL_78",
91
+ "79": "LABEL_79",
92
+ "80": "LABEL_80",
93
+ "81": "LABEL_81",
94
+ "82": "LABEL_82",
95
+ "83": "LABEL_83",
96
+ "84": "LABEL_84",
97
+ "85": "LABEL_85",
98
+ "86": "LABEL_86",
99
+ "87": "LABEL_87",
100
+ "88": "LABEL_88",
101
+ "89": "LABEL_89",
102
+ "90": "LABEL_90",
103
+ "91": "LABEL_91",
104
+ "92": "LABEL_92",
105
+ "93": "LABEL_93",
106
+ "94": "LABEL_94",
107
+ "95": "LABEL_95",
108
+ "96": "LABEL_96",
109
+ "97": "LABEL_97",
110
+ "98": "LABEL_98",
111
+ "99": "LABEL_99",
112
+ "100": "LABEL_100",
113
+ "101": "LABEL_101",
114
+ "102": "LABEL_102",
115
+ "103": "LABEL_103",
116
+ "104": "LABEL_104",
117
+ "105": "LABEL_105",
118
+ "106": "LABEL_106",
119
+ "107": "LABEL_107",
120
+ "108": "LABEL_108",
121
+ "109": "LABEL_109",
122
+ "110": "LABEL_110",
123
+ "111": "LABEL_111",
124
+ "112": "LABEL_112",
125
+ "113": "LABEL_113",
126
+ "114": "LABEL_114",
127
+ "115": "LABEL_115",
128
+ "116": "LABEL_116",
129
+ "117": "LABEL_117",
130
+ "118": "LABEL_118",
131
+ "119": "LABEL_119",
132
+ "120": "LABEL_120",
133
+ "121": "LABEL_121",
134
+ "122": "LABEL_122",
135
+ "123": "LABEL_123",
136
+ "124": "LABEL_124",
137
+ "125": "LABEL_125",
138
+ "126": "LABEL_126",
139
+ "127": "LABEL_127",
140
+ "128": "LABEL_128",
141
+ "129": "LABEL_129",
142
+ "130": "LABEL_130",
143
+ "131": "LABEL_131",
144
+ "132": "LABEL_132",
145
+ "133": "LABEL_133",
146
+ "134": "LABEL_134",
147
+ "135": "LABEL_135",
148
+ "136": "LABEL_136",
149
+ "137": "LABEL_137",
150
+ "138": "LABEL_138",
151
+ "139": "LABEL_139",
152
+ "140": "LABEL_140",
153
+ "141": "LABEL_141",
154
+ "142": "LABEL_142",
155
+ "143": "LABEL_143",
156
+ "144": "LABEL_144",
157
+ "145": "LABEL_145",
158
+ "146": "LABEL_146",
159
+ "147": "LABEL_147",
160
+ "148": "LABEL_148",
161
+ "149": "LABEL_149",
162
+ "150": "LABEL_150",
163
+ "151": "LABEL_151",
164
+ "152": "LABEL_152",
165
+ "153": "LABEL_153",
166
+ "154": "LABEL_154",
167
+ "155": "LABEL_155",
168
+ "156": "LABEL_156",
169
+ "157": "LABEL_157",
170
+ "158": "LABEL_158",
171
+ "159": "LABEL_159",
172
+ "160": "LABEL_160",
173
+ "161": "LABEL_161",
174
+ "162": "LABEL_162",
175
+ "163": "LABEL_163",
176
+ "164": "LABEL_164",
177
+ "165": "LABEL_165",
178
+ "166": "LABEL_166",
179
+ "167": "LABEL_167",
180
+ "168": "LABEL_168",
181
+ "169": "LABEL_169",
182
+ "170": "LABEL_170",
183
+ "171": "LABEL_171",
184
+ "172": "LABEL_172",
185
+ "173": "LABEL_173",
186
+ "174": "LABEL_174",
187
+ "175": "LABEL_175",
188
+ "176": "LABEL_176",
189
+ "177": "LABEL_177",
190
+ "178": "LABEL_178",
191
+ "179": "LABEL_179",
192
+ "180": "LABEL_180",
193
+ "181": "LABEL_181",
194
+ "182": "LABEL_182",
195
+ "183": "LABEL_183",
196
+ "184": "LABEL_184",
197
+ "185": "LABEL_185",
198
+ "186": "LABEL_186",
199
+ "187": "LABEL_187",
200
+ "188": "LABEL_188",
201
+ "189": "LABEL_189",
202
+ "190": "LABEL_190",
203
+ "191": "LABEL_191",
204
+ "192": "LABEL_192",
205
+ "193": "LABEL_193",
206
+ "194": "LABEL_194",
207
+ "195": "LABEL_195",
208
+ "196": "LABEL_196",
209
+ "197": "LABEL_197",
210
+ "198": "LABEL_198",
211
+ "199": "LABEL_199",
212
+ "200": "LABEL_200",
213
+ "201": "LABEL_201",
214
+ "202": "LABEL_202",
215
+ "203": "LABEL_203",
216
+ "204": "LABEL_204",
217
+ "205": "LABEL_205",
218
+ "206": "LABEL_206",
219
+ "207": "LABEL_207",
220
+ "208": "LABEL_208",
221
+ "209": "LABEL_209",
222
+ "210": "LABEL_210",
223
+ "211": "LABEL_211",
224
+ "212": "LABEL_212",
225
+ "213": "LABEL_213",
226
+ "214": "LABEL_214",
227
+ "215": "LABEL_215",
228
+ "216": "LABEL_216",
229
+ "217": "LABEL_217",
230
+ "218": "LABEL_218",
231
+ "219": "LABEL_219",
232
+ "220": "LABEL_220",
233
+ "221": "LABEL_221",
234
+ "222": "LABEL_222",
235
+ "223": "LABEL_223",
236
+ "224": "LABEL_224",
237
+ "225": "LABEL_225",
238
+ "226": "LABEL_226",
239
+ "227": "LABEL_227",
240
+ "228": "LABEL_228",
241
+ "229": "LABEL_229",
242
+ "230": "LABEL_230",
243
+ "231": "LABEL_231",
244
+ "232": "LABEL_232",
245
+ "233": "LABEL_233",
246
+ "234": "LABEL_234",
247
+ "235": "LABEL_235",
248
+ "236": "LABEL_236",
249
+ "237": "LABEL_237",
250
+ "238": "LABEL_238",
251
+ "239": "LABEL_239",
252
+ "240": "LABEL_240",
253
+ "241": "LABEL_241",
254
+ "242": "LABEL_242",
255
+ "243": "LABEL_243",
256
+ "244": "LABEL_244",
257
+ "245": "LABEL_245",
258
+ "246": "LABEL_246",
259
+ "247": "LABEL_247",
260
+ "248": "LABEL_248",
261
+ "249": "LABEL_249",
262
+ "250": "LABEL_250",
263
+ "251": "LABEL_251",
264
+ "252": "LABEL_252",
265
+ "253": "LABEL_253",
266
+ "254": "LABEL_254",
267
+ "255": "LABEL_255",
268
+ "256": "LABEL_256",
269
+ "257": "LABEL_257",
270
+ "258": "LABEL_258",
271
+ "259": "LABEL_259",
272
+ "260": "LABEL_260",
273
+ "261": "LABEL_261",
274
+ "262": "LABEL_262",
275
+ "263": "LABEL_263",
276
+ "264": "LABEL_264",
277
+ "265": "LABEL_265",
278
+ "266": "LABEL_266",
279
+ "267": "LABEL_267",
280
+ "268": "LABEL_268",
281
+ "269": "LABEL_269",
282
+ "270": "LABEL_270",
283
+ "271": "LABEL_271",
284
+ "272": "LABEL_272",
285
+ "273": "LABEL_273",
286
+ "274": "LABEL_274",
287
+ "275": "LABEL_275",
288
+ "276": "LABEL_276",
289
+ "277": "LABEL_277",
290
+ "278": "LABEL_278",
291
+ "279": "LABEL_279",
292
+ "280": "LABEL_280",
293
+ "281": "LABEL_281",
294
+ "282": "LABEL_282",
295
+ "283": "LABEL_283",
296
+ "284": "LABEL_284",
297
+ "285": "LABEL_285",
298
+ "286": "LABEL_286",
299
+ "287": "LABEL_287",
300
+ "288": "LABEL_288",
301
+ "289": "LABEL_289",
302
+ "290": "LABEL_290",
303
+ "291": "LABEL_291",
304
+ "292": "LABEL_292",
305
+ "293": "LABEL_293",
306
+ "294": "LABEL_294",
307
+ "295": "LABEL_295",
308
+ "296": "LABEL_296",
309
+ "297": "LABEL_297",
310
+ "298": "LABEL_298",
311
+ "299": "LABEL_299",
312
+ "300": "LABEL_300",
313
+ "301": "LABEL_301"
314
+ },
315
+ "initializer_range": 0.02,
316
+ "intermediate_size": 3072,
317
+ "label2id": {
318
+ "LABEL_0": 0,
319
+ "LABEL_1": 1,
320
+ "LABEL_10": 10,
321
+ "LABEL_100": 100,
322
+ "LABEL_101": 101,
323
+ "LABEL_102": 102,
324
+ "LABEL_103": 103,
325
+ "LABEL_104": 104,
326
+ "LABEL_105": 105,
327
+ "LABEL_106": 106,
328
+ "LABEL_107": 107,
329
+ "LABEL_108": 108,
330
+ "LABEL_109": 109,
331
+ "LABEL_11": 11,
332
+ "LABEL_110": 110,
333
+ "LABEL_111": 111,
334
+ "LABEL_112": 112,
335
+ "LABEL_113": 113,
336
+ "LABEL_114": 114,
337
+ "LABEL_115": 115,
338
+ "LABEL_116": 116,
339
+ "LABEL_117": 117,
340
+ "LABEL_118": 118,
341
+ "LABEL_119": 119,
342
+ "LABEL_12": 12,
343
+ "LABEL_120": 120,
344
+ "LABEL_121": 121,
345
+ "LABEL_122": 122,
346
+ "LABEL_123": 123,
347
+ "LABEL_124": 124,
348
+ "LABEL_125": 125,
349
+ "LABEL_126": 126,
350
+ "LABEL_127": 127,
351
+ "LABEL_128": 128,
352
+ "LABEL_129": 129,
353
+ "LABEL_13": 13,
354
+ "LABEL_130": 130,
355
+ "LABEL_131": 131,
356
+ "LABEL_132": 132,
357
+ "LABEL_133": 133,
358
+ "LABEL_134": 134,
359
+ "LABEL_135": 135,
360
+ "LABEL_136": 136,
361
+ "LABEL_137": 137,
362
+ "LABEL_138": 138,
363
+ "LABEL_139": 139,
364
+ "LABEL_14": 14,
365
+ "LABEL_140": 140,
366
+ "LABEL_141": 141,
367
+ "LABEL_142": 142,
368
+ "LABEL_143": 143,
369
+ "LABEL_144": 144,
370
+ "LABEL_145": 145,
371
+ "LABEL_146": 146,
372
+ "LABEL_147": 147,
373
+ "LABEL_148": 148,
374
+ "LABEL_149": 149,
375
+ "LABEL_15": 15,
376
+ "LABEL_150": 150,
377
+ "LABEL_151": 151,
378
+ "LABEL_152": 152,
379
+ "LABEL_153": 153,
380
+ "LABEL_154": 154,
381
+ "LABEL_155": 155,
382
+ "LABEL_156": 156,
383
+ "LABEL_157": 157,
384
+ "LABEL_158": 158,
385
+ "LABEL_159": 159,
386
+ "LABEL_16": 16,
387
+ "LABEL_160": 160,
388
+ "LABEL_161": 161,
389
+ "LABEL_162": 162,
390
+ "LABEL_163": 163,
391
+ "LABEL_164": 164,
392
+ "LABEL_165": 165,
393
+ "LABEL_166": 166,
394
+ "LABEL_167": 167,
395
+ "LABEL_168": 168,
396
+ "LABEL_169": 169,
397
+ "LABEL_17": 17,
398
+ "LABEL_170": 170,
399
+ "LABEL_171": 171,
400
+ "LABEL_172": 172,
401
+ "LABEL_173": 173,
402
+ "LABEL_174": 174,
403
+ "LABEL_175": 175,
404
+ "LABEL_176": 176,
405
+ "LABEL_177": 177,
406
+ "LABEL_178": 178,
407
+ "LABEL_179": 179,
408
+ "LABEL_18": 18,
409
+ "LABEL_180": 180,
410
+ "LABEL_181": 181,
411
+ "LABEL_182": 182,
412
+ "LABEL_183": 183,
413
+ "LABEL_184": 184,
414
+ "LABEL_185": 185,
415
+ "LABEL_186": 186,
416
+ "LABEL_187": 187,
417
+ "LABEL_188": 188,
418
+ "LABEL_189": 189,
419
+ "LABEL_19": 19,
420
+ "LABEL_190": 190,
421
+ "LABEL_191": 191,
422
+ "LABEL_192": 192,
423
+ "LABEL_193": 193,
424
+ "LABEL_194": 194,
425
+ "LABEL_195": 195,
426
+ "LABEL_196": 196,
427
+ "LABEL_197": 197,
428
+ "LABEL_198": 198,
429
+ "LABEL_199": 199,
430
+ "LABEL_2": 2,
431
+ "LABEL_20": 20,
432
+ "LABEL_200": 200,
433
+ "LABEL_201": 201,
434
+ "LABEL_202": 202,
435
+ "LABEL_203": 203,
436
+ "LABEL_204": 204,
437
+ "LABEL_205": 205,
438
+ "LABEL_206": 206,
439
+ "LABEL_207": 207,
440
+ "LABEL_208": 208,
441
+ "LABEL_209": 209,
442
+ "LABEL_21": 21,
443
+ "LABEL_210": 210,
444
+ "LABEL_211": 211,
445
+ "LABEL_212": 212,
446
+ "LABEL_213": 213,
447
+ "LABEL_214": 214,
448
+ "LABEL_215": 215,
449
+ "LABEL_216": 216,
450
+ "LABEL_217": 217,
451
+ "LABEL_218": 218,
452
+ "LABEL_219": 219,
453
+ "LABEL_22": 22,
454
+ "LABEL_220": 220,
455
+ "LABEL_221": 221,
456
+ "LABEL_222": 222,
457
+ "LABEL_223": 223,
458
+ "LABEL_224": 224,
459
+ "LABEL_225": 225,
460
+ "LABEL_226": 226,
461
+ "LABEL_227": 227,
462
+ "LABEL_228": 228,
463
+ "LABEL_229": 229,
464
+ "LABEL_23": 23,
465
+ "LABEL_230": 230,
466
+ "LABEL_231": 231,
467
+ "LABEL_232": 232,
468
+ "LABEL_233": 233,
469
+ "LABEL_234": 234,
470
+ "LABEL_235": 235,
471
+ "LABEL_236": 236,
472
+ "LABEL_237": 237,
473
+ "LABEL_238": 238,
474
+ "LABEL_239": 239,
475
+ "LABEL_24": 24,
476
+ "LABEL_240": 240,
477
+ "LABEL_241": 241,
478
+ "LABEL_242": 242,
479
+ "LABEL_243": 243,
480
+ "LABEL_244": 244,
481
+ "LABEL_245": 245,
482
+ "LABEL_246": 246,
483
+ "LABEL_247": 247,
484
+ "LABEL_248": 248,
485
+ "LABEL_249": 249,
486
+ "LABEL_25": 25,
487
+ "LABEL_250": 250,
488
+ "LABEL_251": 251,
489
+ "LABEL_252": 252,
490
+ "LABEL_253": 253,
491
+ "LABEL_254": 254,
492
+ "LABEL_255": 255,
493
+ "LABEL_256": 256,
494
+ "LABEL_257": 257,
495
+ "LABEL_258": 258,
496
+ "LABEL_259": 259,
497
+ "LABEL_26": 26,
498
+ "LABEL_260": 260,
499
+ "LABEL_261": 261,
500
+ "LABEL_262": 262,
501
+ "LABEL_263": 263,
502
+ "LABEL_264": 264,
503
+ "LABEL_265": 265,
504
+ "LABEL_266": 266,
505
+ "LABEL_267": 267,
506
+ "LABEL_268": 268,
507
+ "LABEL_269": 269,
508
+ "LABEL_27": 27,
509
+ "LABEL_270": 270,
510
+ "LABEL_271": 271,
511
+ "LABEL_272": 272,
512
+ "LABEL_273": 273,
513
+ "LABEL_274": 274,
514
+ "LABEL_275": 275,
515
+ "LABEL_276": 276,
516
+ "LABEL_277": 277,
517
+ "LABEL_278": 278,
518
+ "LABEL_279": 279,
519
+ "LABEL_28": 28,
520
+ "LABEL_280": 280,
521
+ "LABEL_281": 281,
522
+ "LABEL_282": 282,
523
+ "LABEL_283": 283,
524
+ "LABEL_284": 284,
525
+ "LABEL_285": 285,
526
+ "LABEL_286": 286,
527
+ "LABEL_287": 287,
528
+ "LABEL_288": 288,
529
+ "LABEL_289": 289,
530
+ "LABEL_29": 29,
531
+ "LABEL_290": 290,
532
+ "LABEL_291": 291,
533
+ "LABEL_292": 292,
534
+ "LABEL_293": 293,
535
+ "LABEL_294": 294,
536
+ "LABEL_295": 295,
537
+ "LABEL_296": 296,
538
+ "LABEL_297": 297,
539
+ "LABEL_298": 298,
540
+ "LABEL_299": 299,
541
+ "LABEL_3": 3,
542
+ "LABEL_30": 30,
543
+ "LABEL_300": 300,
544
+ "LABEL_301": 301,
545
+ "LABEL_31": 31,
546
+ "LABEL_32": 32,
547
+ "LABEL_33": 33,
548
+ "LABEL_34": 34,
549
+ "LABEL_35": 35,
550
+ "LABEL_36": 36,
551
+ "LABEL_37": 37,
552
+ "LABEL_38": 38,
553
+ "LABEL_39": 39,
554
+ "LABEL_4": 4,
555
+ "LABEL_40": 40,
556
+ "LABEL_41": 41,
557
+ "LABEL_42": 42,
558
+ "LABEL_43": 43,
559
+ "LABEL_44": 44,
560
+ "LABEL_45": 45,
561
+ "LABEL_46": 46,
562
+ "LABEL_47": 47,
563
+ "LABEL_48": 48,
564
+ "LABEL_49": 49,
565
+ "LABEL_5": 5,
566
+ "LABEL_50": 50,
567
+ "LABEL_51": 51,
568
+ "LABEL_52": 52,
569
+ "LABEL_53": 53,
570
+ "LABEL_54": 54,
571
+ "LABEL_55": 55,
572
+ "LABEL_56": 56,
573
+ "LABEL_57": 57,
574
+ "LABEL_58": 58,
575
+ "LABEL_59": 59,
576
+ "LABEL_6": 6,
577
+ "LABEL_60": 60,
578
+ "LABEL_61": 61,
579
+ "LABEL_62": 62,
580
+ "LABEL_63": 63,
581
+ "LABEL_64": 64,
582
+ "LABEL_65": 65,
583
+ "LABEL_66": 66,
584
+ "LABEL_67": 67,
585
+ "LABEL_68": 68,
586
+ "LABEL_69": 69,
587
+ "LABEL_7": 7,
588
+ "LABEL_70": 70,
589
+ "LABEL_71": 71,
590
+ "LABEL_72": 72,
591
+ "LABEL_73": 73,
592
+ "LABEL_74": 74,
593
+ "LABEL_75": 75,
594
+ "LABEL_76": 76,
595
+ "LABEL_77": 77,
596
+ "LABEL_78": 78,
597
+ "LABEL_79": 79,
598
+ "LABEL_8": 8,
599
+ "LABEL_80": 80,
600
+ "LABEL_81": 81,
601
+ "LABEL_82": 82,
602
+ "LABEL_83": 83,
603
+ "LABEL_84": 84,
604
+ "LABEL_85": 85,
605
+ "LABEL_86": 86,
606
+ "LABEL_87": 87,
607
+ "LABEL_88": 88,
608
+ "LABEL_89": 89,
609
+ "LABEL_9": 9,
610
+ "LABEL_90": 90,
611
+ "LABEL_91": 91,
612
+ "LABEL_92": 92,
613
+ "LABEL_93": 93,
614
+ "LABEL_94": 94,
615
+ "LABEL_95": 95,
616
+ "LABEL_96": 96,
617
+ "LABEL_97": 97,
618
+ "LABEL_98": 98,
619
+ "LABEL_99": 99
620
+ },
621
+ "layer_norm_eps": 1e-12,
622
+ "max_position_embeddings": 512,
623
+ "model_type": "bert",
624
+ "num_attention_heads": 12,
625
+ "num_hidden_layers": 12,
626
+ "pad_token_id": 0,
627
+ "position_embedding_type": "absolute",
628
+ "tokenizer_class": "BertJapaneseTokenizer",
629
+ "torch_dtype": "float32",
630
+ "transformers_version": "4.25.1",
631
+ "type_vocab_size": 2,
632
+ "use_cache": true,
633
+ "vocab_size": 32830
634
+ }
stores/dbert/heteronyms.json ADDED
@@ -0,0 +1,567 @@
1
+ {
2
+ "表": {
3
+ "ひょう": 3349,
4
+ "おもて": 3034,
5
+ "あらわ": 2474,
6
+ "あら": 731
7
+ },
8
+ "角": {
9
+ "かく": 4360,
10
+ "かど": 2303,
11
+ "つの": 372,
12
+ "すみ": 70
13
+ },
14
+ "大分": {
15
+ "おおいた": 3358,
16
+ "だいぶ": 797,
17
+ "だいぶん": 97
18
+ },
19
+ "国立": {
20
+ "こくりつ": 19256,
21
+ "くにたち": 246
22
+ },
23
+ "人気": {
24
+ "にんき": 7383,
25
+ "ひとけ": 149,
26
+ "じんき": 44
27
+ },
28
+ "市場": {
29
+ "しじょう": 85107,
30
+ "いちば": 781
31
+ },
32
+ "気質": {
33
+ "きしつ": 1108,
34
+ "かたぎ": 398
35
+ },
36
+ "上方": {
37
+ "かみがた": 1411,
38
+ "じょうほう": 656
39
+ },
40
+ "上手": {
41
+ "じょうず": 8065,
42
+ "うま": 706,
43
+ "かみて": 150,
44
+ "うわて": 57
45
+ },
46
+ "下手": {
47
+ "へた": 849,
48
+ "したて": 128,
49
+ "べた": 121,
50
+ "しもて": 50
51
+ },
52
+ "仮名": {
53
+ "かな": 1407,
54
+ "がな": 129,
55
+ "かめい": 115
56
+ },
57
+ "礼拝": {
58
+ "れいはい": 841,
59
+ "らいはい": 62
60
+ },
61
+ "遺言": {
62
+ "ゆいごん": 3152,
63
+ "いげん": 67,
64
+ "いごん": 57
65
+ },
66
+ "口腔": {
67
+ "こうこう": 6475,
68
+ "こうくう": 5577
69
+ },
70
+ "骨": {
71
+ "ほね": 10697,
72
+ "こつ": 5870
73
+ },
74
+ "一途": {
75
+ "いちず": 576,
76
+ "いっと": 139
77
+ },
78
+ "一言": {
79
+ "ひとこと": 2567,
80
+ "いちげん": 133,
81
+ "いちごん": 106
82
+ },
83
+ "最中": {
84
+ "さいちゅう": 520,
85
+ "さなか": 43
86
+ },
87
+ "一目": {
88
+ "ひとめ": 1596,
89
+ "いちもく": 210
90
+ },
91
+ "係": {
92
+ "かか": 14218,
93
+ "かかわ": 9804,
94
+ "がかり": 234,
95
+ "かかり": 227
96
+ },
97
+ "足跡": {
98
+ "あしあと": 2626,
99
+ "そくせき": 1862
100
+ },
101
+ "今日": {
102
+ "きょう": 17624,
103
+ "こんにち": 6772
104
+ },
105
+ "明日": {
106
+ "あす": 9824,
107
+ "あした": 6606,
108
+ "みょうにち": 66
109
+ },
110
+ "生物": {
111
+ "せいぶつ": 26088,
112
+ "いきもの": 55
113
+ },
114
+ "変化": {
115
+ "へんか": 87895,
116
+ "へんげ": 337
117
+ },
118
+ "大事": {
119
+ "だいじ": 5293,
120
+ "おおごと": 54
121
+ },
122
+ "大家": {
123
+ "たいか": 586,
124
+ "おおや": 238,
125
+ "たいけ": 79
126
+ },
127
+ "心中": {
128
+ "しんじゅう": 1541,
129
+ "しんちゅう": 250,
130
+ "しんぢゅう": 127
131
+ },
132
+ "一行": {
133
+ "いっこう": 1112,
134
+ "いちぎょう": 95
135
+ },
136
+ "一時": {
137
+ "いちじ": 2649,
138
+ "いっとき": 381,
139
+ "いちどき": 47
140
+ },
141
+ "一方": {
142
+ "いっぽう": 5327,
143
+ "ひとかた": 112,
144
+ "いちほう": 42
145
+ },
146
+ "一夜": {
147
+ "いちや": 1148,
148
+ "ひとよ": 82
149
+ },
150
+ "下野": {
151
+ "しもつけ": 530,
152
+ "げや": 104,
153
+ "しもの": 57
154
+ },
155
+ "花弁": {
156
+ "かべん": 213,
157
+ "はなびら": 58
158
+ },
159
+ "玩具": {
160
+ "がんぐ": 1354,
161
+ "おもちゃ": 238
162
+ },
163
+ "強力": {
164
+ "きょうりょく": 2319,
165
+ "ごうりき": 51
166
+ },
167
+ "金色": {
168
+ "きんいろ": 942,
169
+ "こんじき": 484
170
+ },
171
+ "経緯": {
172
+ "けいい": 7659,
173
+ "いきさつ": 56
174
+ },
175
+ "故郷": {
176
+ "こきょう": 3840,
177
+ "ふるさと": 506,
178
+ "くに": 122
179
+ },
180
+ "紅葉": {
181
+ "こうよう": 856,
182
+ "もみじ": 339
183
+ },
184
+ "根本": {
185
+ "こんぽん": 2872,
186
+ "ねもと": 262
187
+ },
188
+ "山陰": {
189
+ "さんいん": 2094,
190
+ "やまかげ": 51
191
+ },
192
+ "上下": {
193
+ "じょうげ": 1549,
194
+ "うえした": 97
195
+ },
196
+ "身体": {
197
+ "しんたい": 20301,
198
+ "からだ": 3375
199
+ },
200
+ "水面": {
201
+ "すいめん": 1387,
202
+ "みなも": 91
203
+ },
204
+ "世論": {
205
+ "よろん": 4554,
206
+ "せろん": 1934
207
+ },
208
+ "清水": {
209
+ "しみず": 4114,
210
+ "きよみず": 98
211
+ },
212
+ "大手": {
213
+ "おおて": 6695,
214
+ "おおで": 119
215
+ },
216
+ "大人": {
217
+ "おとな": 11037,
218
+ "たいじん": 113,
219
+ "うし": 59
220
+ },
221
+ "大勢": {
222
+ "おおぜい": 1290,
223
+ "たいせい": 398
224
+ },
225
+ "中間": {
226
+ "ちゅうかん": 17669,
227
+ "ちゅうげん": 144
228
+ },
229
+ "日向": {
230
+ "ひゅうが": 800,
231
+ "ひなた": 318
232
+ },
233
+ "夫婦": {
234
+ "ふうふ": 9165,
235
+ "めおと": 354
236
+ },
237
+ "牧場": {
238
+ "ぼくじょう": 1913,
239
+ "まきば": 159
240
+ },
241
+ "末期": {
242
+ "まっき": 3569,
243
+ "まつご": 78
244
+ },
245
+ "利益": {
246
+ "りえき": 13434,
247
+ "りやく": 209
248
+ },
249
+ "一味": {
250
+ "いちみ": 442,
251
+ "ひとあじ": 60
252
+ },
253
+ "魚": {
254
+ "さかな": 5857,
255
+ "うお": 1706,
256
+ "ぎょ": 413,
257
+ "ざかな": 50
258
+ },
259
+ "施行": {
260
+ "しこう": 18724,
261
+ "せこう": 70
262
+ },
263
+ "施工": {
264
+ "せこう": 25734,
265
+ "しこう": 48,
266
+ "せこ": 43
267
+ },
268
+ "転生": {
269
+ "てんせい": 911,
270
+ "てんしょう": 175
271
+ },
272
+ "博士": {
273
+ "はくし": 17017,
274
+ "はかせ": 2462
275
+ },
276
+ "眼鏡": {
277
+ "めがね": 2040,
278
+ "がんきょう": 102
279
+ },
280
+ "文字": {
281
+ "もじ": 9583,
282
+ "もんじ": 633
283
+ },
284
+ "文書": {
285
+ "ぶんしょ": 15094,
286
+ "もんじょ": 5879,
287
+ "もんしょ": 51
288
+ },
289
+ "現世": {
290
+ "げんせい": 192,
291
+ "げんせ": 125
292
+ },
293
+ "日中": {
294
+ "にっちゅう": 12478,
295
+ "にちじゅう": 117
296
+ },
297
+ "夜中": {
298
+ "よなか": 723,
299
+ "やちゅう": 106
300
+ },
301
+ "二人": {
302
+ "ふたり": 22151,
303
+ "ににん": 256
304
+ },
305
+ "見物": {
306
+ "けんぶつ": 1832,
307
+ "みもの": 61
308
+ },
309
+ "清浄": {
310
+ "せいじょう": 800,
311
+ "しょうじょう": 46
312
+ },
313
+ "谷間": {
314
+ "たにま": 1089,
315
+ "たにあい": 67
316
+ },
317
+ "追従": {
318
+ "ついじゅう": 1000,
319
+ "ついしょう": 73
320
+ },
321
+ "墓石": {
322
+ "はかいし": 323,
323
+ "ぼせき": 257
324
+ },
325
+ "大文字": {
326
+ "おおもじ": 65,
327
+ "だいもんじ": 46
328
+ },
329
+ "漢書": {
330
+ "かんじょ": 171,
331
+ "かんしょ": 66,
332
+ "からぶみ": 47
333
+ },
334
+ "作法": {
335
+ "さほう": 3905,
336
+ "さくほう": 427
337
+ },
338
+ "半月": {
339
+ "はんつき": 388,
340
+ "はんげつ": 85
341
+ },
342
+ "黒子": {
343
+ "ほくろ": 200,
344
+ "くろこ": 183
345
+ },
346
+ "競売": {
347
+ "けいばい": 937,
348
+ "きょうばい": 332
349
+ },
350
+ "開眼": {
351
+ "かいげん": 338,
352
+ "かいがん": 144
353
+ },
354
+ "求道": {
355
+ "きゅうどう": 379,
356
+ "ぐどう": 81
357
+ },
358
+ "施業": {
359
+ "せぎょう": 602,
360
+ "しぎょう": 264
361
+ },
362
+ "借家": {
363
+ "しゃっか": 505,
364
+ "しゃくや": 394
365
+ },
366
+ "頭蓋骨": {
367
+ "ずがいこつ": 377,
368
+ "とうがいこつ": 187
369
+ },
370
+ "法衣": {
371
+ "ころも": 115,
372
+ "ほうえ": 87
373
+ },
374
+ "昨日": {
375
+ "きのう": 2670,
376
+ "さくじつ": 713
377
+ },
378
+ "風車": {
379
+ "ふうしゃ": 1133,
380
+ "かざぐるま": 678
381
+ },
382
+ "寒気": {
383
+ "かんき": 153,
384
+ "さむけ": 79
385
+ },
386
+ "背筋": {
387
+ "せすじ": 177,
388
+ "はいきん": 43
389
+ },
390
+ "逆手": {
391
+ "さかて": 169,
392
+ "ぎゃくて": 116
393
+ },
394
+ "生花": {
395
+ "いけばな": 283,
396
+ "せいか": 91
397
+ },
398
+ "白髪": {
399
+ "しらが": 313,
400
+ "はくはつ": 113
401
+ },
402
+ "一月": {
403
+ "ひとつき": 301,
404
+ "いちがつ": 282
405
+ },
406
+ "一寸": {
407
+ "ちょっと": 1481,
408
+ "いっすん": 111
409
+ },
410
+ "一声": {
411
+ "ひとこえ": 253,
412
+ "いっせい": 109
413
+ },
414
+ "一日": {
415
+ "いちにち": 1711,
416
+ "ついたち": 866,
417
+ "いちじつ": 41
418
+ },
419
+ "一分": {
420
+ "いちぶん": 75,
421
+ "いちぶ": 62
422
+ },
423
+ "一文": {
424
+ "いちもん": 86,
425
+ "いちぶん": 48
426
+ },
427
+ "何時": {
428
+ "いつ": 1248,
429
+ "なんじ": 159,
430
+ "なんどき": 63
431
+ },
432
+ "何分": {
433
+ "なにぶん": 379,
434
+ "なんぷん": 51
435
+ },
436
+ "気骨": {
437
+ "きこつ": 140,
438
+ "きぼね": 67
439
+ },
440
+ "銀杏": {
441
+ "いちょう": 322,
442
+ "ぎんなん": 85
443
+ },
444
+ "細々": {
445
+ "こまごま": 88,
446
+ "ほそぼそ": 67
447
+ },
448
+ "細目": {
449
+ "さいもく": 962,
450
+ "ほそめ": 123
451
+ },
452
+ "疾風": {
453
+ "しっぷう": 544,
454
+ "はやて": 94,
455
+ "かぜ": 68
456
+ },
457
+ "菖蒲": {
458
+ "しょうぶ": 165,
459
+ "あやめ": 65
460
+ },
461
+ "船底": {
462
+ "せんてい": 246,
463
+ "ふなぞこ": 80
464
+ },
465
+ "相乗": {
466
+ "そうじょう": 732,
467
+ "あいの": 89
468
+ },
469
+ "造作": {
470
+ "ぞうさ": 188,
471
+ "ぞうさく": 65
472
+ },
473
+ "頭数": {
474
+ "あたまかず": 168,
475
+ "とうすう": 119
476
+ },
477
+ "二重": {
478
+ "にじゅう": 5418,
479
+ "ふたえ": 65
480
+ },
481
+ "日暮": {
482
+ "ひぐ": 403,
483
+ "ひぐれ": 97,
484
+ "ひぐらし": 81
485
+ },
486
+ "梅雨": {
487
+ "つゆ": 471,
488
+ "ばいう": 284
489
+ },
490
+ "風穴": {
491
+ "かざあな": 300,
492
+ "ふうけつ": 68
493
+ },
494
+ "分別": {
495
+ "ふんべつ": 1280,
496
+ "ぶんべつ": 635
497
+ },
498
+ "夜話": {
499
+ "やわ": 2153,
500
+ "よばなし": 52
501
+ },
502
+ "野兎": {
503
+ "やと": 176,
504
+ "のうさぎ": 43
505
+ },
506
+ "冷水": {
507
+ "れいすい": 189,
508
+ "ひやみず": 153
509
+ },
510
+ "連中": {
511
+ "れんじゅう": 853,
512
+ "れんちゅう": 691
513
+ },
514
+ "飛沫": {
515
+ "ひまつ": 223,
516
+ "しぶき": 96
517
+ },
518
+ "翡翠": {
519
+ "ひすい": 177,
520
+ "かわせみ": 94
521
+ },
522
+ "一昨日": {
523
+ "おととい": 208,
524
+ "いっさくじつ": 71
525
+ },
526
+ "一昨年": {
527
+ "おととし": 72,
528
+ "いっさくねん": 59
529
+ },
530
+ "十八番": {
531
+ "じゅうはちばん": 212,
532
+ "おはこ": 41
533
+ },
534
+ "明後日": {
535
+ "あさって": 186,
536
+ "みょうごにち": 60
537
+ },
538
+ "石綿": {
539
+ "いしわた": 1702,
540
+ "せきめん": 360
541
+ },
542
+ "公文": {
543
+ "こうぶん": 196,
544
+ "くもん": 46
545
+ },
546
+ "読本": {
547
+ "どくほん": 12176,
548
+ "とくほん": 2414,
549
+ "よみほん": 121
550
+ },
551
+ "古本": {
552
+ "ふるほん": 550,
553
+ "こほん": 109
554
+ },
555
+ "町家": {
556
+ "まちや": 655,
557
+ "ちょうか": 216
558
+ },
559
+ "米": {
560
+ "べい": 17392,
561
+ "こめ": 9021,
562
+ "まい": 2829,
563
+ "よね": 620,
564
+ "ごめ": 164,
565
+ "めーとる": 112
566
+ }
567
+ }
stores/dbert/label_encoder.json ADDED
@@ -0,0 +1,306 @@
{
    "class_to_index": {
        "<OTHER>": 0,
        "\u4e00\u5206:\u3044\u3061\u3076": 1,
        "\u4e00\u5206:\u3044\u3061\u3076\u3093": 2,
        "\u4e00\u5473:\u3044\u3061\u307f": 3,
        "\u4e00\u5473:\u3072\u3068\u3042\u3058": 4,
        "\u4e00\u58f0:\u3044\u3063\u305b\u3044": 5,
        "\u4e00\u58f0:\u3072\u3068\u3053\u3048": 6,
        "\u4e00\u591c:\u3044\u3061\u3084": 7,
        "\u4e00\u591c:\u3072\u3068\u3088": 8,
        "\u4e00\u5bf8:\u3044\u3063\u3059\u3093": 9,
        "\u4e00\u5bf8:\u3061\u3087\u3063\u3068": 10,
        "\u4e00\u6587:\u3044\u3061\u3076\u3093": 11,
        "\u4e00\u6587:\u3044\u3061\u3082\u3093": 12,
        "\u4e00\u65b9:\u3044\u3061\u307b\u3046": 13,
        "\u4e00\u65b9:\u3044\u3063\u307d\u3046": 14,
        "\u4e00\u65b9:\u3072\u3068\u304b\u305f": 15,
        "\u4e00\u65e5:\u3044\u3061\u3058\u3064": 16,
        "\u4e00\u65e5:\u3044\u3061\u306b\u3061": 17,
        "\u4e00\u65e5:\u3064\u3044\u305f\u3061": 18,
        "\u4e00\u6628\u5e74:\u3044\u3063\u3055\u304f\u306d\u3093": 19,
        "\u4e00\u6628\u5e74:\u304a\u3068\u3068\u3057": 20,
        "\u4e00\u6628\u65e5:\u3044\u3063\u3055\u304f\u3058\u3064": 21,
        "\u4e00\u6628\u65e5:\u304a\u3068\u3068\u3044": 22,
        "\u4e00\u6642:\u3044\u3061\u3058": 23,
        "\u4e00\u6642:\u3044\u3061\u3069\u304d": 24,
        "\u4e00\u6642:\u3044\u3063\u3068\u304d": 25,
        "\u4e00\u6708:\u3044\u3061\u304c\u3064": 26,
        "\u4e00\u6708:\u3072\u3068\u3064\u304d": 27,
        "\u4e00\u76ee:\u3044\u3061\u3082\u304f": 28,
        "\u4e00\u76ee:\u3072\u3068\u3081": 29,
        "\u4e00\u884c:\u3044\u3061\u304e\u3087\u3046": 30,
        "\u4e00\u884c:\u3044\u3063\u3053\u3046": 31,
        "\u4e00\u8a00:\u3044\u3061\u3052\u3093": 32,
        "\u4e00\u8a00:\u3044\u3061\u3054\u3093": 33,
        "\u4e00\u8a00:\u3072\u3068\u3053\u3068": 34,
        "\u4e00\u9014:\u3044\u3061\u305a": 35,
        "\u4e00\u9014:\u3044\u3063\u3068": 36,
        "\u4e0a\u4e0b:\u3046\u3048\u3057\u305f": 37,
        "\u4e0a\u4e0b:\u3058\u3087\u3046\u3052": 38,
        "\u4e0a\u624b:\u3046\u307e": 39,
        "\u4e0a\u624b:\u3046\u308f\u3066": 40,
        "\u4e0a\u624b:\u304b\u307f\u3066": 41,
        "\u4e0a\u624b:\u3058\u3087\u3046\u305a": 42,
        "\u4e0a\u65b9:\u304b\u307f\u304c\u305f": 43,
        "\u4e0a\u65b9:\u3058\u3087\u3046\u307b\u3046": 44,
        "\u4e0b\u624b:\u3057\u305f\u3066": 45,
        "\u4e0b\u624b:\u3057\u3082\u3066": 46,
        "\u4e0b\u624b:\u3078\u305f": 47,
        "\u4e0b\u624b:\u3079\u305f": 48,
        "\u4e0b\u91ce:\u3052\u3084": 49,
        "\u4e0b\u91ce:\u3057\u3082\u3064\u3051": 50,
        "\u4e0b\u91ce:\u3057\u3082\u306e": 51,
        "\u4e16\u8ad6:\u305b\u308d\u3093": 52,
        "\u4e16\u8ad6:\u3088\u308d\u3093": 53,
        "\u4e2d\u9593:\u3061\u3085\u3046\u304b\u3093": 54,
        "\u4e2d\u9593:\u3061\u3085\u3046\u3052\u3093": 55,
        "\u4e8c\u4eba:\u306b\u306b\u3093": 56,
        "\u4e8c\u4eba:\u3075\u305f\u308a": 57,
        "\u4e8c\u91cd:\u306b\u3058\u3085\u3046": 58,
        "\u4e8c\u91cd:\u3075\u305f\u3048": 59,
        "\u4eba\u6c17:\u3058\u3093\u304d": 60,
        "\u4eba\u6c17:\u306b\u3093\u304d": 61,
        "\u4eba\u6c17:\u3072\u3068\u3051": 62,
        "\u4eca\u65e5:\u304d\u3087\u3046": 63,
        "\u4eca\u65e5:\u3053\u3093\u306b\u3061": 64,
        "\u4eee\u540d:\u304b\u306a": 65,
        "\u4eee\u540d:\u304b\u3081\u3044": 66,
        "\u4eee\u540d:\u304c\u306a": 67,
        "\u4f55\u5206:\u306a\u306b\u3076\u3093": 68,
        "\u4f55\u5206:\u306a\u3093\u3077\u3093": 69,
        "\u4f55\u6642:\u3044\u3064": 70,
        "\u4f55\u6642:\u306a\u3093\u3058": 71,
        "\u4f55\u6642:\u306a\u3093\u3069\u304d": 72,
        "\u4f5c\u6cd5:\u3055\u304f\u307b\u3046": 73,
        "\u4f5c\u6cd5:\u3055\u307b\u3046": 74,
        "\u4fc2:\u304b\u304b": 75,
        "\u4fc2:\u304b\u304b\u308a": 76,
        "\u4fc2:\u304b\u304b\u308f": 77,
        "\u4fc2:\u304c\u304b\u308a": 78,
        "\u501f\u5bb6:\u3057\u3083\u304f\u3084": 79,
        "\u501f\u5bb6:\u3057\u3083\u3063\u304b": 80,
        "\u516c\u6587:\u304f\u3082\u3093": 81,
        "\u516c\u6587:\u3053\u3046\u3076\u3093": 82,
        "\u51b7\u6c34:\u3072\u3084\u307f\u305a": 83,
        "\u51b7\u6c34:\u308c\u3044\u3059\u3044": 84,
        "\u5206\u5225:\u3075\u3093\u3079\u3064": 85,
        "\u5206\u5225:\u3076\u3093\u3079\u3064": 86,
        "\u5229\u76ca:\u308a\u3048\u304d": 87,
        "\u5229\u76ca:\u308a\u3084\u304f": 88,
        "\u5341\u516b\u756a:\u304a\u306f\u3053": 89,
        "\u5341\u516b\u756a:\u3058\u3085\u3046\u306f\u3061\u3070\u3093": 90,
        "\u534a\u6708:\u306f\u3093\u3052\u3064": 91,
        "\u534a\u6708:\u306f\u3093\u3064\u304d": 92,
        "\u535a\u58eb:\u306f\u304b\u305b": 93,
        "\u535a\u58eb:\u306f\u304f\u3057": 94,
        "\u53e3\u8154:\u3053\u3046\u304f\u3046": 95,
        "\u53e3\u8154:\u3053\u3046\u3053\u3046": 96,
        "\u53e4\u672c:\u3053\u307b\u3093": 97,
        "\u53e4\u672c:\u3075\u308b\u307b\u3093": 98,
        "\u56fd\u7acb:\u304f\u306b\u305f\u3061": 99,
        "\u56fd\u7acb:\u3053\u304f\u308a\u3064": 100,
        "\u5893\u77f3:\u306f\u304b\u3044\u3057": 101,
        "\u5893\u77f3:\u307c\u305b\u304d": 102,
        "\u5909\u5316:\u3078\u3093\u304b": 103,
        "\u5909\u5316:\u3078\u3093\u3052": 104,
        "\u591c\u4e2d:\u3084\u3061\u3085\u3046": 105,
        "\u591c\u4e2d:\u3088\u306a\u304b": 106,
        "\u591c\u8a71:\u3084\u308f": 107,
        "\u591c\u8a71:\u3088\u3070\u306a\u3057": 108,
        "\u5927\u4e8b:\u304a\u304a\u3054\u3068": 109,
        "\u5927\u4e8b:\u3060\u3044\u3058": 110,
        "\u5927\u4eba:\u3046\u3057": 111,
        "\u5927\u4eba:\u304a\u3068\u306a": 112,
        "\u5927\u4eba:\u305f\u3044\u3058\u3093": 113,
        "\u5927\u5206:\u304a\u304a\u3044\u305f": 114,
        "\u5927\u5206:\u3060\u3044\u3076": 115,
        "\u5927\u5206:\u3060\u3044\u3076\u3093": 116,
        "\u5927\u52e2:\u304a\u304a\u305c\u3044": 117,
        "\u5927\u52e2:\u305f\u3044\u305b\u3044": 118,
        "\u5927\u5bb6:\u304a\u304a\u3084": 119,
        "\u5927\u5bb6:\u305f\u3044\u304b": 120,
        "\u5927\u5bb6:\u305f\u3044\u3051": 121,
        "\u5927\u624b:\u304a\u304a\u3066": 122,
        "\u5927\u624b:\u304a\u304a\u3067": 123,
        "\u5927\u6587\u5b57:\u304a\u304a\u3082\u3058": 124,
        "\u5927\u6587\u5b57:\u3060\u3044\u3082\u3093\u3058": 125,
        "\u592b\u5a66:\u3075\u3046\u3075": 126,
        "\u592b\u5a66:\u3081\u304a\u3068": 127,
        "\u5bd2\u6c17:\u304b\u3093\u304d": 128,
        "\u5bd2\u6c17:\u3055\u3080\u3051": 129,
        "\u5c71\u9670:\u3055\u3093\u3044\u3093": 130,
        "\u5c71\u9670:\u3084\u307e\u304b\u3052": 131,
        "\u5e02\u5834:\u3044\u3061\u3070": 132,
        "\u5e02\u5834:\u3057\u3058\u3087\u3046": 133,
        "\u5f37\u529b:\u304d\u3087\u3046\u308a\u3087\u304f": 134,
        "\u5f37\u529b:\u3054\u3046\u308a\u304d": 135,
        "\u5fc3\u4e2d:\u3057\u3093\u3058\u3085\u3046": 136,
        "\u5fc3\u4e2d:\u3057\u3093\u3061\u3085\u3046": 137,
        "\u5fc3\u4e2d:\u3057\u3093\u3062\u3085\u3046": 138,
        "\u6545\u90f7:\u304f\u306b": 139,
        "\u6545\u90f7:\u3053\u304d\u3087\u3046": 140,
        "\u6545\u90f7:\u3075\u308b\u3055\u3068": 141,
        "\u6587\u5b57:\u3082\u3058": 142,
        "\u6587\u5b57:\u3082\u3093\u3058": 143,
        "\u6587\u66f8:\u3076\u3093\u3057\u3087": 144,
        "\u6587\u66f8:\u3082\u3093\u3057\u3087": 145,
        "\u6587\u66f8:\u3082\u3093\u3058\u3087": 146,
        "\u65bd\u5de5:\u3057\u3053\u3046": 147,
        "\u65bd\u5de5:\u305b\u3053": 148,
        "\u65bd\u5de5:\u305b\u3053\u3046": 149,
        "\u65bd\u696d:\u3057\u304e\u3087\u3046": 150,
        "\u65bd\u696d:\u305b\u304e\u3087\u3046": 151,
        "\u65bd\u884c:\u3057\u3053\u3046": 152,
        "\u65bd\u884c:\u305b\u3053\u3046": 153,
        "\u65e5\u4e2d:\u306b\u3061\u3058\u3085\u3046": 154,
        "\u65e5\u4e2d:\u306b\u3063\u3061\u3085\u3046": 155,
        "\u65e5\u5411:\u3072\u306a\u305f": 156,
        "\u65e5\u5411:\u3072\u3085\u3046\u304c": 157,
        "\u65e5\u66ae:\u3072\u3050": 158,
        "\u65e5\u66ae:\u3072\u3050\u3089\u3057": 159,
        "\u65e5\u66ae:\u3072\u3050\u308c": 160,
        "\u660e\u5f8c\u65e5:\u3042\u3055\u3063\u3066": 161,
        "\u660e\u5f8c\u65e5:\u307f\u3087\u3046\u3054\u306b\u3061": 162,
        "\u660e\u65e5:\u3042\u3057\u305f": 163,
        "\u660e\u65e5:\u3042\u3059": 164,
        "\u660e\u65e5:\u307f\u3087\u3046\u306b\u3061": 165,
        "\u6628\u65e5:\u304d\u306e\u3046": 166,
        "\u6628\u65e5:\u3055\u304f\u3058\u3064": 167,
        "\u6700\u4e2d:\u3055\u3044\u3061\u3085\u3046": 168,
        "\u6700\u4e2d:\u3055\u306a\u304b": 169,
        "\u672b\u671f:\u307e\u3063\u304d": 170,
        "\u672b\u671f:\u307e\u3064\u3054": 171,
        "\u6839\u672c:\u3053\u3093\u307d\u3093": 172,
        "\u6839\u672c:\u306d\u3082\u3068": 173,
        "\u6885\u96e8:\u3064\u3086": 174,
        "\u6885\u96e8:\u3070\u3044\u3046": 175,
        "\u6c17\u8cea:\u304b\u305f\u304e": 176,
        "\u6c17\u8cea:\u304d\u3057\u3064": 177,
        "\u6c17\u9aa8:\u304d\u3053\u3064": 178,
        "\u6c17\u9aa8:\u304d\u307c\u306d": 179,
        "\u6c34\u9762:\u3059\u3044\u3081\u3093": 180,
        "\u6c34\u9762:\u307f\u306a\u3082": 181,
        "\u6c42\u9053:\u304d\u3085\u3046\u3069\u3046": 182,
        "\u6c42\u9053:\u3050\u3069\u3046": 183,
        "\u6cd5\u8863:\u3053\u308d\u3082": 184,
        "\u6cd5\u8863:\u307b\u3046\u3048": 185,
        "\u6e05\u6c34:\u304d\u3088\u307f\u305a": 186,
        "\u6e05\u6c34:\u3057\u307f\u305a": 187,
        "\u6e05\u6d44:\u3057\u3087\u3046\u3058\u3087\u3046": 188,
        "\u6e05\u6d44:\u305b\u3044\u3058\u3087\u3046": 189,
        "\u6f22\u66f8:\u304b\u3089\u3076\u307f": 190,
        "\u6f22\u66f8:\u304b\u3093\u3057\u3087": 191,
        "\u6f22\u66f8:\u304b\u3093\u3058\u3087": 192,
        "\u7267\u5834:\u307c\u304f\u3058\u3087\u3046": 193,
        "\u7267\u5834:\u307e\u304d\u3070": 194,
        "\u73a9\u5177:\u304a\u3082\u3061\u3083": 195,
        "\u73a9\u5177:\u304c\u3093\u3050": 196,
        "\u73fe\u4e16:\u3052\u3093\u305b": 197,
        "\u73fe\u4e16:\u3052\u3093\u305b\u3044": 198,
        "\u751f\u7269:\u3044\u304d\u3082\u306e": 199,
        "\u751f\u7269:\u305b\u3044\u3076\u3064": 200,
        "\u751f\u82b1:\u3044\u3051\u3070\u306a": 201,
        "\u751f\u82b1:\u305b\u3044\u304b": 202,
        "\u753a\u5bb6:\u3061\u3087\u3046\u304b": 203,
        "\u753a\u5bb6:\u307e\u3061\u3084": 204,
        "\u75be\u98a8:\u304b\u305c": 205,
        "\u75be\u98a8:\u3057\u3063\u3077\u3046": 206,
        "\u75be\u98a8:\u306f\u3084\u3066": 207,
        "\u767d\u9aea:\u3057\u3089\u304c": 208,
        "\u767d\u9aea:\u306f\u304f\u306f\u3064": 209,
        "\u76f8\u4e57:\u3042\u3044\u306e": 210,
        "\u76f8\u4e57:\u305d\u3046\u3058\u3087\u3046": 211,
        "\u773c\u93e1:\u304c\u3093\u304d\u3087\u3046": 212,
        "\u773c\u93e1:\u3081\u304c\u306d": 213,
        "\u77f3\u7dbf:\u3044\u3057\u308f\u305f": 214,
        "\u77f3\u7dbf:\u305b\u304d\u3081\u3093": 215,
        "\u793c\u62dd:\u3089\u3044\u306f\u3044": 216,
        "\u793c\u62dd:\u308c\u3044\u306f\u3044": 217,
        "\u7af6\u58f2:\u304d\u3087\u3046\u3070\u3044": 218,
        "\u7af6\u58f2:\u3051\u3044\u3070\u3044": 219,
        "\u7c73:\u3053\u3081": 220,
        "\u7c73:\u3054\u3081": 221,
        "\u7c73:\u3079\u3044": 222,
        "\u7c73:\u307e\u3044": 223,
        "\u7c73:\u3081\u30fc\u3068\u308b": 224,
        "\u7c73:\u3088\u306d": 225,
        "\u7d05\u8449:\u3053\u3046\u3088\u3046": 226,
        "\u7d05\u8449:\u3082\u307f\u3058": 227,
        "\u7d30\u3005:\u3053\u307e\u3054\u307e": 228,
        "\u7d30\u3005:\u307b\u305d\u307c\u305d": 229,
        "\u7d30\u76ee:\u3055\u3044\u3082\u304f": 230,
        "\u7d30\u76ee:\u307b\u305d\u3081": 231,
        "\u7d4c\u7def:\u3044\u304d\u3055\u3064": 232,
        "\u7d4c\u7def:\u3051\u3044\u3044": 233,
        "\u7fe1\u7fe0:\u304b\u308f\u305b\u307f": 234,
        "\u7fe1\u7fe0:\u3072\u3059\u3044": 235,
        "\u80cc\u7b4b:\u305b\u3059\u3058": 236,
        "\u80cc\u7b4b:\u306f\u3044\u304d\u3093": 237,
        "\u8239\u5e95:\u305b\u3093\u3066\u3044": 238,
        "\u8239\u5e95:\u3075\u306a\u305e\u3053": 239,
        "\u82b1\u5f01:\u304b\u3079\u3093": 240,
        "\u82b1\u5f01:\u306f\u306a\u3073\u3089": 241,
        "\u83d6\u84b2:\u3042\u3084\u3081": 242,
        "\u83d6\u84b2:\u3057\u3087\u3046\u3076": 243,
        "\u8868:\u3042\u3089": 244,
        "\u8868:\u3042\u3089\u308f": 245,
        "\u8868:\u304a\u3082\u3066": 246,
        "\u8868:\u3072\u3087\u3046": 247,
        "\u898b\u7269:\u3051\u3093\u3076\u3064": 248,
        "\u898b\u7269:\u307f\u3082\u306e": 249,
        "\u89d2:\u304b\u304f": 250,
        "\u89d2:\u304b\u3069": 251,
        "\u89d2:\u3059\u307f": 252,
        "\u89d2:\u3064\u306e": 253,
        "\u8aad\u672c:\u3068\u304f\u307b\u3093": 254,
        "\u8aad\u672c:\u3069\u304f\u307b\u3093": 255,
        "\u8aad\u672c:\u3088\u307f\u307b\u3093": 256,
        "\u8c37\u9593:\u305f\u306b\u3042\u3044": 257,
        "\u8c37\u9593:\u305f\u306b\u307e": 258,
        "\u8db3\u8de1:\u3042\u3057\u3042\u3068": 259,
        "\u8db3\u8de1:\u305d\u304f\u305b\u304d": 260,
        "\u8eab\u4f53:\u304b\u3089\u3060": 261,
        "\u8eab\u4f53:\u3057\u3093\u305f\u3044": 262,
        "\u8ee2\u751f:\u3066\u3093\u3057\u3087\u3046": 263,
        "\u8ee2\u751f:\u3066\u3093\u305b\u3044": 264,
        "\u8ffd\u5f93:\u3064\u3044\u3057\u3087\u3046": 265,
        "\u8ffd\u5f93:\u3064\u3044\u3058\u3085\u3046": 266,
        "\u9006\u624b:\u304e\u3083\u304f\u3066": 267,
        "\u9006\u624b:\u3055\u304b\u3066": 268,
        "\u9020\u4f5c:\u305e\u3046\u3055": 269,
        "\u9020\u4f5c:\u305e\u3046\u3055\u304f": 270,
        "\u9023\u4e2d:\u308c\u3093\u3058\u3085\u3046": 271,
        "\u9023\u4e2d:\u308c\u3093\u3061\u3085\u3046": 272,
        "\u907a\u8a00:\u3044\u3052\u3093": 273,
        "\u907a\u8a00:\u3044\u3054\u3093": 274,
        "\u907a\u8a00:\u3086\u3044\u3054\u3093": 275,
        "\u91ce\u514e:\u306e\u3046\u3055\u304e": 276,
        "\u91ce\u514e:\u3084\u3068": 277,
        "\u91d1\u8272:\u304d\u3093\u3044\u308d": 278,
        "\u91d1\u8272:\u3053\u3093\u3058\u304d": 279,
        "\u9280\u674f:\u3044\u3061\u3087\u3046": 280,
        "\u9280\u674f:\u304e\u3093\u306a\u3093": 281,
        "\u958b\u773c:\u304b\u3044\u304c\u3093": 282,
        "\u958b\u773c:\u304b\u3044\u3052\u3093": 283,
        "\u982d\u6570:\u3042\u305f\u307e\u304b\u305a": 284,
        "\u982d\u6570:\u3068\u3046\u3059\u3046": 285,
        "\u982d\u84cb\u9aa8:\u305a\u304c\u3044\u3053\u3064": 286,
        "\u982d\u84cb\u9aa8:\u3068\u3046\u304c\u3044\u3053\u3064": 287,
        "\u98a8\u7a74:\u304b\u3056\u3042\u306a": 288,
        "\u98a8\u7a74:\u3075\u3046\u3051\u3064": 289,
        "\u98a8\u8eca:\u304b\u3056\u3050\u308b\u307e": 290,
        "\u98a8\u8eca:\u3075\u3046\u3057\u3083": 291,
        "\u98db\u6cab:\u3057\u3076\u304d": 292,
        "\u98db\u6cab:\u3072\u307e\u3064": 293,
        "\u9aa8:\u3053\u3064": 294,
        "\u9aa8:\u307b\u306d": 295,
        "\u9b5a:\u3046\u304a": 296,
        "\u9b5a:\u304e\u3087": 297,
        "\u9b5a:\u3055\u304b\u306a": 298,
        "\u9b5a:\u3056\u304b\u306a": 299,
        "\u9ed2\u5b50:\u304f\u308d\u3053": 300,
        "\u9ed2\u5b50:\u307b\u304f\u308d": 301
    }
}
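The \uXXXX escapes above are plain JSON and decode to "surface:reading" class labels; a minimal sketch of reading the mapping back (assuming the snippet runs from the repository root):

import json
from pathlib import Path

# json.loads decodes the escapes: "\u4e00\u5206:\u3044\u3061\u3076" is "一分:いちぶ".
mapping = json.loads(Path("stores/dbert/label_encoder.json").read_text(encoding="utf-8"))["class_to_index"]
print(mapping["<OTHER>"])     # -> 0, the fallback class for unknown readings
print(mapping["一分:いちぶ"])  # -> 1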
stores/dbert/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da84a3d4be38191f4485086a8b4c7013a2ab33cf2c7d20df6c3fdfe0092041af
size 443657837
stores/dbert/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
{
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "pad_token": "[PAD]",
    "sep_token": "[SEP]",
    "unk_token": "[UNK]"
}
stores/dbert/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
{
    "cls_token": "[CLS]",
    "do_lower_case": false,
    "do_subword_tokenize": true,
    "do_word_tokenize": true,
    "jumanpp_kwargs": null,
    "mask_token": "[MASK]",
    "mecab_kwargs": {
        "mecab_dic": "unidic_lite"
    },
    "model_max_length": 1000000000000000019884624838656,
    "name_or_path": "cl-tohoku/bert-base-japanese-v2",
    "never_split": null,
    "pad_token": "[PAD]",
    "sep_token": "[SEP]",
    "special_tokens_map_file": null,
    "subword_tokenizer_type": "wordpiece",
    "sudachi_kwargs": null,
    "tokenizer_class": "BertJapaneseTokenizer",
    "unk_token": "[UNK]",
    "word_tokenizer_type": "mecab"
}
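A minimal sketch of consuming this config, the same BertJapaneseTokenizer load that dbert.py's load() performs (assumes transformers plus the fugashi and unidic-lite packages required by the mecab word tokenizer):

from transformers import BertJapaneseTokenizer

# MeCab word tokenization followed by wordpiece subword splitting, as configured above.
tokenizer = BertJapaneseTokenizer.from_pretrained("stores/dbert")
print(tokenizer.tokenize("梅雨の晴れ間"))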
stores/dbert/training_args.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aee0041b2ad4b019fea4db1f8aabd34b0081878cae2c17395657331db1adbb70
size 3579
stores/dbert/training_performance.json ADDED
The diff for this file is too large to render. See raw diff
 
stores/dbert/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
yomikata/__init__.py ADDED
File without changes
yomikata/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
yomikata/__pycache__/dbert.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
yomikata/__pycache__/dictionary.cpython-310.pyc ADDED
Binary file (4 kB). View file
 
yomikata/__pycache__/evaluate.cpython-310.pyc ADDED
Binary file (4.98 kB). View file
 
yomikata/__pycache__/main.cpython-310.pyc ADDED
Binary file (2.84 kB). View file
 
yomikata/__pycache__/reader.cpython-310.pyc ADDED
Binary file (774 Bytes). View file
 
yomikata/__pycache__/t5.cpython-310.pyc ADDED
Binary file (5.18 kB). View file
 
yomikata/__pycache__/utils.cpython-310.pyc ADDED
Binary file (12.1 kB). View file
 
yomikata/dataset/__init__.py ADDED
File without changes
yomikata/dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes). View file
 
yomikata/dataset/__pycache__/aozora.cpython-310.pyc ADDED
Binary file (2.99 kB). View file
 
yomikata/dataset/__pycache__/bccwj.cpython-310.pyc ADDED
Binary file (5.31 kB). View file
 
yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc ADDED
Binary file (2.47 kB). View file
 
yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc ADDED
Binary file (1.3 kB). View file
 
yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc ADDED
Binary file (1.44 kB). View file
 
yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc ADDED
Binary file (2.13 kB). View file
 
yomikata/dataset/__pycache__/split.cpython-310.pyc ADDED
Binary file (8.08 kB). View file
 
yomikata/dataset/__pycache__/sudachi.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
yomikata/dataset/__pycache__/unidic.cpython-310.pyc ADDED
Binary file (1.27 kB). View file
 
yomikata/dataset/aozora.py ADDED
@@ -0,0 +1,117 @@
"""aozora.py
Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""

import warnings
from pathlib import Path

import pandas as pd
from pandas.errors import ParserError
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels

warnings.filterwarnings("ignore")


def read_file(file: Path):
    # logger.info("reading file")
    with open(file) as f:
        rows = [
            line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f.readlines()
        ]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

    # logger.info("removing unused rows")
    # remove unused rows
    df = df[~df["type"].isin(["[入力 読み]", "分かち書き"])]
    df = df[~pd.isna(df["word"])]
    df = df[~pd.isnull(df["word"])]
    df = df[df["word"] != ""]

    # logger.info("organizing into sentences")
    # now organize remaining rows into sentences
    gyou_df = pd.DataFrame(columns=["sentence", "furigana", "sentenceid"])
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type in ["[入力文]"]:
            sentence = row.word
        elif row.type in ["漢字"]:
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] in ["行番号"]:
            if sentenceid:  # this handles the first row
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            furigana += row.word

    # last row handling
    gyous.append([sentence, furigana, sentenceid])

    # make dataframe
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # logger.info("cleaning rows")
    # clean rows
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # logger.info("removing errors")
    # remove non-matching rows
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # remove known errors
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # remove duplicates
    gyou_df = gyou_df.drop_duplicates()

    return gyou_df


def aozora_data():
    """Extract, load and transform the aozora data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            continue  # skip this file instead of re-appending the previous one

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved all aozora data!")


if __name__ == "__main__":
    aozora_data()
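For reference, the furigana strings that read_file() assembles use speach's ruby code format, in which each annotated word becomes {word/reading}; a minimal sketch:

from speach import ttlig

# from_furi pairs a surface form with its reading; to_code emits {surface/reading}.
print(ttlig.RubyToken.from_furi("頭蓋骨", "ずがいこつ").to_code())  # -> {頭蓋骨/ずがいこつ}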
yomikata/dataset/bccwj.py ADDED
@@ -0,0 +1,206 @@
"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/

Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない

ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用 (check both the 語彙素読み and 発音形出現形語種 boxes)
(発音形出現形 is the actual pronounced form, but it displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""

import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"


def read_bccwj_file(filename: str):
    """Parse one Chuunagon export file into sentence/furigana rows."""

    df = pd.read_csv(filename, sep="\t")

    df["前文脈"] = df["前文脈"].fillna("")
    df["後文脈"] = df["後文脈"].fillna("")
    df["full_text"] = (
        df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
    )

    def get_sentences(row):
        sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
        furigana_sentences = []
        for sentence in sentences:
            words_with_readings = sentence.split(WORD_SPLIT_CHAR)
            furigana_sentence = ""
            for word_with_reading in words_with_readings:
                word = word_with_reading.split("[")[0]
                form, reading = jaconv.kata2hira(
                    word_with_reading.split("[")[1].split("]")[0]
                ).split(READING_SEP_CHAR)

                if (
                    not utils.has_kanji(word)
                    or reading == jaconv.kata2hira(word)
                    or form == ""
                    or reading == ""
                ):
                    furigana_sentence += word
                else:
                    if ("ー" in reading) and ("ー" not in form):
                        indexes_of_dash = [
                            pos for pos, char in enumerate(reading) if char == "ー"
                        ]
                        for index_of_dash in indexes_of_dash:
                            if len(reading) == len(form):
                                dash_reading = form[index_of_dash]
                            else:
                                char_before_dash = reading[index_of_dash - 1]
                                if char_before_dash in "ねめせぜれてでけげへべぺ":
                                    digraphA = char_before_dash + "え"
                                    digraphB = char_before_dash + "い"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "え"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "い"
                                    else:
                                        logger.warning(
                                            f"Leaving dash in {word} {form} {reading}"
                                        )
                                        dash_reading = "ー"
                                elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
                                    dash_reading = "う"
                                elif char_before_dash in "しじみいきぎひびち":
                                    dash_reading = "い"
                                elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
                                    digraphA = char_before_dash + "お"
                                    digraphB = char_before_dash + "う"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "お"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "う"
                                    else:
                                        if digraphA in word and digraphB not in word:
                                            dash_reading = "お"
                                        elif digraphB in word and digraphA not in word:
                                            dash_reading = "う"
                                        else:
                                            logger.warning(
                                                f"Leaving dash in {word} {form} {reading}"
                                            )
                                            dash_reading = "ー"
                                else:
                                    logger.warning(
                                        f"Leaving dash in {word} {form} {reading}"
                                    )
                                    dash_reading = "ー"
                            reading = (
                                reading[:index_of_dash]
                                + dash_reading
                                + reading[index_of_dash + 1 :]
                            )
                    furigana_sentence += RubyToken.from_furi(word, reading).to_code()

            furigana_sentences.append(furigana_sentence)

        furigana_sentences = [
            utils.standardize_text(sentence) for sentence in furigana_sentences
        ]
        sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
        try:
            rowid = row["サンプル ID"]
        except KeyError:
            rowid = row["講演 ID"]
        if len(furigana_sentences) == 1:
            ids = [rowid]
        else:
            ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]

        sub_df = pd.DataFrame(
            {"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
        )

        sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]

        return sub_df

    output_df = pd.DataFrame()
    for i, row in df.iterrows():
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        output_df = pd.concat([output_df, get_sentences(row)])

    return output_df


def bccwj_data():
    """Extract, load and transform the bccwj data"""

    # Extract sentences from the data files
    bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))

    df = pd.DataFrame()

    for bccwj_file in bccwj_files:
        logger.info(bccwj_file.name)
        df = pd.concat([df, read_bccwj_file(bccwj_file)])

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df[df["sentence"] != ""]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)

    logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
    """Extract, load and transform a subset of the bccwj data"""

    df = read_bccwj_file(bccwj_file)

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Output
    df.to_csv(
        Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
        index=False,
    )

    logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
    bccwj_data()
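A minimal sketch of why the dash handling above is needed: the 発音形出現形 column is katakana with long-vowel bars, and jaconv.kata2hira keeps the bars, so read_bccwj_file() has to rewrite each ー from the 語彙素読み form:

import jaconv

# よう れい is exported as ヨー レー; converting to hiragana preserves the ー.
print(jaconv.kata2hira("ヨーレー"))  # -> よーれー, not ようれい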
yomikata/dataset/kwdlc.py ADDED
@@ -0,0 +1,109 @@
"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""

import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")


def read_knp_file(filename: str):
    with open(filename) as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            pass
        elif first_word == "#":
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas


def kwdlc_data():
    """Extract, load and transform the kwdlc data"""

    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(zip(all_sentences, all_furiganas, all_ids)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # remove known errors
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    # Test
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)

    logger.info("✅ Saved kwdlc data!")


if __name__ == "__main__":
    kwdlc_data()
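A minimal sketch of the invariant that read_knp_file() (and the other dataset scripts) enforce before keeping a row: stripping the {surface/reading} annotations must recover the plain sentence:

from yomikata import utils

furigana = "{梅雨/つゆ}の晴れ間"
assert utils.remove_furigana(furigana) == "梅雨の晴れ間"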
yomikata/dataset/ndlbib.py ADDED
@@ -0,0 +1,46 @@
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""

import warnings
from pathlib import Path

from pandas.errors import ParserError

from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file

# ndlbib and aozora use the same file structure

warnings.filterwarnings("ignore")


def ndlbib_data():
    """Extract, load and transform the ndlbib data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            continue  # skip this file instead of re-appending the previous one

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")


if __name__ == "__main__":
    ndlbib_data()
yomikata/dataset/pronunciations.py ADDED
@@ -0,0 +1,57 @@
from pathlib import Path

import jaconv
import pandas as pd
from tqdm import tqdm

from config import config
from config.config import logger
from yomikata import utils


def pronunciation_data():

    data_files = list(Path(config.READING_DATA_DIR).glob("*.csv"))

    df = pd.DataFrame()

    for file in data_files:
        if (file.name == "all.csv") or (file.name == "ambiguous.csv"):
            continue
        output_df = pd.read_csv(file)
        df = pd.concat([df, output_df])

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()

    tqdm.pandas()

    df["kana"] = df["kana"].progress_apply(utils.standardize_text)
    df["surface"] = df["surface"].progress_apply(utils.standardize_text)
    df["kana"] = df.progress_apply(lambda row: jaconv.kata2hira(row["kana"]), axis=1)
    df = df[df["surface"] != df["kana"]]
    df = df[df["kana"] != ""]

    df = df[df["surface"].progress_apply(utils.has_kanji)]

    # drop surfaces containing stray symbols (〜, parentheses, *, .)
    df = df.loc[~df["surface"].str.contains(r"[〜〜()\)\(\*\.]")]

    df = df[["surface", "kana"]]
    df = df.drop_duplicates()

    df.to_csv(Path(config.READING_DATA_DIR, "all.csv"), index=False)

    logger.info("✅ Merged all the pronunciation data!")

    # merged_df = (
    #     df.groupby("surface")["kana"]
    #     .apply(list)
    #     .reset_index(name="pronunciations")
    # )

    # ambiguous_df = merged_df[merged_df["pronunciations"].apply(len) > 1]
    # ambiguous_df.to_csv(Path(config.READING_DATA_DIR, "ambiguous.csv"), index=False)


if __name__ == "__main__":
    pronunciation_data()
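A minimal sketch of the grouping hinted at in the commented-out block above: collapsing the surface/kana table to one row per surface exposes the ambiguous entries:

import pandas as pd

df = pd.DataFrame(
    {"surface": ["寒気", "寒気", "背筋"], "kana": ["かんき", "さむけ", "せすじ"]}
)
merged = df.groupby("surface")["kana"].apply(list).reset_index(name="pronunciations")
print(merged[merged["pronunciations"].apply(len) > 1])  # 寒気 -> ['かんき', 'さむけ']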
yomikata/dataset/repair_long_vowels.py ADDED
@@ -0,0 +1,62 @@
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

pronunciation_df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
pronunciation_df = pronunciation_df.groupby("surface")["kana"].apply(list)


def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Replace long-vowel marks (ー) in a kana reading with explicit vowels.

    Unambiguous marks are resolved from the preceding kana; ambiguous marks are
    resolved, when possible, by matching against known pronunciations of the kanji.

    Args:
        kana (str): reading possibly containing ー
        kanji (str): surface form the reading belongs to, optional

    Returns:
        str: the repaired reading
    """

    reading = kana
    indices_of_dash = [pos for pos, char in enumerate(reading) if char == "ー"]

    # get rid of non-ambiguous dashes
    for index_of_dash in indices_of_dash:
        char_before_dash = reading[index_of_dash - 1]
        if char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
            reading = reading[:index_of_dash] + "う" + reading[index_of_dash + 1 :]
        elif char_before_dash in "しじみいきぎひびちぢぃ":
            reading = reading[:index_of_dash] + "い" + reading[index_of_dash + 1 :]

    indices_of_not_dash = [pos for pos, char in enumerate(reading) if char != "ー"]
    if len(indices_of_not_dash) != len(reading):
        if not kanji:
            logger.info("Disambiguating this dash requires kanji")
            logger.info(f"Left dash in {reading}")
        else:
            try:
                candidate_pronunciations = list(pronunciation_df[kanji])
            except KeyError:
                candidate_pronunciations = []

            candidate_pronunciations = list(set(candidate_pronunciations))

            candidate_pronunciations = [
                x for x in candidate_pronunciations if len(x) == len(reading)
            ]
            candidate_pronunciations = [
                x
                for x in candidate_pronunciations
                if all([x[i] == reading[i] for i in indices_of_not_dash])
            ]

            if len(candidate_pronunciations) == 1:
                reading = candidate_pronunciations[0]
            else:
                pass
                # logger.warning(f"Left dashes in {kanji} {reading}")

    return reading
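A minimal usage sketch (note that the module reads all.csv at import time, so the pronunciation data must already have been generated):

from yomikata.dataset.repair_long_vowels import repair_long_vowels

# A dash after ゅ is unambiguous and becomes う without consulting the kanji.
print(repair_long_vowels("きゅーり"))  # -> きゅうり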
yomikata/dataset/split.py ADDED
@@ -0,0 +1,271 @@
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from speach.ttlig import RubyFrag, RubyToken

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dictionary import Dictionary


def train_val_test_split(X, y, train_size, val_size, test_size):
    """Split dataset into data splits."""
    assert (train_size + val_size + test_size) == 1
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=val_size / (test_size + val_size)
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def filter_simple(input_file, output_file, heteronyms) -> None:
    """Filter out sentences which don't contain any heteronyms"""

    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df = df[df["sentence"].str.contains(r"|".join(heteronyms))]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def filter_dictionary(input_file, output_file, heteronyms, dictionary) -> None:
    """Filter out sentences which contain heteronyms only as part of a compound known to the dictionary"""
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df["contains_heteronym"] = df["sentence"].apply(
        lambda s: not set(
            [dictionary.token_to_surface(m) for m in dictionary.tagger(s)]
        ).isdisjoint(heteronyms)
    )

    df = df[df["contains_heteronym"]]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def regroup_furigana(s, heteronym, heteronym_dict, dictionary, verbose=False):
    rubytokens = utils.parse_furigana(s)
    output_tokens = []
    for token in rubytokens.groups:
        if isinstance(token, RubyFrag):
            # this is a token with furigana
            if heteronym in token.text and token.text != heteronym:
                # it includes the heteronym but is not exactly the heteronym, so we
                # try to regroup it. A dictionary check
                # (len(dictionary.tagger(token.text)) > 1) used to guard this branch
                # but was removed: the check is not foolproof (a word found in the
                # dictionary here can still be split in two in a parse of the whole
                # sentence), and even a dictionary word goes through training, so it
                # is worth regrouping to keep it from becoming an <OTHER>.
                viable_regroupings = []
                for reading in heteronym_dict[heteronym]:
                    regrouped_tokens = regroup_furigana_tokens(
                        [token], heteronym, reading, verbose=verbose
                    )
                    if regrouped_tokens != [token]:
                        if verbose:
                            print("viable regrouping found")
                        viable_regroupings.append(regrouped_tokens)
                if len(viable_regroupings) == 1:
                    output_tokens += viable_regroupings[0]
                    continue
                else:
                    if verbose:
                        print("multiple viable readings found, cannot regroup")
                    pass
        output_tokens.append(token)

    output_string = RubyToken(groups=output_tokens).to_code()
    assert utils.furigana_to_kana(output_string) == utils.furigana_to_kana(s)
    assert utils.remove_furigana(output_string) == utils.remove_furigana(s)
    return output_string


def regroup_furigana_tokens(ruby_tokens, heteronym, reading, verbose=False):
    if not len(ruby_tokens) == 1:
        raise ValueError("regroup failed, no support yet for token merging")
    ruby_token = ruby_tokens[0]
    text = ruby_token.text
    furi = ruby_token.furi
    try:
        split_text = [
            text[0 : text.index(heteronym)],
            heteronym,
            text[text.index(heteronym) + len(heteronym) :],
        ]
        split_text = [text for text in split_text if text != ""]
    except ValueError:
        if verbose:
            print("regroup failed, heteronym not in token text")
        return ruby_tokens

    try:
        split_furi = [
            furi[0 : furi.index(reading)],
            reading,
            furi[furi.index(reading) + len(reading) :],
        ]
        split_furi = [furi for furi in split_furi if furi != ""]
    except ValueError:
        if verbose:
            print("regroup failed, reading not in token furi")
        return ruby_tokens

    if not len(split_text) == len(split_furi):
        if verbose:
            print(
                "regroup failed, failed to find heteronym and its reading in the same place in the inputs"
            )
        return ruby_tokens

    regrouped_tokens = [
        RubyFrag(text=split_text[i], furi=split_furi[i]) for i in range(len(split_text))
    ]

    if not "".join([token.furi for token in ruby_tokens]) == "".join(
        [token.furi for token in regrouped_tokens]
    ):
        if verbose:
            print(
                "regroup failed, reading of produced result does not agree with reading of input"
            )
        return ruby_tokens
    if not [token.furi for token in regrouped_tokens if token.text == heteronym] == [
        reading
    ]:
        if verbose:
            print("regroup failed, the heteronym did not get assigned the reading")
        return ruby_tokens
    return regrouped_tokens


def optimize_furigana(input_file, output_file, heteronym_dict, dictionary) -> None:
    df = pd.read_csv(input_file)  # load
    logger.info("Optimizing furigana using heteronym list and dictionary")
    for heteronym in heteronym_dict.keys():
        logger.info(f"Heteronym {heteronym} {heteronym_dict[heteronym]}")
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        rows_to_rearrange = df["sentence"].str.contains(heteronym)
        optimized_rows = df.loc[rows_to_rearrange, "furigana"].apply(
            lambda s: regroup_furigana(s, heteronym, heteronym_dict, dictionary)
        )
        n_rearranged = sum(df.loc[rows_to_rearrange, "furigana"] != optimized_rows)
        logger.info(f"{n_rearranged}/{n_with_het} sentences were optimized")
        df.loc[rows_to_rearrange, "furigana"] = optimized_rows
    df.to_csv(output_file, index=False)


def remove_other_readings(input_file, output_file, heteronym_dict):
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")
    df["keep_row"] = False
    for heteronym in heteronym_dict.keys():
        logger.info(heteronym)
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        keep_for_het = df["furigana"].str.contains(
            r"|".join(
                [f"{{{heteronym}/{reading}}}" for reading in heteronym_dict[heteronym]]
            )
        )
        df["keep_row"] = df["keep_row"] | keep_for_het
        logger.info(
            f"Dropped {n_with_het - sum(keep_for_het)}/{n_with_het} sentences whose heteronym carries an unexpected reading"
        )
    df = df.loc[df["keep_row"]]
    df = df.drop("keep_row", axis=1)
    df.to_csv(output_file, index=False)


def check_data(input_file) -> bool:

    df = pd.read_csv(input_file)  # load
    df["furigana-test"] = df["sentence"] == df["furigana"].apply(utils.remove_furigana)
    assert df["furigana-test"].all()
    df["sentence-standardize-test"] = df["sentence"] == df["sentence"].apply(
        utils.standardize_text
    )
    assert df["sentence-standardize-test"].all()

    return True


def split_data(data_file) -> None:

    df = pd.read_csv(data_file)  # load

    X = df["sentence"].values
    y = df["furigana"].values

    (X_train, X_val, X_test, y_train, y_val, y_test) = train_val_test_split(
        X=X,
        y=y,
        train_size=config.TRAIN_SIZE,
        val_size=config.VAL_SIZE,
        test_size=config.TEST_SIZE,
    )

    train_df = pd.DataFrame({"sentence": X_train, "furigana": y_train})
    val_df = pd.DataFrame({"sentence": X_val, "furigana": y_val})
    test_df = pd.DataFrame({"sentence": X_test, "furigana": y_test})

    train_df.to_csv(Path(config.TRAIN_DATA_DIR, "train_" + data_file.name), index=False)
    val_df.to_csv(Path(config.VAL_DATA_DIR, "val_" + data_file.name), index=False)
    test_df.to_csv(Path(config.TEST_DATA_DIR, "test_" + data_file.name), index=False)


if __name__ == "__main__":

    input_files = [
        Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
        Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"),
        Path(config.SENTENCE_DATA_DIR, "bccwj.csv"),
        Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
    ]

    logger.info("Merging sentence data")
    utils.merge_csvs(input_files, Path(config.SENTENCE_DATA_DIR, "all.csv"), n_header=1)

    logger.info("Rough filtering for sentences with heteronyms")
    filter_simple(
        Path(config.SENTENCE_DATA_DIR, "all.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        config.HETERONYMS.keys(),
    )

    logger.info("Sudachi dictionary filtering out heteronyms that only appear in known compounds")
    filter_dictionary(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        config.HETERONYMS.keys(),
        Dictionary("sudachi"),
    )

    logger.info("Optimizing furigana")
    optimize_furigana(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        config.HETERONYMS,
        Dictionary("sudachi"),
    )

    logger.info("Removing heteronyms with unexpected readings")
    remove_other_readings(
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"),
        config.HETERONYMS,
    )

    logger.info("Running checks on data")
    assert check_data(
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv")
    )

    logger.info("Performing train/test/split")
    split_data(Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"))

    logger.info("Data splits successfully generated!")
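A minimal sketch of the regrouping step at the heart of optimize_furigana: an over-wide ruby group is split so that the heteronym carries its own annotation:

from speach.ttlig import RubyFrag

from yomikata.dataset.split import regroup_furigana_tokens

# {風車小屋/ふうしゃごや} hides the heteronym; split it into {風車/ふうしゃ}{小屋/ごや}.
tokens = [RubyFrag(text="風車小屋", furi="ふうしゃごや")]
regrouped = regroup_furigana_tokens(tokens, "風車", "ふうしゃ")
print([(t.text, t.furi) for t in regrouped])  # -> [('風車', 'ふうしゃ'), ('小屋', 'ごや')]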
yomikata/dataset/sudachi.py ADDED
@@ -0,0 +1,50 @@
"""sudachi.py
Data processing script for sudachi dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def sudachi_data():

    sudachi_files = list(Path(config.RAW_DATA_DIR, "sudachi").glob("*.csv"))

    df = pd.DataFrame()

    for file in sudachi_files:
        logger.info(file.name)
        # Load file
        df = pd.concat(
            [
                df,
                pd.read_csv(
                    file,
                    header=None,
                ),
            ]
        )

    df["surface"] = df[0].astype(str).str.strip()
    df["kana"] = df[11].astype(str).str.strip()
    df["type"] = df[5].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[df["type"] != "補助記号"]

    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "sudachi.csv"), index=False)

    logger.info("✅ Processed sudachi data!")


if __name__ == "__main__":
    sudachi_data()
yomikata/dataset/unidic.py ADDED
@@ -0,0 +1,44 @@
"""unidic.py
Data processing script for unidic dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def unidic_data():
    """Extract, load and transform the unidic data"""

    # Extract sentences from the data files
    unidic_file = list(Path(config.RAW_DATA_DIR, "unidic").glob("*.csv"))[0]

    # Load file
    df = pd.read_csv(
        unidic_file,
        header=None,
        names="surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id".split(" "),
    )

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "unidic.csv"), index=False)

    logger.info("✅ Processed unidic data!")


if __name__ == "__main__":
    unidic_data()
yomikata/dbert.py ADDED
@@ -0,0 +1,414 @@
"""
dbert.py
Provides the dBert class that implements Reader using BERT contextual embeddings to disambiguate heteronyms.
"""

import logging
import os
from pathlib import Path

import numpy as np
import torch
from speach.ttlig import RubyFrag, RubyToken
from transformers import (
    AutoModelForTokenClassification,
    BertJapaneseTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from config import config
from config.config import logger
from yomikata import utils
from yomikata.reader import Reader
from yomikata.utils import LabelEncoder

logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("transformers.trainer").setLevel(logging.ERROR)
logging.getLogger("datasets").setLevel(logging.ERROR)


class dBert(Reader):
    def __init__(
        self,
        artifacts_dir: Path = Path(config.STORES_DIR, "dbert"),
        reinitialize: bool = False,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ) -> None:

        # Set the device
        self.device = device
        logger.info(f"Running on {self.device}")
        if self.device.type == "cuda":
            logger.info(torch.cuda.get_device_name(0))

        # Hardcoded parameters
        self.max_length = 128

        # Load the model
        self.artifacts_dir = artifacts_dir
        if reinitialize:

            # load tokenizer from upstream huggingface repository
            default_model = "cl-tohoku/bert-base-japanese-v2"
            self.tokenizer = BertJapaneseTokenizer.from_pretrained(default_model)
            logger.info(f"Using {default_model} tokenizer")

            # load the heteronyms list
            self.heteronyms = config.HETERONYMS

            # make the label encoder
            label_list = ["<OTHER>"]
            for i, heteronym in enumerate(self.heteronyms.keys()):
                for j, reading in enumerate(self.heteronyms[heteronym]):
                    label_list.append(heteronym + ":" + reading)

            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(label_list)

            logger.info("Made label encoder with default heteronyms")

            # add surface forms to tokenizer vocab
            surfaces = list(
                set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])
            )

            new_tokens = [
                surface
                for surface in surfaces
                if surface
                not in (list(self.tokenizer.vocab.keys()) + list(self.tokenizer.get_added_vocab()))
            ]

            self.tokenizer.add_tokens(new_tokens)
            if len(new_tokens) > 0:
                logger.info(f"Added {len(new_tokens)} surface forms to tokenizer vocab")

            # check that new tokens were added properly
            assert [
                self.tokenizer.decode(
                    self.tokenizer.encode(
                        [surface],
                        add_special_tokens=False,
                    )
                )
                for surface in surfaces
            ] == surfaces

            self.surfaceIDs = self.tokenizer.encode(
                list(set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])),
                add_special_tokens=False,
            )
            assert len(self.surfaceIDs) == len(surfaces)

            # Load model from upstream huggingface repository
            self.model = AutoModelForTokenClassification.from_pretrained(
                default_model, num_labels=len(self.label_encoder.classes)
            )
            self.model.resize_token_embeddings(len(self.tokenizer))
            logger.info(f"Using model {default_model}")

            self.save(artifacts_dir)
        else:
            self.load(artifacts_dir)

    def load(self, directory):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(directory)
        self.model = AutoModelForTokenClassification.from_pretrained(directory).to(self.device)
        self.label_encoder = LabelEncoder.load(Path(directory, "label_encoder.json"))
        self.heteronyms = utils.load_dict(Path(directory, "heteronyms.json"))

        self.surfaceIDs = self.tokenizer.encode(
            list(set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])),
            add_special_tokens=False,
        )
        logger.info(f"Loaded model from directory {directory}")

    def save(self, directory):
        self.tokenizer.save_pretrained(directory)
        self.model.save_pretrained(directory)
        self.label_encoder.save(Path(directory, "label_encoder.json"))
        utils.save_dict(self.heteronyms, Path(directory, "heteronyms.json"))
        logger.info(f"Saved model to directory {directory}")
135
+
136
+ def batch_preprocess_function(self, entries, pad=False):
137
+ inputs = [entry for entry in entries["sentence"]]
138
+ furiganas = [entry for entry in entries["furigana"]]
139
+ if pad:
140
+ tokenized_inputs = self.tokenizer(
141
+ inputs,
142
+ max_length=self.max_length,
143
+ truncation=True,
144
+ padding="max_length",
145
+ # return_tensors="np",
146
+ )
147
+ else:
148
+ tokenized_inputs = self.tokenizer(
149
+ inputs,
150
+ max_length=self.max_length,
151
+ truncation=True,
152
+ )
153
+
154
+         labels = []
+         for i, input_ids in enumerate(tokenized_inputs["input_ids"]):
+             furigana_temp = furiganas[i]
+             label_ids = []
+             assert inputs[i] == utils.remove_furigana(furiganas[i])
+             for input_id in input_ids:
+                 if input_id not in self.surfaceIDs:
+                     label = -100
+                 else:
+                     surface = self.tokenizer.decode([input_id])
+                     try:
+                         reading_start_idx = furigana_temp.index(surface) + len(surface)
+                         furigana_temp = furigana_temp[reading_start_idx + 1 :]
+                         reading_end_idx = furigana_temp.index("}")
+                         reading = furigana_temp[:reading_end_idx]
+                         furigana_temp = furigana_temp[reading_end_idx + 1 :]
+                         label = self.label_encoder.class_to_index[surface + ":" + reading]
+                     except KeyError:
+                         # reading not in the label set: fall back to class 0, "<OTHER>"
+                         label = 0
+                     except ValueError:
+                         # surface form not found in the furigana string, probably
+                         # because it was split across two words: fall back to "<OTHER>"
+                         label = 0
+                 label_ids.append(label)
+             assert len(label_ids) == len(input_ids)
+             labels.append(label_ids)
+
+         assert len(labels) == len(tokenized_inputs["input_ids"])
+
+         return {
+             "input_ids": tokenized_inputs["input_ids"],
+             "attention_mask": tokenized_inputs["attention_mask"],
+             "labels": labels,
+         }
+
+     def train(self, dataset, training_args={}) -> dict:
+         dataset = dataset.map(
+             self.batch_preprocess_function, batched=True, fn_kwargs={"pad": False}
+         )
+         # keep only sentences that contain at least one heteronym surface
+         dataset = dataset.filter(
+             lambda entry: any(x in entry["input_ids"] for x in list(self.surfaceIDs))
+         )
+
+         # put the model in training mode
+         self.model.train()
+
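+         # Defaults below are merged with any user-supplied overrides, e.g.
+         # reader.train(dataset, {"num_train_epochs": 3})  # illustrative call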
+         default_training_args = {
+             "output_dir": self.artifacts_dir,
+             "num_train_epochs": 10,
+             "evaluation_strategy": "steps",
+             "eval_steps": 10,
+             "logging_strategy": "steps",
+             "logging_steps": 10,
+             "save_strategy": "steps",
+             "save_steps": 10,
+             "learning_rate": 2e-5,
+             "per_device_train_batch_size": 128,
+             "per_device_eval_batch_size": 128,
+             "load_best_model_at_end": True,
+             "metric_for_best_model": "loss",
+             "weight_decay": 0.01,
+             "save_total_limit": 3,
+             "fp16": True,
+             "report_to": "tensorboard",
+         }
+
+         default_training_args.update(training_args)
+         training_args = default_training_args
+
+         # batch_preprocess_function does not pad, so let the collator pad each
+         # batch dynamically to its longest sequence
+         data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer, padding=True)
+
+         if "val" in list(dataset):
+             trainer = Trainer(
+                 model=self.model,
+                 args=TrainingArguments(**training_args),
+                 train_dataset=dataset["train"],
+                 eval_dataset=dataset["val"],
+                 tokenizer=self.tokenizer,
+                 callbacks=[
+                     EarlyStoppingCallback(early_stopping_patience=5),
+                 ],
+                 data_collator=data_collator,
+             )
+         else:
+             trainer = Trainer(
+                 model=self.model,
+                 args=TrainingArguments(**training_args),
+                 train_dataset=dataset["train"],
+                 tokenizer=self.tokenizer,
+                 data_collator=data_collator,
+             )
+
+         result = trainer.train()
+
+         # Output some training information
+         print(f"Time: {result.metrics['train_runtime']:.2f}")
+         print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
+         gpu_index = int(os.environ["CUDA_VISIBLE_DEVICES"])
+         utils.print_gpu_utilization(gpu_index)
+
+         # Get metrics for each train/val/test split
+         self.model.eval()
+         full_performance = {}
+         for key in dataset.keys():
+             max_evals = min(100000, len(dataset[key]))
+             logger.info(f"getting predictions for {key}")
+             subset = dataset[key].shuffle().select(range(max_evals))
+             prediction_output = trainer.predict(subset)
+             logger.info(f"processing predictions for {key}")
+             metrics = prediction_output[2]
+             labels = prediction_output[1]
+             predictions = np.argmax(prediction_output[0], axis=2)
+
+             true_inputs = [
+                 self.tokenizer.decode([input_id])
+                 for row in subset["input_ids"]
+                 for input_id in row
+                 if input_id in self.surfaceIDs
+             ]
+
+             true_predictions = [
+                 str(self.label_encoder.index_to_class[p])
+                 for prediction, label in zip(predictions, labels)
+                 for (p, l) in zip(prediction, label)
+                 if l != -100
+             ]
+
+             true_labels = [
+                 str(self.label_encoder.index_to_class[l])
+                 for prediction, label in zip(predictions, labels)
+                 for (p, l) in zip(prediction, label)
+                 if l != -100
+             ]
+
+             logger.info("processing performance")
+             performance = {
+                 heteronym: {
+                     "n": 0,
+                     "readings": {
+                         reading: {
+                             "n": 0,
+                             "found": {
+                                 readingprime: 0
+                                 for readingprime in list(self.heteronyms[heteronym].keys())
+                                 + ["<OTHER>"]
+                             },
+                         }
+                         for reading in list(self.heteronyms[heteronym].keys()) + ["<OTHER>"]
+                     },
+                 }
+                 for heteronym in self.heteronyms.keys()
+             }
+
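+             # Fill a per-heteronym confusion matrix:
+             # performance[surface]["readings"][true]["found"][predicted] counts
+             # how often each predicted reading appears for each true reading.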
+             for i, surface in enumerate(true_inputs):
+                 performance[surface]["n"] += 1
+
+                 true_reading = true_labels[i].split(":")[-1]
+
+                 performance[surface]["readings"][true_reading]["n"] += 1
+
+                 if true_predictions[i] != "<OTHER>":
+                     if true_predictions[i].split(":")[0] != surface:
+                         logger.warning(
+                             f"predicted label for wrong surface: {surface} {true_predictions[i]}"
+                         )
+                         found_reading = "<OTHER>"
+                     else:
+                         found_reading = true_predictions[i].split(":")[1]
+                 else:
+                     found_reading = "<OTHER>"
+
+                 performance[surface]["readings"][true_reading]["found"][found_reading] += 1
+
+             n = 0
+             correct = 0
+             for surface in performance.keys():
+                 for true_reading in performance[surface]["readings"].keys():
+                     # dividing by np.array turns 0/0 into nan instead of raising
+                     performance[surface]["readings"][true_reading]["accuracy"] = np.round(
+                         performance[surface]["readings"][true_reading]["found"][true_reading]
+                         / np.array(performance[surface]["readings"][true_reading]["n"]),
+                         3,
+                     )
+
+                 performance[surface]["accuracy"] = np.round(
+                     sum(
+                         performance[surface]["readings"][true_reading]["found"][true_reading]
+                         for true_reading in performance[surface]["readings"].keys()
+                     )
+                     / np.array(performance[surface]["n"]),
+                     3,
+                 )
+
+                 correct += sum(
+                     performance[surface]["readings"][true_reading]["found"][true_reading]
+                     for true_reading in performance[surface]["readings"].keys()
+                 )
+                 n += performance[surface]["n"]
+
+             performance = {
+                 "metrics": metrics,
+                 "accuracy": round(correct / n, 3),
+                 "heteronym_performance": performance,
+             }
+
+             full_performance[key] = performance
+
+         trainer.save_model()
+
+         return full_performance
+
+     def furigana(self, text: str) -> str:
+         text = utils.standardize_text(text)
+         text = utils.remove_furigana(text)
+         text = text.replace("{", "").replace("}", "")
+
+         self.model.eval()
+
+         text_encoded = self.tokenizer(
+             text,
+             max_length=self.max_length,
+             truncation=True,
+             return_tensors="pt",
+         )
+
+         input_ids = text_encoded["input_ids"].to(self.device)
+         input_mask = text_encoded["attention_mask"].to(self.device)
+
+         logits = self.model(input_ids=input_ids, attention_mask=input_mask).logits
+
+         predictions = torch.argmax(logits, dim=2)
+
+         output_ruby = []
+         for i, p in enumerate(predictions[0]):
+             token_text = self.tokenizer.decode([input_ids[0][i]])
+             if token_text in ["[CLS]", "[SEP]"]:
+                 continue
+             if token_text[:2] == "##":
+                 token_text = token_text[2:]
+             if input_ids[0][i].item() in self.surfaceIDs:
+                 furi = self.label_encoder.index_to_class[p.item()]
+                 if furi == "<OTHER>":
+                     # unresolved reading: wrap in bare braces for a downstream reader
+                     output_ruby.append(f"{{{token_text}}}")
+                 elif furi.split(":")[0] != token_text:
+                     # prediction belongs to a different surface: treat as unresolved
+                     output_ruby.append(f"{{{token_text}}}")
+                 else:
+                     output_ruby.append(RubyFrag(text=token_text, furi=furi.split(":")[1]))
+             else:
+                 output_ruby.append(token_text)
+
+         return RubyToken(groups=output_ruby).to_code()
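+
+ # Illustrative sketch: with a trained reader, reader.furigana("そこに金があった")
+ # returns "そこに{金/かね}があった"-style output; tokens whose reading could not
+ # be resolved come back wrapped in bare braces, e.g. {金}, so that a downstream
+ # Dictionary pass can fill them in.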
yomikata/dictionary.py ADDED
@@ -0,0 +1,99 @@
+ """
2
+ dictionary.py
3
+ Provides the Dictionary class which implements Reader using dictionary lookup.
4
+ """
5
+
6
+ import fugashi
7
+ import ipadic
8
+ import jaconv
9
+ import jumandic
10
+ from speach import ttlig
11
+ from sudachipy import dictionary as sudachidict
12
+ from sudachipy import tokenizer as sudachitokenizer
13
+
14
+ from config.config import ASCII_SPACE_TOKEN
15
+ from yomikata import utils
16
+ from yomikata.reader import Reader
17
+
18
+ tokenizer_obj = sudachidict.Dictionary(dict="full").create()
+ mode = sudachitokenizer.Tokenizer.SplitMode.C
+
+ taggers = {}
+ taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
+ taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
+ taggers["unidic"] = fugashi.Tagger()
+ taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)
+
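+ # Adapters mapping each backend's token object to a common interface
+ # (kana reading, surface form, part of speech) so that Dictionary.furigana
+ # below can stay backend-agnostic.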
+ token_to_kana = {
+     "ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
+     if len(word.feature) >= 8
+     else jaconv.kata2hira(str(word.surface)),
+     "juman": lambda word: word.feature[5]
+     if word.feature[5] != "*"
+     else jaconv.kata2hira(str(word)),
+     "unidic": lambda word: jaconv.kata2hira(str(word))
+     if (word.feature.kana == "*" or word.feature.kana is None)
+     else jaconv.kata2hira(str(word.feature.kana)),
+     "sudachi": lambda word: jaconv.kata2hira(
+         utils.standardize_text(str(word.reading_form()))
+     ),
+ }
+
+ token_to_surface = {
+     "ipadic": lambda word: word.surface,
+     "juman": lambda word: word.surface,
+     "unidic": lambda word: word.surface,
+     "sudachi": lambda word: word.surface(),
+ }
+
+ token_to_pos = {
+     "ipadic": lambda word: word.feature[0],
+     "juman": lambda word: word.feature[0],
+     "unidic": lambda word: word.feature.pos1,
+     "sudachi": lambda word: word.part_of_speech()[0],
+ }
+
+
+ class Dictionary(Reader):
+     def __init__(self, tagger: str = "unidic") -> None:
+         """Create a Dictionary object that applies furigana via dictionary lookup.
+         The object holds the configuration and tokenizer state.
+
+         Typical usage:
+
+         ```python
+         reader = Dictionary()
+         furi = reader.furigana("お前はもう死んでいる")
+         # "お{前/まえ}はもう{死/し}んでいる"
+         ```
+
+         Args:
+             tagger (str, optional): Tokenizing dictionary to use. Defaults to `unidic`; `juman`, `ipadic`, and `sudachi` are also possible.
+         """
+
+         self.tagger = taggers[tagger]
+         self.token_to_kana = token_to_kana[tagger]
+         self.token_to_surface = token_to_surface[tagger]
+         self.token_to_pos = token_to_pos[tagger]
+
+     def furigana(self, text: str) -> str:
+         text = utils.standardize_text(text)
+         text = text.replace(" ", ASCII_SPACE_TOKEN)
+         rubytoken = utils.parse_furigana(text)
+         output = ""
+
+         for group in rubytoken.groups:
+             if isinstance(group, ttlig.RubyFrag):
+                 # spans that already carry furigana pass through unchanged
+                 output += f"{{{group.text}/{group.furi}}}"
+             else:
+                 group = group.replace("{", "").replace("}", "")
+                 for word in self.tagger(group):
+                     kana = self.token_to_kana(word)
+                     surface = self.token_to_surface(word)
+                     pos = self.token_to_pos(word)
+                     # skip tokens whose reading adds nothing: kana-identical
+                     # surfaces and symbol/punctuation parts of speech
+                     if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
+                         output += surface
+                     else:
+                         output += ttlig.RubyToken.from_furi(surface, kana).to_code()
+         output = output.replace(ASCII_SPACE_TOKEN, " ")
+         return output
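+
+ # Illustrative sketch: because {surface/reading} spans pass through unchanged,
+ # Dictionary can serve as a fallback after another Reader:
+ #   reader = Dictionary()
+ #   reader.furigana("お{前/まえ}はもう死んでいる")
+ #   # -> "お{前/まえ}はもう{死/し}んでいる"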