|
from enum import Enum |
|
from re import findall, fullmatch |
|
from typing import List, Optional |
|
|
|
from pydantic import BaseModel, Field, validator |
|
|
|
# Inclusive lower and upper bounds for a user-dictionary word's priority;
# used as the ``ge`` / ``le`` constraints on ``UserDictWord.priority``.
USER_DICT_MIN_PRIORITY = 0

USER_DICT_MAX_PRIORITY = 10
|
|
|
|
|
class UserDictWord(BaseModel):
    """
    Information used when compiling the dictionary.

    Besides plain field declarations, three validators are attached:

    * ``surface`` is normalised to full-width characters.
    * ``pronunciation`` must be katakana and follow the small-kana rules.
    * ``mora_count`` is derived from ``pronunciation`` when not supplied,
      and ``accent_type`` is checked against it.
    """

    # Surface form.
    surface: str = Field(title="表層形")
    # Priority, constrained to the module-level min/max bounds (inclusive).
    priority: int = Field(
        title="優先度",
        ge=USER_DICT_MIN_PRIORITY,
        le=USER_DICT_MAX_PRIORITY,
    )
    # Context ID.
    context_id: int = Field(default=1348, title="文脈ID")
    # Part of speech and its three levels of sub-classification.
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")
    # Conjugation type / conjugation form.
    inflectional_type: str = Field(title="活用型")
    inflectional_form: str = Field(title="活用形")
    # Base form.
    stem: str = Field(title="原形")
    # Reading.
    yomi: str = Field(title="読み")
    # Pronunciation (validated by ``check_is_katakana``).
    pronunciation: str = Field(title="発音")
    # Accent type (range-checked in ``check_mora_count_and_accent_type``).
    accent_type: int = Field(title="アクセント型")
    # Mora count; computed from ``pronunciation`` when left as None.
    mora_count: Optional[int] = Field(default=None, title="モーラ数")
    # Accent combination rule.
    accent_associative_rule: str = Field(title="アクセント結合規則")

    class Config:
        # Re-run validation when attributes are assigned after construction.
        validate_assignment = True

    @validator("surface")
    def convert_to_zenkaku(cls, surface):
        """
        Return ``surface`` with U+0021..U+007E replaced by U+FF01..U+FF5E.

        The 94 consecutive code points starting at 0x21 are mapped one-to-one
        onto the 94 code points starting at 0xFF01; every other character is
        left untouched.
        """
        to_fullwidth = {0x21 + offset: 0xFF01 + offset for offset in range(94)}
        return surface.translate(to_fullwidth)

    @validator("pronunciation", pre=True)
    def check_is_katakana(cls, pronunciation):
        """
        Validate the raw ``pronunciation`` value and return it unchanged.

        Raises:
            ValueError: if the value does not consist solely of characters in
                ``[ァ-ヴー]``, if small kana appear consecutively in a
                disallowed way, or if "ヮ" follows a character other than
                "ク" / "グ".
        """
        if fullmatch(r"[ァ-ヴー]+", pronunciation) is None:
            raise ValueError("発音は有効なカタカナでなくてはいけません。")

        small_kana = ("ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ", "ッ")
        sokuon = small_kana[-1]
        non_sokuon_small_kana = small_kana[:-1]
        last_index = len(pronunciation) - 1

        for index, char in enumerate(pronunciation):
            if char in small_kana and index < last_index:
                following = pronunciation[index + 1]
                # A small kana may not be followed by any small kana other
                # than "ッ"; "ッ" itself is additionally rejected only when
                # it directly follows another "ッ".
                if following in non_sokuon_small_kana or (
                    char == sokuon and following == sokuon
                ):
                    raise ValueError("無効な発音です。(捨て仮名の連続)")

            # NOTE(review): a "ヮ" at index 0 is accepted by this condition
            # even though it is not preceded by "ク" / "グ" -- confirm that
            # this is intended.
            if (
                char == "ヮ"
                and index != 0
                and pronunciation[index - 1] not in ("ク", "グ")
            ):
                raise ValueError(
                    "無効な発音です。(「くゎ」「ぐゎ」以外の「ゎ」の使用)"
                )

        return pronunciation

    @validator("mora_count", pre=True, always=True)
    def check_mora_count_and_accent_type(cls, mora_count, values):
        """
        Fill in ``mora_count`` if absent and range-check ``accent_type``.

        When ``mora_count`` is None it is computed by counting regex matches
        over ``values["pronunciation"]``, where the listed two-character
        combinations are tried before the single-character fallback.

        Returns:
            The supplied or computed mora count.

        Raises:
            ValueError: if ``accent_type`` is not within
                ``0 <= accent_type <= mora_count``.
        """
        required_keys = ("pronunciation", "accent_type")
        if any(key not in values for key in required_keys):
            # Nothing to check against here: hand the value back unchanged.
            # (pydantic leaves fields that failed their own validation out
            # of ``values``, so their errors are reported for those fields.)
            return mora_count

        if mora_count is None:
            rule_others = (
                "[イ][ェ]|[ヴ][ャュョ]|[トド][ゥ]|[テデ][ィャュョ]|[デ][ェ]|[クグ][ヮ]"
            )
            rule_line_i = "[キシチニヒミリギジビピ][ェャュョ]"
            rule_line_u = "[ツフヴ][ァ]|[ウスツフヴズ][ィ]|[ウツフヴ][ェォ]"
            rule_one_mora = "[ァ-ヴー]"
            # Alternation order matters: the two-character rules must come
            # before the single-character fallback.
            alternatives = "|".join(
                (rule_others, rule_line_i, rule_line_u, rule_one_mora)
            )
            mora_pattern = "(?:" + alternatives + ")"
            mora_count = len(findall(mora_pattern, values["pronunciation"]))

        accent_type = values["accent_type"]
        if not 0 <= accent_type <= mora_count:
            raise ValueError(
                f"誤ったアクセント型です({accent_type})。 "
                f"expect: 0 <= accent_type <= {mora_count}"
            )
        return mora_count
|
|
|
|
|
class PartOfSpeechDetail(BaseModel):
    """
    Per-part-of-speech information.
    """

    # Part of speech and its three levels of sub-classification.
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")

    # Context ID.
    context_id: int = Field(title="文脈ID")
    # Cost percentiles.
    cost_candidates: List[int] = Field(title="コストのパーセンタイル")
    # List of accent combination rules.
    accent_associative_rules: List[str] = Field(
        title="アクセント結合規則の一覧"
    )
|
|
|
|
|
class WordTypes(str, Enum):
    """
    Class used when validating the ``word_type`` argument in FastAPI.

    Each member is also a ``str`` whose value equals its own name.
    """

    # Nouns.
    PROPER_NOUN = "PROPER_NOUN"
    COMMON_NOUN = "COMMON_NOUN"

    # Inflecting words.
    VERB = "VERB"
    ADJECTIVE = "ADJECTIVE"

    # Suffixes.
    SUFFIX = "SUFFIX"
|
|