litagin's picture
init
2916d61
raw
history blame
5.14 kB
from enum import Enum
from re import findall, fullmatch
from typing import List, Optional
from pydantic import BaseModel, Field, validator
# Inclusive bounds enforced on UserDictWord.priority (via Field ge/le below).
USER_DICT_MIN_PRIORITY = 0
USER_DICT_MAX_PRIORITY = 10
class UserDictWord(BaseModel):
    """
    Information used to compile one user-dictionary entry.

    The Japanese ``title`` strings mirror the dictionary's CSV columns
    (surface form, part of speech, reading, pronunciation, accent, ...).
    Field declaration order matters: the ``mora_count`` validator reads
    ``pronunciation`` and ``accent_type`` from ``values``, so those fields
    must be declared before it.
    """

    surface: str = Field(title="表層形")
    priority: int = Field(
        title="優先度", ge=USER_DICT_MIN_PRIORITY, le=USER_DICT_MAX_PRIORITY
    )
    context_id: int = Field(title="文脈ID", default=1348)
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")
    inflectional_type: str = Field(title="活用型")
    inflectional_form: str = Field(title="活用形")
    stem: str = Field(title="原形")
    yomi: str = Field(title="読み")
    pronunciation: str = Field(title="発音")
    accent_type: int = Field(title="アクセント型")
    mora_count: Optional[int] = Field(title="モーラ数", default=None)
    accent_associative_rule: str = Field(title="アクセント結合規則")

    class Config:
        validate_assignment = True

    @validator("surface")
    def convert_to_zenkaku(cls, surface):
        """Convert every printable ASCII character (U+0021..U+007E) in the
        surface form to its full-width (zenkaku) counterpart
        (U+FF01..U+FF5E)."""
        # str.translate accepts a {codepoint: replacement} mapping directly.
        zenkaku_table = {0x21 + offset: chr(0xFF01 + offset) for offset in range(94)}
        return surface.translate(zenkaku_table)

    @validator("pronunciation", pre=True)
    def check_is_katakana(cls, pronunciation):
        """Require the pronunciation to be katakana (plus the long-vowel
        mark) and reject small-kana (sutegana) sequences that cannot be
        pronounced."""
        if not fullmatch(r"[ァ-ヴー]+", pronunciation):
            raise ValueError("発音は有効なカタカナでなくてはいけません。")
        # Small kana; the final entry "ッ" is special-cased below.
        sutegana = ["ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ", "ッ"]
        for index, char in enumerate(pronunciation):
            following = (
                pronunciation[index + 1] if index + 1 < len(pronunciation) else None
            )
            if char in sutegana and following is not None:
                # A small kana followed by "ッ" is legitimate (e.g. キャット),
                # so only reject: a small kana followed by a non-"ッ" small
                # kana, or "ッ" immediately followed by another "ッ".
                if following in sutegana[:-1] or (
                    char == sutegana[-1] and following == sutegana[-1]
                ):
                    raise ValueError("無効な発音です。(捨て仮名の連続)")
            # "ヮ" is only valid directly after ク or グ (くゎ / ぐゎ).
            # NOTE(review): a leading "ヮ" (index 0) is accepted by this
            # check, matching the original behavior.
            if char == "ヮ":
                if index != 0 and pronunciation[index - 1] not in ["ク", "グ"]:
                    raise ValueError(
                        "無効な発音です。(「くゎ」「ぐゎ」以外の「ゎ」の使用)"
                    )
        return pronunciation

    @validator("mora_count", pre=True, always=True)
    def check_mora_count_and_accent_type(cls, mora_count, values):
        """Derive mora_count from the pronunciation when it was not given,
        then check that accent_type lies within [0, mora_count]."""
        if "pronunciation" not in values or "accent_type" not in values:
            # An earlier validator already failed; let the error be reported
            # at the appropriate field instead of here.
            return mora_count
        if mora_count is None:
            # Multi-character morae are matched before the one-kana fallback;
            # alternation order in the pattern makes that precedence explicit.
            mora_rules = "|".join(
                [
                    "[イ][ェ]|[ヴ][ャュョ]|[トド][ゥ]|[テデ][ィャュョ]|[デ][ェ]|[クグ][ヮ]",
                    "[キシチニヒミリギジビピ][ェャュョ]",
                    "[ツフヴ][ァ]|[ウスツフヴズ][ィ]|[ウツフヴ][ェォ]",
                    "[ァ-ヴー]",
                ]
            )
            mora_count = len(findall(f"(?:{mora_rules})", values["pronunciation"]))
        if not 0 <= values["accent_type"] <= mora_count:
            raise ValueError(
                "誤ったアクセント型です({})。 expect: 0 <= accent_type <= {}".format(
                    values["accent_type"], mora_count
                )
            )
        return mora_count
class PartOfSpeechDetail(BaseModel):
    """
    Per-part-of-speech information (dictionary column values shared by all
    words of one word type).
    """
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")
    # context_id is the dictionary's left/right context ID
    # https://github.com/VOICEVOX/open_jtalk/blob/427cfd761b78efb6094bea3c5bb8c968f0d711ab/src/mecab-naist-jdic/_left-id.def # noqa
    context_id: int = Field(title="文脈ID")
    cost_candidates: List[int] = Field(title="コストのパーセンタイル")
    accent_associative_rules: List[str] = Field(title="アクセント結合規則の一覧")
class WordTypes(str, Enum):
    """
    Enum used to validate the word_type argument in FastAPI endpoints.
    """
    PROPER_NOUN = "PROPER_NOUN"
    COMMON_NOUN = "COMMON_NOUN"
    VERB = "VERB"
    ADJECTIVE = "ADJECTIVE"
    SUFFIX = "SUFFIX"