# NOTE(review): The following seven lines are artifacts scraped from the
# hosting site's file-viewer UI (avatar caption, commit message/hash,
# "raw / history / blame" chrome, virus-scan note, file size) — they are not
# part of the module and would be syntax errors if left uncommented.
#   litagin's picture
#   init
#   2916d61
#   raw
#   history blame
#   No virus
#   5.14 kB
from enum import Enum
from re import findall, fullmatch
from typing import List, Optional
from pydantic import BaseModel, Field, validator
USER_DICT_MIN_PRIORITY = 0
USER_DICT_MAX_PRIORITY = 10
class UserDictWord(BaseModel):
    """
    Information used when compiling a user-dictionary word entry.

    Field ``title``s are kept in Japanese because they are runtime metadata
    (they surface in the generated OpenAPI schema), as are the validator
    error messages.
    """
    # Surface (written) form of the word; normalized to full-width by the
    # convert_to_zenkaku validator below.
    surface: str = Field(title="表層形")
    # Priority of the entry, bounded by the module-level constants (0..10).
    priority: int = Field(
        title="優先度", ge=USER_DICT_MIN_PRIORITY, le=USER_DICT_MAX_PRIORITY
    )
    # Left/right context ID; defaults to 1348 — presumably a proper-noun row in
    # the mecab-naist-jdic _left-id.def table (see PartOfSpeechDetail) — TODO confirm.
    context_id: int = Field(title="文脈ID", default=1348)
    part_of_speech: str = Field(title="品詞")  # part of speech
    part_of_speech_detail_1: str = Field(title="品詞細分類1")  # POS sub-category 1
    part_of_speech_detail_2: str = Field(title="品詞細分類2")  # POS sub-category 2
    part_of_speech_detail_3: str = Field(title="品詞細分類3")  # POS sub-category 3
    inflectional_type: str = Field(title="活用型")  # conjugation type
    inflectional_form: str = Field(title="活用形")  # conjugation form
    stem: str = Field(title="原形")  # base/dictionary form
    yomi: str = Field(title="読み")  # reading (katakana)
    pronunciation: str = Field(title="発音")  # pronunciation (katakana)
    # Accent nucleus position; 0 means flat (heiban) accent.
    accent_type: int = Field(title="アクセント型")
    # Number of morae; computed from `pronunciation` when omitted (see
    # check_mora_count_and_accent_type below).
    mora_count: Optional[int] = Field(title="モーラ数", default=None)
    accent_associative_rule: str = Field(title="アクセント結合規則")
    class Config:
        # Re-run the validators whenever a field is assigned after construction.
        validate_assignment = True
    @validator("surface")
    def convert_to_zenkaku(cls, surface):
        """Map the 94 printable ASCII characters (U+0021..U+007E) to their
        full-width equivalents (U+FF01..U+FF5E) in the surface form."""
        return surface.translate(
            str.maketrans(
                "".join(chr(0x21 + i) for i in range(94)),
                "".join(chr(0xFF01 + i) for i in range(94)),
            )
        )
    @validator("pronunciation", pre=True)
    def check_is_katakana(cls, pronunciation):
        """Validate that the pronunciation is katakana (ァ..ヴ plus the long-vowel
        mark ー) and that small kana (sutegana) appear only in legal positions.

        Raises:
            ValueError: if the string is not katakana-only, if small kana are
                illegally consecutive, or if 「ヮ」 is preceded by anything other
                than 「ク」/「グ」.
        """
        if not fullmatch(r"[ァ-ヴー]+", pronunciation):
            raise ValueError("発音は有効なカタカナでなくてはいけません。")
        sutegana = ["ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ", "ッ"]
        for i in range(len(pronunciation)):
            if pronunciation[i] in sutegana:
                # Small kana may legitimately be followed by 「ッ」 (e.g. 「キャット」),
                # so only two cases are rejected: a small kana followed by a small
                # kana other than 「ッ」 (sutegana[:-1]), and 「ッ」 immediately
                # followed by another 「ッ」.
                if i < len(pronunciation) - 1 and (
                    pronunciation[i + 1] in sutegana[:-1]
                    or (
                        pronunciation[i] == sutegana[-1]
                        and pronunciation[i + 1] == sutegana[-1]
                    )
                ):
                    raise ValueError("無効な発音です。(捨て仮名の連続)")
            if pronunciation[i] == "ヮ":
                # 「ヮ」 is only valid in 「クヮ」/「グヮ」 (kwa/gwa) combinations.
                if i != 0 and pronunciation[i - 1] not in ["ク", "グ"]:
                    raise ValueError(
                        "無効な発音です。(「くゎ」「ぐゎ」以外の「ゎ」の使用)"
                    )
        return pronunciation
    @validator("mora_count", pre=True, always=True)
    def check_mora_count_and_accent_type(cls, mora_count, values):
        """Derive ``mora_count`` from ``pronunciation`` when it is omitted, and
        validate that ``accent_type`` lies within [0, mora_count].

        ``always=True`` makes this run even when ``mora_count`` was not supplied;
        ``values`` holds previously-validated fields (pydantic v1 semantics).

        Raises:
            ValueError: if accent_type is outside the valid range.
        """
        if "pronunciation" not in values or "accent_type" not in values:
            # An upstream field failed validation — return unchanged so the
            # error is reported on the field where it actually occurred.
            return mora_count
        if mora_count is None:
            # Count morae: multi-character digraphs that form a single mora are
            # matched first, then any remaining single katakana / long-vowel mark.
            rule_others = (
                "[イ][ェ]|[ヴ][ャュョ]|[トド][ゥ]|[テデ][ィャュョ]|[デ][ェ]|[クグ][ヮ]"
            )
            rule_line_i = "[キシチニヒミリギジビピ][ェャュョ]"
            rule_line_u = "[ツフヴ][ァ]|[ウスツフヴズ][ィ]|[ウツフヴ][ェォ]"
            rule_one_mora = "[ァ-ヴー]"
            mora_count = len(
                findall(
                    f"(?:{rule_others}|{rule_line_i}|{rule_line_u}|{rule_one_mora})",
                    values["pronunciation"],
                )
            )
        # The accent nucleus must fall within the word: 0 (flat) .. mora_count.
        if not 0 <= values["accent_type"] <= mora_count:
            raise ValueError(
                "誤ったアクセント型です({})。 expect: 0 <= accent_type <= {}".format(
                    values["accent_type"], mora_count
                )
            )
        return mora_count
class PartOfSpeechDetail(BaseModel):
    """
    Per-part-of-speech information (used to fill in the fixed POS columns of a
    dictionary entry).
    """
    part_of_speech: str = Field(title="品詞")  # part of speech
    part_of_speech_detail_1: str = Field(title="品詞細分類1")  # POS sub-category 1
    part_of_speech_detail_2: str = Field(title="品詞細分類2")  # POS sub-category 2
    part_of_speech_detail_3: str = Field(title="品詞細分類3")  # POS sub-category 3
    # context_id is the dictionary's left/right context ID, see:
    # https://github.com/VOICEVOX/open_jtalk/blob/427cfd761b78efb6094bea3c5bb8c968f0d711ab/src/mecab-naist-jdic/_left-id.def # noqa
    context_id: int = Field(title="文脈ID")
    # Cost values at fixed percentiles — candidates to map a priority onto a cost.
    cost_candidates: List[int] = Field(title="コストのパーセンタイル")
    # Accent-association rules valid for this part of speech.
    accent_associative_rules: List[str] = Field(title="アクセント結合規則の一覧")
class WordTypes(str, Enum):
    """
    Enum used by FastAPI to validate the ``word_type`` request argument.

    Subclasses ``str`` so members compare equal to their string values and
    serialize as plain strings.
    """
    PROPER_NOUN = "PROPER_NOUN"    # proper noun
    COMMON_NOUN = "COMMON_NOUN"    # common noun
    VERB = "VERB"                  # verb
    ADJECTIVE = "ADJECTIVE"        # adjective
    SUFFIX = "SUFFIX"              # suffix