Spaces:
Runtime error
Runtime error
from enum import Enum | |
from re import findall, fullmatch | |
from typing import List, Optional | |
from pydantic import BaseModel, Field, validator | |
# Inclusive lower and upper bounds enforced on UserDictWord.priority.
USER_DICT_MIN_PRIORITY: int = 0
USER_DICT_MAX_PRIORITY: int = 10
class UserDictWord(BaseModel):
    """
    Information about one word, used when compiling the dictionary.

    Besides the declared type and range constraints, three validators run:

    * ``surface`` is converted to full-width characters.
    * ``pronunciation`` must be katakana without an invalid small-kana
      sequence.
    * ``mora_count`` is derived from ``pronunciation`` when it is omitted, and
      ``accent_type`` must satisfy ``0 <= accent_type <= mora_count``.
    """

    # Surface form (the word as written).
    surface: str = Field(title="表層形")
    # Priority, limited to the USER_DICT_MIN_PRIORITY..USER_DICT_MAX_PRIORITY range.
    priority: int = Field(
        title="優先度", ge=USER_DICT_MIN_PRIORITY, le=USER_DICT_MAX_PRIORITY
    )
    # Context ID.
    context_id: int = Field(title="文脈ID", default=1348)
    # Part of speech and its three sub-classification levels.
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")
    # Conjugation type and conjugation form.
    inflectional_type: str = Field(title="活用型")
    inflectional_form: str = Field(title="活用形")
    # Base form.
    stem: str = Field(title="原形")
    # Reading.
    yomi: str = Field(title="読み")
    # Pronunciation (validated as katakana below).
    pronunciation: str = Field(title="発音")
    # Accent type; checked against mora_count below.
    accent_type: int = Field(title="アクセント型")
    # Mora count; computed from the pronunciation when left as None.
    mora_count: Optional[int] = Field(title="モーラ数", default=None)
    # Accent combination rule.
    accent_associative_rule: str = Field(title="アクセント結合規則")

    class Config:
        # Validate values on attribute assignment too, not only on creation.
        validate_assignment = True

    @validator("surface")
    def convert_to_zenkaku(cls, surface):
        """
        Return ``surface`` with half-width characters replaced by full-width ones.

        The 94 code points U+0021..U+007E are mapped one-to-one onto
        U+FF01..U+FF5E. Every other character (including the space U+0020)
        is left unchanged.
        """
        half_to_full = {0x21 + offset: 0xFF01 + offset for offset in range(94)}
        return surface.translate(half_to_full)

    @validator("pronunciation", pre=True)
    def check_is_katakana(cls, pronunciation):
        """
        Check that ``pronunciation`` is a well-formed katakana string.

        Returns:
            The pronunciation, unchanged.

        Raises:
            ValueError: If the string contains anything outside ``ァ``-``ヴ``
                and ``ー``, if small kana follow each other in a disallowed
                way, or if ``ヮ`` follows a character other than ``ク``/``グ``.
        """
        if not fullmatch(r"[ァ-ヴー]+", pronunciation):
            raise ValueError("発音は有効なカタカナでなくてはいけません。")

        sokuon = "ッ"
        other_sutegana = ("ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ")
        last_index = len(pronunciation) - 1

        for index, char in enumerate(pronunciation):
            if char != sokuon and char not in other_sutegana:
                continue

            # Small kana can legitimately be adjacent, as in "キャット"
            # (small kana followed by "ッ"). What is rejected is any small
            # kana followed by a small kana other than "ッ", and "ッ" directly
            # followed by another "ッ".
            if index < last_index:
                following = pronunciation[index + 1]
                if following in other_sutegana or (
                    char == sokuon and following == sokuon
                ):
                    raise ValueError("無効な発音です。(捨て仮名の連続)")

            if char == "ヮ":
                # NOTE(review): a "ヮ" at index 0 skips this check and is
                # accepted even though nothing precedes it; confirm whether
                # that is intended.
                if index != 0 and pronunciation[index - 1] not in ("ク", "グ"):
                    raise ValueError(
                        "無効な発音です。(「くゎ」「ぐゎ」以外の「ゎ」の使用)"
                    )
        return pronunciation

    @validator("mora_count", pre=True, always=True)
    def check_mora_count_and_accent_type(cls, mora_count, values):
        """
        Fill in ``mora_count`` when missing and range-check ``accent_type``.

        ``always=True`` makes this run even when ``mora_count`` was not
        supplied, which is the case in which the count is computed here.

        Args:
            mora_count: The supplied mora count, or ``None``.
            values: The previously validated fields; ``pronunciation`` and
                ``accent_type`` are read from it.

        Returns:
            The supplied mora count, or the count computed from the
            pronunciation when ``None`` was given.

        Raises:
            ValueError: If ``accent_type`` is not within
                ``0 <= accent_type <= mora_count``.
        """
        if "pronunciation" not in values or "accent_type" not in values:
            # One of those fields is absent from ``values``; return without
            # checking so that the error is reported in the appropriate place.
            return mora_count

        if mora_count is None:
            # Two-character combinations that count as a single mora are
            # listed before the one-character fallback so that the
            # alternation tries them first.
            rule_others = (
                "[イ][ェ]|[ヴ][ャュョ]|[トド][ゥ]|[テデ][ィャュョ]|[デ][ェ]|[クグ][ヮ]"
            )
            rule_line_i = "[キシチニヒミリギジビピ][ェャュョ]"
            rule_line_u = "[ツフヴ][ァ]|[ウスツフヴズ][ィ]|[ウツフヴ][ェォ]"
            rule_one_mora = "[ァ-ヴー]"
            moras = findall(
                f"(?:{rule_others}|{rule_line_i}|{rule_line_u}|{rule_one_mora})",
                values["pronunciation"],
            )
            mora_count = len(moras)

        accent_type = values["accent_type"]
        if not 0 <= accent_type <= mora_count:
            raise ValueError(
                "誤ったアクセント型です({})。 expect: 0 <= accent_type <= {}".format(
                    accent_type, mora_count
                )
            )
        return mora_count
class PartOfSpeechDetail(BaseModel):
    """
    Information held for each part of speech.
    """

    # Part of speech and its three sub-classification levels.
    part_of_speech: str = Field(title="品詞")
    part_of_speech_detail_1: str = Field(title="品詞細分類1")
    part_of_speech_detail_2: str = Field(title="品詞細分類2")
    part_of_speech_detail_3: str = Field(title="品詞細分類3")

    # context_id is the dictionary's left/right context ID; see:
    # https://github.com/VOICEVOX/open_jtalk/blob/427cfd761b78efb6094bea3c5bb8c968f0d711ab/src/mecab-naist-jdic/_left-id.def # noqa
    context_id: int = Field(title="文脈ID")

    # Percentiles of the cost.
    cost_candidates: List[int] = Field(title="コストのパーセンタイル")

    # List of accent combination rules.
    accent_associative_rules: List[str] = Field(title="アクセント結合規則の一覧")
class WordTypes(str, Enum):
    """
    Class used when validating the word_type argument in FastAPI.

    Each member is also a ``str`` whose value equals its own name.
    """

    PROPER_NOUN = "PROPER_NOUN"
    COMMON_NOUN = "COMMON_NOUN"
    VERB = "VERB"
    ADJECTIVE = "ADJECTIVE"
    SUFFIX = "SUFFIX"