# coding: utf-8
# Code based on
import re
import os
import ast
import json
from jamo import hangul_to_jamo, h2j, j2h
from jamo.jamo import _jamo_char_to_hcj  # private jamo helper; used by _get_text_from_candidates below
from .ko_dictionary import english_dictionary, etc_dictionary
PAD = '_'
EOS = '~'
PUNC = '!\'(),-.:;?'
SPACE = ' '
JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])
VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
ALL_SYMBOLS = PAD + EOS + VALID_CHARS
char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)}
id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)}
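
# Quick sanity check (illustrative, not in the original file): PAD and EOS sit
# at the front of the symbol table, so
#   char_to_id['_'] == 0 and char_to_id['~'] == 1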

quote_checker = """([`"'＂“‘])(.+?)([`"'＂”’])"""

def is_lead(char):
    return char in JAMO_LEADS

def is_vowel(char):
    return char in JAMO_VOWELS

def is_tail(char):
    return char in JAMO_TAILS

def get_mode(char):
    if is_lead(char):
        return 0
    elif is_vowel(char):
        return 1
    elif is_tail(char):
        return 2
    else:
        return -1

def _get_text_from_candidates(candidates):
    if len(candidates) == 0:
        return ""
    elif len(candidates) == 1:
        return _jamo_char_to_hcj(candidates[0])
    else:
        return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))

def jamo_to_korean(text):
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:  # a new lead jamo flushes the previous syllable
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:  # non-jamo characters pass through unchanged
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:
            candidates.append(char)

        idx += 1
    return new_text
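
# A minimal round-trip sketch (illustrative, not part of the original file):
# decomposing with h2j and recomposing with jamo_to_korean should return the
# original syllables, assuming the input is well-formed Hangul.
#
#   assert jamo_to_korean(h2j("한글")) == "한글"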

num_to_kor = {
    '0': '영',
    '1': '일',
    '2': '이',
    '3': '삼',
    '4': '사',
    '5': '오',
    '6': '육',
    '7': '칠',
    '8': '팔',
    '9': '구',
}

unit_to_kor1 = {
    '%': '퍼센트',
    'cm': '센치미터',
    'mm': '밀리미터',
    'km': '킬로미터',
    'kg': '킬로그람',
}

unit_to_kor2 = {
    'm': '미터',
}

upper_to_kor = {
    'A': '에이',
    'B': '비',
    'C': '씨',
    'D': '디',
    'E': '이',
    'F': '에프',
    'G': '지',
    'H': '에이치',
    'I': '아이',
    'J': '제이',
    'K': '케이',
    'L': '엘',
    'M': '엠',
    'N': '엔',
    'O': '오',
    'P': '피',
    'Q': '큐',
    'R': '알',
    'S': '에스',
    'T': '티',
    'U': '유',
    'V': '브이',
    'W': '더블유',
    'X': '엑스',
    'Y': '와이',
    'Z': '지',
}

def compare_sentence_with_jamo(text1, text2):
    return h2j(text1) != h2j(text2)

def tokenize(text, as_id=False):
    # Use hangul_to_jamo from the jamo package to split a Hangul string into
    # lead/vowel/tail jamo (initial/medial/final consonants and vowels).
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~']
    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    else:
        return [token for token in tokens] + [EOS]
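
# Illustrative usage (assumes the lookup dictionaries leave the input as-is):
#   tokenize("안녕!")              -> ['ᄋ', 'ᅡ', 'ᆫ', 'ᄂ', 'ᅧ', 'ᆼ', '!', '~']
#   tokenize("안녕!", as_id=True)  -> the same sequence mapped through char_to_id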

def tokenizer_fn(iterator):
    return (token for x in iterator for token in tokenize(x, as_id=False))

def normalize(text):
    text = text.strip()

    text = re.sub(r'\(\d+일\)', '', text)  # drop parenthesized dates such as "(13일)"
    text = re.sub(r'\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text)  # drop parenthesized hanja

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub('[a-zA-Z]+', normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text

def normalize_with_dictionary(text, dic):
    if any(key in text for key in dic.keys()):
        pattern = re.compile('|'.join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    else:
        return text
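
# Illustrative usage (sketch): with the unit table defined above,
#   normalize_with_dictionary("100%", unit_to_kor1) -> "100퍼센트"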

def normalize_english(text):
    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        else:
            return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text

def normalize_upper(text):
    text = text.group(0)

    if all([char.isupper() for char in text]):
        return "".join(upper_to_kor[char] for char in text)
    else:
        return text
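
# Illustrative behavior via the re.sub call in normalize (hedged): an
# all-uppercase run is spelled out letter by letter, while mixed case passes
# through untouched.
#   re.sub('[a-zA-Z]+', normalize_upper, "JTBC")  -> "제이티비씨"
#   re.sub('[a-zA-Z]+', normalize_upper, "JTBCs") -> "JTBCs"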

def normalize_quote(text):
    def fn(found_text):
        from nltk import sent_tokenize  # lazy import: NLTK doesn't play well with multiprocessing

        found_text = found_text.group()
        unquoted_text = found_text[1:-1]

        sentences = sent_tokenize(unquoted_text)
        return " ".join(["'{}'".format(sent) for sent in sentences])

    return re.sub(quote_checker, fn, text)

number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
count_checker = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"

def normalize_number(text):
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(number_checker + count_checker,
                  lambda x: number_to_korean(x, True), text)
    text = re.sub(number_checker,
                  lambda x: number_to_korean(x, False), text)
    return text
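
# Illustrative results (hedged; exact output follows number_to_korean below):
#   normalize_number("강아지 3마리") -> "강아지 세마리"   (counting form)
#   normalize_number("5가지")        -> "다섯가지"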

num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# count_to_kor1 = [""] + ["하나", "둘", "셋", "넷", "다섯", "여섯", "일곱", "여덟", "아홉"]
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}
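
# Note (added for clarity): number_to_korean first composes Sino-Korean tens
# such as "두십" (2 x 10) in counting mode, then count_tenth_dict rewrites them
# into native-Korean forms, e.g. "두십" -> "스물".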

def number_to_korean(num_str, is_count=False):
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(',', '')

    try:
        num = ast.literal_eval(num_str)
    except Exception:
        # literal_eval rejects integers with leading zeros; strip them and retry
        num_str = re.sub('^0+', '', num_str)
        num = ast.literal_eval(num_str)

    if num == 0:
        return "영"

    check_float = num_str.split('.')
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    digit = int(digit_str)

    if digit_str.startswith("-"):
        digit, digit_str = abs(digit), str(abs(digit))

    kor = ""
    size = len(str(digit))
    tmp = []

    for i, v in enumerate(digit_str, start=1):
        v = int(v)

        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]

            tmp += num_to_kor3[(size - i) % 4]

        if (size - i) % 4 == 0 and len(tmp) != 0:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                '|'.join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()], kor)

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r'\d', lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str
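
# Worked example (hedged, traced by hand through the code above):
# "-12.35" -> digits "12" give "일십이", the leading "일" is dropped -> "십이",
# the fraction appends "쩜 삼오", and the sign prepends "마이너스 ",
# yielding "마이너스 십이쩜 삼오".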

if __name__ == "__main__":
    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("=" * 30)

    test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute")
    test_normalize("오늘(13일) 3,600마리 강아지가")
    test_normalize("60.3%")
    test_normalize('"저돌"(猪突) 입니다.')
    test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')
    test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")
    test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")

    print(list(hangul_to_jamo(list(hangul_to_jamo('비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”')))))