insecta / khandy /text_utils.py
admin
sync
9d1ee0a
raw
history blame
No virus
1.02 kB
import re
def strip_content_in_paren(string):
    """Remove every parenthesized segment, parentheses included, from a string.

    Two kinds of parentheses are recognized: ASCII ``(`` ``)`` and the
    full-width pair U+FF08 / U+FF09. An opening parenthesis is only paired
    with a closing parenthesis of the same kind.

    Args:
        string: text to clean.

    Returns:
        ``string`` with each parenthesized segment deleted. Text outside the
        parentheses (including surrounding whitespace) is left untouched.

    Notes:
        strip_content_in_paren cannot process nested paren correctly: a match
        ends at the first closing parenthesis, so ``"a(b(c)d)e"`` becomes
        ``"ad)e"``.
    """
    # The full-width pair is written with \uFF08 / \uFF09 escapes (resolved by
    # the regex engine) so it cannot be mistaken for ASCII grouping syntax.
    # An unescaped ASCII "([^)]*)" in this position would be a capture group
    # matching any run of characters other than ")", wiping out the input.
    return re.sub(r"\([^)]*\)|\uFF08[^\uFF09]*\uFF09", "", string)
def is_chinese_char(uchar: str) -> bool:
    """Tell whether a single character is a CJK ideograph.

    Args:
        uchar: a one-character unicode string; it is passed to ``ord()``.

    Returns:
        True when the code point of ``uchar`` falls inside one of the
        inclusive code point ranges listed in the table below, False
        otherwise.

    References:
        `is_chinese_char` in https://github.com/thunlp/OpenNRE/
    """
    # Inclusive (first, last) code point bounds of the accepted blocks.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0x2F800, 0x2FA1F),  # CJK Compatibility Supplement
    )
    value = ord(uchar)
    return any(first <= value <= last for first, last in cjk_ranges)