Spaces:
Running
Running
import re | |
# Matches one parenthesised span, either ASCII "(...)" or full-width "（...）".
# The full-width parentheses are written as \uFF08 / \uFF09 escapes so the
# pattern cannot be silently corrupted by Unicode normalisation of this file.
_PAREN_CONTENT_RE = re.compile(r"\([^)]*\)|\uFF08[^\uFF09]*\uFF09")


def strip_content_in_paren(string):
    """Remove every parenthesised span (parentheses included) from ``string``.

    Both ASCII parentheses ``(...)`` and full-width parentheses (U+FF08 and
    U+FF09) are recognised. Text outside parentheses is returned unchanged.

    Args:
        string: input text.

    Returns:
        ``string`` with each parenthesised span removed.

    Notes:
        Nested parentheses are not processed correctly: a span ends at the
        first closing parenthesis, so ``"a(b(c)d)e"`` becomes ``"ad)e"``.
    """
    return _PAREN_CONTENT_RE.sub("", string)
def is_chinese_char(uchar: str) -> bool:
    """Tell whether a single character is a CJK ideograph.

    The character's code point is tested against a fixed list of Unicode
    CJK ideograph blocks (inclusive bounds on both ends).

    Args:
        uchar: a string of exactly one character; ``ord`` raises
            ``TypeError`` for any other length.

    Returns:
        ``True`` if the code point falls inside one of the listed blocks,
        ``False`` otherwise.

    References:
        `is_chinese_char` in https://github.com/thunlp/OpenNRE/
    """
    cjk_blocks = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )
    value = ord(uchar)
    return any(low <= value <= high for low, high in cjk_blocks)