import re

# Matches one non-nested parenthesised span, either ASCII "(...)" or
# full-width "（...）" (U+FF08 / U+FF09).  The full-width characters are
# written as escapes so the pattern cannot be silently folded to ASCII
# parentheses (which would turn the second alternative into a capture group
# that matches almost any text).
_PAREN_CONTENT_RE = re.compile(r"\([^)]*\)|\uFF08[^\uFF09]*\uFF09")

# Inclusive code point ranges treated as Chinese characters.
_CJK_IDEOGRAPH_RANGES = (
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
    (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
)


def strip_content_in_paren(string: str) -> str:
    """Remove parenthesised spans, including the parentheses themselves.

    Both ASCII parentheses ``(...)`` and full-width parentheses ``（...）``
    are recognised. An opening parenthesis is only paired with a closing
    parenthesis of the same kind.

    Args:
        string: input text

    Returns:
        ``string`` with every matched parenthesised span deleted. Text
        without a complete pair of parentheses is returned unchanged.

    Notes:
        strip_content_in_paren cannot process nested paren correctly: a span
        ends at the first closing parenthesis, so ``"a(b(c)d)e"`` becomes
        ``"ad)e"``.
    """
    return _PAREN_CONTENT_RE.sub("", string)


def is_chinese_char(uchar: str) -> bool:
    """Whether the input char is a Chinese character.

    A character counts as Chinese when its code point falls in one of the
    CJK ideograph ranges listed in ``_CJK_IDEOGRAPH_RANGES``. Punctuation
    such as ``。`` is outside these ranges and yields ``False``.

    Args:
        uchar: input char in unicode (a string of length one)

    Returns:
        ``True`` if the code point lies in one of the ranges, else ``False``.

    Raises:
        TypeError: raised by ``ord`` when ``uchar`` is not exactly one
            character long.

    References:
        `is_chinese_char` in https://github.com/thunlp/OpenNRE/
    """
    codepoint = ord(uchar)
    return any(low <= codepoint <= high for low, high in _CJK_IDEOGRAPH_RANGES)