Spaces:
Running
Running
File size: 1,016 Bytes
9d1ee0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import re
def strip_content_in_paren(string):
    """Remove every parenthesised segment, parentheses included.

    Both ASCII parentheses ``(...)`` and full-width parentheses
    (U+FF08 ... U+FF09, common in Chinese text) are recognised.

    Args:
        string: Text to clean.

    Returns:
        ``string`` with each parenthesised segment removed. Text without
        parentheses is returned unchanged.

    Notes:
        strip_content_in_paren cannot process nested paren correctly: a
        match ends at the first closing parenthesis, so ``"a(b(c)d)e"``
        becomes ``"ad)e"``. An opening parenthesis is only paired with a
        closing parenthesis of the same width.
    """
    # The full-width pair is written with \uXXXX escapes (understood by the
    # ``re`` module) so it cannot be silently normalised to ASCII "(" / ")".
    # An unescaped ASCII "([^)]*)" here would be a capture group matching
    # almost any text, wiping out nearly the whole input.
    return re.sub(r"\([^)]*\)|\uFF08[^\uFF09]*\uFF09", "", string)
# Inclusive (first, last) code point bounds of the Unicode blocks that are
# treated as Chinese characters.
_CJK_CODEPOINT_RANGES = (
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
    (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
)


def is_chinese_char(uchar: str) -> bool:
    """Tell whether a single character is a Chinese (CJK) ideograph.

    Args:
        uchar: A one-character unicode string.

    Returns:
        True when the character's code point lies inside one of the CJK
        ideograph ranges listed in ``_CJK_CODEPOINT_RANGES``, else False.

    Raises:
        TypeError: Propagated from ``ord`` when ``uchar`` is not exactly
            one character long.

    References:
        `is_chinese_char` in https://github.com/thunlp/OpenNRE/
    """
    value = ord(uchar)
    return any(first <= value <= last for first, last in _CJK_CODEPOINT_RANGES)
|