Spaces:

MuGeminorum
/

insecta

Running

insecta / khandy /text_utils.py

admin

sync

9d1ee0a about 2 months ago

No virus

1.02 kB

	import re


	def strip_content_in_paren(string):
	    """Remove parenthesised segments, including the parentheses themselves.

	    Both ASCII parentheses ``(...)`` and full-width parentheses ``（...）``
	    are handled; empty pairs such as ``()`` are removed as well.

	    Args:
	        string: input text.

	    Returns:
	        A copy of ``string`` with every parenthesised segment deleted.

	    Notes:
	        strip_content_in_paren cannot process nested paren correctly:
	        a match ends at the first closing paren, so ``"a(b(c)d)e"``
	        becomes ``"ad)e"``.
	    """
	    # Two alternatives joined by a real alternation ``|``: an ASCII pair or
	    # a full-width pair, each enclosing any run (possibly empty) of
	    # characters other than its own closing parenthesis.
	    return re.sub(r"\([^)]*\)|（[^）]*）", "", string)


	def is_chinese_char(uchar: str) -> bool:
	    """Whether the input char is a Chinese character.

	    The check is a pure code point range test against the CJK ideograph
	    blocks listed in the body. Code points outside those ranges (for
	    example anything from U+2CEB0 up to U+2F7FF) are reported as
	    non-Chinese.

	    Args:
	        uchar: input char in unicode; must be a single character because
	            it is passed straight to ``ord``.

	    Returns:
	        True if the code point of ``uchar`` falls in one of the listed
	        ranges, False otherwise.

	    Raises:
	        TypeError: raised by ``ord`` when ``uchar`` is not a string of
	            length one.

	    References:
	        `is_chinese_char` in https://github.com/thunlp/OpenNRE/
	    """
	    codepoint = ord(uchar)
	    return (
	        (0x4E00 <= codepoint <= 0x9FFF)  # CJK Unified Ideographs
	        or (0x3400 <= codepoint <= 0x4DBF)  # CJK Unified Ideographs Extension A
	        or (0xF900 <= codepoint <= 0xFAFF)  # CJK Compatibility Ideographs
	        or (0x20000 <= codepoint <= 0x2A6DF)  # CJK Unified Ideographs Extension B
	        or (0x2A700 <= codepoint <= 0x2B73F)  # CJK Unified Ideographs Extension C
	        or (0x2B740 <= codepoint <= 0x2B81F)  # CJK Unified Ideographs Extension D
	        or (0x2B820 <= codepoint <= 0x2CEAF)  # CJK Unified Ideographs Extension E
	        or (0x2F800 <= codepoint <= 0x2FA1F)  # CJK Compatibility Supplement
	    )