Spaces:

eson
/

tokenizer-arena

Running

tokenizer-arena / utils /text_util.py

add compress rate

814ee6b 2 months ago

No virus

1.35 kB


	from zhon.hanzi import punctuation as zh_punc

	def is_zh_char(uchar):
	    """Tell whether ``uchar`` falls in the range U+4E00..U+9FA5 (inclusive).

	    The test is a plain string comparison against the two bounds, so the
	    argument is expected to be a single character.

	    Reference: jieba's Han pattern
	    (https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48).
	    NOTE: that pattern runs up to U+9FD5, whereas the upper bound used
	    here is U+9FA5.
	    """
	    lower_bound = u'\u4e00'
	    upper_bound = u'\u9fa5'
	    return lower_bound <= uchar <= upper_bound


	def has_zh(text):
	    """Return True when at least one character of ``text`` is Chinese.

	    Each character is checked with ``is_zh_char``; scanning stops at the
	    first match. An empty ``text`` yields False.
	    """
	    for ch in text:
	        if is_zh_char(ch):
	            return True
	    return False


	def get_zh_count(text):
	    """Count the characters of ``text`` that ``is_zh_char`` accepts."""
	    total = 0
	    for ch in text:
	        if is_zh_char(ch):
	            total += 1
	    return total


	def is_all_zh(text):
	    """Return True when every character of ``text`` is Chinese.

	    Each character is checked with ``is_zh_char``. An empty ``text``
	    yields True, since there is no character to reject.
	    """
	    for ch in text:
	        if not is_zh_char(ch):
	            return False
	    return True


	def is_all_en(text):
	    """Return True when ``text`` is non-empty and holds only ASCII letters.

	    The check runs on the UTF-8 encoding of ``text``: ``bytes.isalpha``
	    accepts only the ASCII letters, so digits, spaces, punctuation and any
	    non-ASCII character make the result False.
	    """
	    utf8_bytes = text.encode('utf-8')
	    return utf8_bytes.isalpha()


	def is_digit_char(uchar):
	    """Return True when ``uchar`` is exactly one ASCII digit (``0``-``9``).

	    Args:
	        uchar: the string to test; expected to hold a single character.

	    Returns:
	        True only for a one-character string taken from ``0123456789``;
	        False for anything else, including the empty string, longer
	        strings and non-ASCII digits such as full-width ones.
	    """
	    # ``in`` on a string is a substring test, so without the length guard
	    # the empty string and runs such as "12" would be reported as a digit.
	    return len(uchar) == 1 and uchar in "0123456789"


	def has_digit(text):
	    """Return True when at least one character of ``text`` is a digit.

	    Each character is checked with ``is_digit_char``; scanning stops at the
	    first match. An empty ``text`` yields False.
	    """
	    for ch in text:
	        if is_digit_char(ch):
	            return True
	    return False


	def is_all_digit(text):
	    """Return True when every character of ``text`` is a digit.

	    Each character is checked with ``is_digit_char``. An empty ``text``
	    yields True, since there is no character to reject.
	    """
	    for ch in text:
	        if not is_digit_char(ch):
	            return False
	    return True


	def get_digit_count(text):
	    """Count how many characters of ``text`` are ASCII digits (``0``-``9``)."""
	    return sum(1 for ch in text if ch in "0123456789")



	def has_zh_punc(text):
	    """Return True when ``text`` contains Chinese punctuation.

	    Each character is looked up in ``zh_punc`` (imported from
	    ``zhon.hanzi.punctuation``); scanning stops at the first match.
	    """
	    for ch in text:
	        if ch in zh_punc:
	            return True
	    return False



	def is_space_char(uchar):
	    """Return True when ``uchar`` is exactly one whitespace character.

	    "Whitespace" means whatever ``str.strip`` removes, which is the same
	    definition ``get_space_count`` applies. Invisible characters that are
	    not whitespace in that sense, such as U+200B ZERO WIDTH SPACE, are not
	    treated as spaces.

	    See also: https://emptycharacter.com/

	    Args:
	        uchar: the string to test; expected to hold a single character.

	    Returns:
	        True for a one-character whitespace string, False otherwise
	        (including the empty string and longer strings).
	    """
	    # The length guard keeps "" (which strips to "") from counting as a space.
	    return len(uchar) == 1 and len(uchar.strip()) == 0


	def has_space(text):
	    """Return True when at least one character of ``text`` is whitespace.

	    A character counts as whitespace when ``str.strip`` reduces it to the
	    empty string, the same rule ``get_space_count`` uses.

	    Args:
	        text: the string to scan.

	    Returns:
	        True if any character is whitespace, False otherwise (including
	        for an empty ``text``).
	    """
	    return any(len(ch.strip()) == 0 for ch in text)

	def is_all_space(text):
	    """Return True when every character of ``text`` is whitespace.

	    A character counts as whitespace when ``str.strip`` reduces it to the
	    empty string, the same rule ``get_space_count`` uses.

	    Args:
	        text: the string to scan.

	    Returns:
	        True if no character is non-whitespace. An empty ``text`` yields
	        True, in line with ``is_all_zh`` and ``is_all_digit``.
	    """
	    return all(len(ch.strip()) == 0 for ch in text)

	def get_space_count(text):
	    """Count the whitespace characters in ``text``.

	    A character is counted when ``str.strip`` reduces it to the empty
	    string.
	    """
	    return sum(1 for ch in text if len(ch.strip()) == 0)