"""Character-class predicates and counters for Chinese, English, digit and whitespace text."""

from zhon.hanzi import punctuation as zh_punc

# ASCII decimal digits recognised by the digit helpers.
_ASCII_DIGITS = "0123456789"


def is_zh_char(uchar: str) -> bool:
    """Return True if *uchar* lies in the range U+4E00..U+9FA5.

    The check is a plain string comparison against the two bounds, so the
    result is only meaningful for a single character; an empty string
    yields False.

    NOTE(review): the jieba pattern this was modelled on
    (https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48)
    matches U+4E00..U+9FD5, while this function stops at U+9FA5. The
    narrower bound is kept so existing results do not change -- confirm
    which range is intended.
    """
    return '\u4e00' <= uchar <= '\u9fa5'


def has_zh(text: str) -> bool:
    """Return True if at least one character of *text* passes is_zh_char."""
    return any(is_zh_char(ch) for ch in text)


def get_zh_count(text: str) -> int:
    """Return the number of characters in *text* that pass is_zh_char."""
    # Booleans sum as 0/1; a generator avoids building a temporary list.
    return sum(is_zh_char(ch) for ch in text)


def is_all_zh(text: str) -> bool:
    """Return True if every character of *text* passes is_zh_char.

    An empty *text* yields True (nothing fails the check).
    """
    return all(is_zh_char(ch) for ch in text)


def is_all_en(text: str) -> bool:
    """Return True if *text* is non-empty and made only of ASCII letters.

    The text is encoded to UTF-8 first because ``bytes.isalpha`` accepts
    ASCII letters only, whereas ``str.isalpha`` would also accept letters
    from other scripts (including Chinese characters).
    """
    return text.encode('utf-8').isalpha()


def is_digit_char(uchar: str) -> bool:
    """Return True if *uchar* is exactly one ASCII digit (0-9).

    The length guard matters: ``in`` on a string is a substring test, so
    without it the empty string and runs such as "34" would be accepted.
    """
    return len(uchar) == 1 and uchar in _ASCII_DIGITS


def has_digit(text: str) -> bool:
    """Return True if *text* contains at least one ASCII digit."""
    return any(is_digit_char(ch) for ch in text)


def is_all_digit(text: str) -> bool:
    """Return True if every character of *text* is an ASCII digit.

    An empty *text* yields True (nothing fails the check).
    """
    return all(is_digit_char(ch) for ch in text)


def get_digit_count(text: str) -> int:
    """Return the number of ASCII digits in *text*."""
    return sum(is_digit_char(ch) for ch in text)


def has_zh_punc(text: str) -> bool:
    """Return True if *text* contains a character from zhon's Chinese punctuation."""
    return any(ch in zh_punc for ch in text)


def is_space_char(uchar: str) -> bool:
    """Return True if *uchar* is exactly one whitespace character.

    Whitespace is whatever ``str.isspace`` accepts, which for a single
    character is the same set that ``str.strip()`` removes -- the
    definition get_space_count has always used.

    See https://emptycharacter.com/ for a list of blank-looking characters.
    Those that Unicode does not classify as whitespace (for example the
    zero-width space U+200B) are not treated as spaces here.
    """
    return len(uchar) == 1 and uchar.isspace()


def has_space(text: str) -> bool:
    """Return True if *text* contains at least one whitespace character."""
    return any(is_space_char(ch) for ch in text)


def is_all_space(text: str) -> bool:
    """Return True if every character of *text* is whitespace.

    An empty *text* yields True, matching is_all_zh and is_all_digit.
    """
    return all(is_space_char(ch) for ch in text)


def get_space_count(text: str) -> int:
    """Return the number of whitespace characters in *text*."""
    return sum(is_space_char(ch) for ch in text)