tokenizer-arena / utils /text_util.py
eson's picture
remove vocabs; update compression_app; add character_app;
2bd606a
"""
char_
"""
def detect_lang_from_unicode():
pass
def is_digit_char(uchar):
return uchar in "0123456789"
def contains_digit(text):
return any(is_digit_char(ch) for ch in text)
def get_digit_count(text):
pass
def is_all_digit(text):
return all(is_digit_char(char) for char in text)
def get_digit_count(text):
digit_count = 0
for char in text:
if char in "0123456789":
digit_count += 1
return digit_count
def is_space_char(uchar):
"""
https://emptycharacter.com/
"""
def has_space(text):
pass
def is_all_space(text):
pass
def get_space_count(text):
space_count = 0
for char in text:
if len(char.strip()) == 0:
space_count += 1
return space_count