cohit's picture
Upload folder using huggingface_hub
0827183 verified
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from typing import List, Union

from .token import Token
class Tokenizer:
    """Provides a default tokenizer implementation."""

    @staticmethod
    def default_tokenizer(  # pylint: disable=unused-argument
        text: str, locale: Union[str, None] = None
    ) -> List[Token]:
        """
        Simple tokenizer that breaks on spaces and punctuation. The only normalization is to lowercase.

        Parameters:
        -----------
        text: The input text. `None` or an empty string produces an empty list.
        locale: (Optional) Identifies the locale of the input text. Not used by this implementation.

        Returns:
        --------
        The tokens in the order they occur in `text`. `start` and `end` are inclusive indexes
        into `text` and `normalized` is the lowercased token text. Characters above U+FFFF are
        emitted as single-character tokens whose `normalized` value is the character itself.
        """
        tokens: List[Token] = []
        if not text:
            return tokens

        # Word currently being accumulated, or None while between words.
        token: Union[Token, None] = None

        for index, char in enumerate(text):
            code_point = ord(char)

            if Tokenizer._is_breaking_char(code_point):
                # Separator: close the pending word (if any) at the previous index and
                # drop the separator itself.
                Tokenizer._append_token(tokens, token, index - 1)
                token = None
            elif code_point > 0xFFFF:
                # Character is in a Supplementary Unicode Plane. This is where emoji live so
                # each character in this range is broken out as its own token.
                Tokenizer._append_token(tokens, token, index - 1)
                token = None
                tokens.append(Token(start=index, end=index, text=char, normalized=char))
            elif token is None:
                # Start a new token; `end` and `normalized` are filled in by _append_token.
                token = Token(start=index, end=0, text=char, normalized=None)
            else:
                # Add onto current token
                token.text += char

        # Close a word that runs up to the end of the input.
        Tokenizer._append_token(tokens, token, len(text) - 1)
        return tokens

    @staticmethod
    def _is_breaking_char(code_point: int) -> bool:
        """
        Tells whether a code point separates tokens.

        Parameters:
        -----------
        code_point: Unicode code point of the character to classify.

        Returns:
        --------
        True when the code point falls in one of the inclusive ranges listed below.
        """
        return (
            # U+0000-U+002F: control characters, space and ASCII punctuation before "0"
            Tokenizer._is_between(code_point, 0x0000, 0x002F)
            # U+003A-U+0040: ":" through "@"
            or Tokenizer._is_between(code_point, 0x003A, 0x0040)
            # U+005B-U+0060: "[" through "`"
            or Tokenizer._is_between(code_point, 0x005B, 0x0060)
            # U+007B-U+00BF: "{" through U+00BF
            or Tokenizer._is_between(code_point, 0x007B, 0x00BF)
            # U+02B9-U+036F: modifier letters and combining diacritical marks
            or Tokenizer._is_between(code_point, 0x02B9, 0x036F)
            # U+2000-U+2BFF: general punctuation through miscellaneous symbols and arrows
            or Tokenizer._is_between(code_point, 0x2000, 0x2BFF)
            # U+2E00-U+2E7F: supplemental punctuation
            or Tokenizer._is_between(code_point, 0x2E00, 0x2E7F)
        )

    @staticmethod
    def _is_between(value: int, from_val: int, to_val: int) -> bool:
        """
        Tells whether a value lies inside an inclusive range.

        Parameters:
        -----------
        value: number value
        from_val: low end of the range (inclusive)
        to_val: high end of the range (inclusive)
        """
        return from_val <= value <= to_val

    @staticmethod
    def _append_token(
        tokens: List[Token], token: Union[Token, None], end: int
    ) -> None:
        """
        Finalizes a pending token and appends it to the list; does nothing when there is none.

        Parameters:
        -----------
        tokens: List that receives the finished token.
        token: The pending token, or None when no word is in progress.
        end: Inclusive index of the token's last character in the input text.
        """
        if token is not None:
            token.end = end
            token.normalized = token.text.lower()
            tokens.append(token)