# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from typing import List, Optional, Union

from .token import Token


class Tokenizer:
    """Provides a default tokenizer implementation."""

    @staticmethod
    def default_tokenizer(  # pylint: disable=unused-argument
        text: str, locale: Optional[str] = None
    ) -> List[Token]:
        """
        Simple tokenizer that breaks on spaces and punctuation; the only normalization applied is lowercasing.

        Parameters:
        -----------

        text: The input text.

        locale: (Optional) Identifies the locale of the input text.
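
        Example (illustrative; assumes the Token model in .token exposes the start, end,
        text and normalized attributes that this method populates):

        >>> tokens = Tokenizer.default_tokenizer("Hello World!")
        >>> [(t.text, t.normalized, t.start, t.end) for t in tokens]
        [('Hello', 'hello', 0, 4), ('World', 'world', 6, 10)]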
        """
        tokens: List[Token] = []
        token: Union[Token, None] = None

        # Parse text
        length: int = len(text) if text else 0
        i: int = 0

        while i < length:
            # Get the Unicode code point of the current character. Python strings index by
            # code point, so text[i] is already the complete character.
            code_point = ord(text[i])
            char = chr(code_point)

            # Process current character
            if Tokenizer._is_breaking_char(code_point):
                # Character is in Unicode Plane 0 and is in an excluded block
                Tokenizer._append_token(tokens, token, i - 1)
                token = None
            elif code_point > 0xFFFF:
                # Character is in a Supplementary Unicode Plane. This is where emoji live so
                # we're going to just break each character in this range out as its own token
                Tokenizer._append_token(tokens, token, i - 1)
                token = None
                tokens.append(Token(start=i, end=i, text=char, normalized=char))
            elif token is None:
                # Start a new token
                token = Token(start=i, end=0, text=char, normalized=None)
            else:
                # Add onto current token
                token.text += char

            i += 1

        Tokenizer._append_token(tokens, token, length - 1)

        return tokens

    @staticmethod
    def _is_breaking_char(code_point: int) -> bool:
        # A code point breaks the current token if it falls in one of these ranges:
        return (
            Tokenizer._is_between(code_point, 0x0000, 0x002F)  # controls, space, ! to /
            or Tokenizer._is_between(code_point, 0x003A, 0x0040)  # : to @
            or Tokenizer._is_between(code_point, 0x005B, 0x0060)  # [ to `
            or Tokenizer._is_between(code_point, 0x007B, 0x00BF)  # { to DEL, C1, Latin-1 punctuation
            or Tokenizer._is_between(code_point, 0x02B9, 0x036F)  # modifier letters, combining marks
            or Tokenizer._is_between(code_point, 0x2000, 0x2BFF)  # general punctuation to symbols/arrows
            or Tokenizer._is_between(code_point, 0x2E00, 0x2E7F)  # supplemental punctuation
        )

    @staticmethod
    def _is_between(value: int, from_val: int, to_val: int) -> bool:
        """
        Parameters:
        -----------

        value: number value

        from: low range

        to: high range
        """
        return from_val <= value <= to_val

    @staticmethod
    def _append_token(tokens: List[Token], token: Union[Token, None], end: int) -> None:
        if token is not None:
            token.end = end
            token.normalized = token.text.lower()
            tokens.append(token)
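

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the Token model
    # stores start/end/text/normalized as attributes, as the constructor calls above
    # suggest, and it must run in package context because of the relative import
    # (e.g. `python -m <package>.tokenizer`, where the package path is an assumption).
    for sample in ["Hello World!", "Chickens, ducks & geese."]:
        print(sample)
        for tok in Tokenizer.default_tokenizer(sample):
            print(f"  [{tok.start}-{tok.end}] {tok.text!r} -> {tok.normalized!r}")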