import math

class TextProcessor:
    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        """Return (symbol counts, Shannon entropy in bits) for the text."""
        simbolos = {}
        total_caracteres = len(self.texto)
        if total_caracteres == 0:
            return simbolos, 0.0

        # Count how often each character occurs.
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        # H = -sum(p * log2(p)) over the empirical symbol probabilities.
        entropia = 0.0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest common substring of cadena1 and cadena2."""
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        # Brute-force scan: extend a match from every pair of start
        # offsets and keep the longest run seen so far.
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun

    def magic_split(self):
        """Return the symbol whose occurrence gaps have the smallest
        non-trivial spread (greater than 1), or None if none qualifies."""
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances

        # Spread between the largest and smallest gap for each symbol.
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items()}

        # Keep only symbols whose spread is meaningful (not 0 or 1).
        mins = {v: variation[v] for v in variation
                if variation[v] != 0 and variation[v] != 1}
        if not mins:
            return None  # no repeated symbol with a usable gap pattern

        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Rotate string left by n positions (no-op on the empty string)."""
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Try every rotation of the longer token against the shorter one
        and return the longest proper common substring found."""
        if len(tokiA) >= len(tokiB):
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokA)

        rotations = []
        for i in range(ltokA):
            tokrotated = self.rotate_string(tokA, i)
            rotations.append(self.common_string(tokrotated, tokB))

        # Keep the longest match that is more than one character long but
        # shorter than the whole rotated token.
        best_r = ""
        for rot in rotations:
            lrot = len(rot)
            if 1 < lrot < ltokA and lrot > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Split the text on spl and collect the rotation-compare matches
        between every ordered pair of distinct chunks."""
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))

        return list(set(toks))

    def tokenize(self, spliter_optimo):
        """Split the text on spliter_optimo, then mark each chunk with the
        longest sub-token that splits it, as ' left-token-right'."""
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            for tok in tokens:
                if tok != "":
                    lt = len(tok)
                    spltxt = txt.split(tok)
                    # The token must occur inside the chunk, be shorter than
                    # the chunk, and beat the best candidate found so far.
                    if len(spltxt) > 1 and lt < len(txt) and lt > len(best_split):
                        best_split = tok
                        tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
        return tokenized_sentence
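

# --- Usage sketch (not part of the original file): a minimal demo of how
# the methods above might be chained together. The sample text and the
# fallback splitter " " are illustrative assumptions only.
if __name__ == "__main__":
    texto = "the cat sat on the mat and the cat ran off the mat"
    processor = TextProcessor(texto)

    simbolos, entropia = processor.entropy()
    print("symbol counts:", simbolos)
    print("entropy (bits):", entropia)

    # magic_split may return None when no symbol has a usable gap pattern,
    # so fall back to a plain space for the demo.
    spliter_optimo = processor.magic_split() or " "
    print("chosen splitter:", repr(spliter_optimo))
    print("tokenized:", processor.tokenize(spliter_optimo))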