|
import math
|
|
class TextProcessor:
    """Utility class: entropy stats, longest-common-substring search, and
    split-based tokenization over a given text."""

    def __init__(self, texto):
        self.texto = texto
|
    def entropy(self):
        """Return (symbol counts, Shannon entropy in bits) of the text."""
        simbolos = {}
        total_caracteres = len(self.texto)
        if total_caracteres == 0:
            return simbolos, 0.0  # avoid division by zero on empty text

        # Count occurrences of each character.
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        # H = -sum(p * log2(p)) over the symbol probabilities.
        entropia = 0.0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia
|
    def common_string(self, cadena1, cadena2):
        """Return the longest common substring of cadena1 and cadena2."""
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        # Brute force: grow a match from every pair of starting positions
        # and keep the longest match found (the first one wins on ties).
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun
|
    def magic_split(self):
        """Return the symbol whose gaps between repeats vary the least
        (excluding trivial variations of 0 or 1), or None if none qualifies."""
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                # Gaps between consecutive occurrences of the symbol.
                symbol_distances[symbol] = [indices[i + 1] - indices[i]
                                            for i in range(len(indices) - 1)]

        # Spread between the largest and smallest gap per symbol.
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items()}

        # Ignore symbols whose spread is 0 or 1; pick the smallest remaining.
        mins = {s: v for s, v in variation.items() if v not in (0, 1)}
        if not mins:
            return None  # no suitable split symbol in this text

        return min(mins, key=mins.get)
|
    def rotate_string(self, string, n):
        """Rotate string to the left by n positions."""
        if not string:
            return string  # guard against modulo-by-zero on empty input
        indice = n % len(string)
        return string[indice:] + string[:indice]
|
    def rotate_compare(self, tokiA, tokiB):
        """Try every rotation of one token against the other and return the
        longest partial match (more than 1 char, shorter than the rotated token)."""
        # '>=' compares lexicographically; the "greater" token is the one rotated.
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokA)  # length of the token being rotated

        # Longest common substring between each rotation of tokA and tokB.
        rotations = {}
        for i in range(ltokA):
            rotations[str(i)] = self.common_string(self.rotate_string(tokA, i), tokB)

        best_r = ""
        for rot in rotations.values():
            lrot = len(rot)
            if 1 < lrot < ltokA and lrot > len(best_r):
                best_r = rot

        return best_r
|
    def get_subTokens(self, spl):
        """Split the text on spl and rotate-compare every pair of distinct chunks."""
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))

        # Deduplicate the partial matches.
        return list(set(toks))
|
    def tokenize(self, spliter_optimo):
        """Split the text on spliter_optimo and, inside each chunk, mark the
        longest sub-token that splits it, as ' prefix-token-suffix'."""
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            for tok in tokens:
                if tok != "":
                    # Split on the first occurrence only, so the suffix keeps
                    # any later occurrences of the token.
                    spltxt = txt.split(tok, 1)
                    # Keep the longest token that splits the chunk without
                    # covering it completely.
                    if len(spltxt) > 1 and len(tok) < len(txt) and len(tok) > len(best_split):
                        best_split = tok
                        tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
        return tokenized_sentence
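

# Usage sketch (illustrative, not part of the module's original API): the sample
# strings below are hypothetical; any text will do. magic_split() may return
# None when no symbol qualifies, so the tokenize() call is guarded.
if __name__ == "__main__":
    tp = TextProcessor("banana bandana cabana")

    simbolos, entropia = tp.entropy()
    print("symbol counts:", simbolos)
    print("entropy (bits):", entropia)

    print("longest common substring:", tp.common_string("banana", "bandana"))
    print("rotated by 2:", tp.rotate_string("banana", 2))

    splitter = tp.magic_split()
    print("candidate splitter:", splitter)
    if splitter is not None:
        print("tokenized:", tp.tokenize(splitter))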