import math


class TextProcessor:
    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        """Return (symbol counts, Shannon entropy in bits per character)."""
        simbolos = {}
        total_caracteres = len(self.texto)
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1
        entropia = 0.0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest common substring of the two strings."""
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        subcadenas_comunes = []
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > 0:
                    subcadenas_comunes.append(cadena1[i:i + k])
        if subcadenas_comunes:
            comun = max(subcadenas_comunes, key=len)
        return comun

    def magic_split(self):
        """Pick the symbol whose gaps between occurrences vary the least.

        Returns None when no symbol has a usable gap variation.
        """
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i]
                             for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances
        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items()
                     if distances}
        # Keep only symbols whose gap variation is neither 0 nor 1.
        mins = {s: v for s, v in variation.items() if v not in (0, 1)}
        if not mins:  # avoid min() on an empty dict
            return None
        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Rotate the string left by n positions."""
        if not string:  # avoid modulo by zero on empty input
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Longest common substring found across all rotations of the longer token."""
        # Compare by length, not lexicographically, so tokA is the longer token.
        if len(tokiA) >= len(tokiB):
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokA)
        rotations = {}
        for i in range(ltokA):
            tokrotated = self.rotate_string(tokA, i)
            rotations[i] = self.common_string(tokrotated, tokB)
        best_r = ""
        for rot in rotations.values():
            lrot = len(rot)
            if 1 < lrot < ltokA and lrot > len(best_r):
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        """Collect candidate sub-tokens shared by pairs of chunks split on spl."""
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))
        return list(set(toks))

    def tokenize(self, spliter_optimo):
        """Mark the best inner token of each chunk as 'prefix-token-suffix'."""
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if tok != "":
                    spltxt = txt.split(tok)
                    # Prefer the longest token that actually splits this chunk.
                    if (len(spltxt) > 1 and len(tok) < len(txt)
                            and len(tok) > len(best_split)):
                        best_split = tok
                        tokenized_sentence[txt] = (
                            " " + spltxt[0] + "-" + tok + "-" + spltxt[1])
        return tokenized_sentence
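

# A minimal usage sketch; the sample sentence and the printed fields below are
# illustrative assumptions, not part of the original module. magic_split()
# picks the character whose occurrence gaps vary the least, and tokenize()
# then marks the best inner token of each chunk with dashes.
if __name__ == "__main__":
    sample = "the cat sat on the mat and the cat ran"
    tp = TextProcessor(sample)
    counts, bits = tp.entropy()
    print(f"{len(counts)} distinct symbols, entropy {bits:.3f} bits/char")
    spl = tp.magic_split()
    if spl is not None:  # magic_split() returns None when no symbol qualifies
        print(f"optimal splitter: {spl!r}")
        print(tp.tokenize(spl))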