# Spaces: Runtime error
import math
import sys
from collections import Counter
class TextProcessor:
    """Character-level statistics and heuristic sub-token segmentation.

    The instance wraps a single string (``texto``).  ``magic_split`` guesses a
    separator symbol, ``get_subTokens`` derives fragments shared between the
    separated chunks, and ``tokenize`` marks the longest such fragment inside
    each chunk.
    """

    def __init__(self, texto):
        """Store the text that the analysis methods operate on."""
        self.texto = texto

    def entropy(self):
        """Return the symbol counts and the Shannon entropy of the text.

        Returns:
            A ``(counts, entropy)`` tuple.  ``counts`` is a plain dict mapping
            each character to its number of occurrences, in order of first
            appearance.  ``entropy`` is in bits per character; for an empty
            text it is ``0`` and ``counts`` is empty.
        """
        # dict(...) keeps the return type a plain dict, as callers print it.
        simbolos = dict(Counter(self.texto))
        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by ``cadena1`` and ``cadena2``.

        When several shared substrings have the maximal length, the one that
        starts earliest in ``cadena1`` is returned.  Returns ``''`` when the
        strings share no character (including when either one is empty).
        """
        best_len = 0
        best_end = 0  # exclusive end index of the best match inside cadena1
        # prev[j] = length of the common run ending at cadena1[i-2], cadena2[j-1]
        prev = [0] * (len(cadena2) + 1)
        for i, char1 in enumerate(cadena1, 1):
            cur = [0] * (len(cadena2) + 1)
            for j, char2 in enumerate(cadena2, 1):
                if char1 == char2:
                    cur[j] = prev[j - 1] + 1
                    # Strict '>' keeps the earliest match among equal lengths.
                    if cur[j] > best_len:
                        best_len = cur[j]
                        best_end = i
            prev = cur
        return cadena1[best_end - best_len:best_end]

    def magic_split(self):
        """Guess the symbol that best works as a separator for the text.

        For every symbol occurring at least twice, the gaps between
        consecutive occurrences are measured and the spread
        ``max(gap) - min(gap)`` is computed.  Symbols whose spread is 0 or 1
        are discarded; among the rest, the symbol with the smallest spread
        wins, ties going to the symbol that appears first in the text.

        Returns:
            The chosen symbol, or ``None`` when no symbol qualifies (for
            example when no character repeats).  ``None`` is accepted by
            ``str.split`` and means "split on whitespace", so the result can
            still be passed to ``get_subTokens`` / ``tokenize``.
        """
        last_seen = {}   # symbol -> index of its most recent occurrence
        gap_range = {}   # symbol -> (smallest gap, largest gap)
        for index, symbol in enumerate(self.texto):
            if symbol in last_seen:
                gap = index - last_seen[symbol]
                low, high = gap_range.get(symbol, (gap, gap))
                gap_range[symbol] = (min(low, gap), max(high, gap))
            last_seen[symbol] = index

        # Walk symbols in first-appearance order so ties are deterministic.
        candidates = {}
        for symbol in last_seen:
            if symbol in gap_range:
                low, high = gap_range[symbol]
                spread = high - low
                if spread > 1:
                    candidates[symbol] = spread

        if not candidates:
            return None
        return min(candidates, key=candidates.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions (modulo its length).

        An empty string is returned unchanged.
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Return the longest fragment shared by two tokens under rotation.

        The token that compares greater (``>=``) is rotated left one step at a
        time and each rotation is matched against the other token with
        ``common_string``.  A fragment is accepted only if it is longer than
        one character and shorter than ``len(tokiA)``; the first longest
        accepted fragment is returned, or ``''`` when there is none.
        """
        # NOTE(review): '>=' on strings is a lexicographic comparison, not a
        # length comparison; kept as-is because results depend on it. Confirm
        # whether "rotate the longer token" was the intent.
        if tokiA >= tokiB:
            rotated_source, reference = tokiA, tokiB
        else:
            rotated_source, reference = tokiB, tokiA
        # The length bound is always taken from the first argument.
        limit = len(tokiA)

        best_r = ""
        # Rotations repeat after len(rotated_source) steps, and a repeat can
        # never beat the current best, so there is no need to go further.
        for shift in range(min(limit, len(rotated_source))):
            rot = self.common_string(
                self.rotate_string(rotated_source, shift), reference
            )
            if 1 < len(rot) < limit and len(rot) > len(best_r):
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        """Return the distinct fragments shared between chunks of the text.

        The text is split on ``spl`` and ``rotate_compare`` is applied to
        every ordered pair of different chunks.

        Args:
            spl: Separator passed to ``str.split`` (``None`` splits on
                whitespace; an empty string raises ``ValueError``).

        Returns:
            A list of distinct fragments in order of first discovery.  It may
            contain ``''`` when some pair shares no acceptable fragment.
        """
        # Duplicate chunks would only repeat identical comparisons.
        chunks = list(dict.fromkeys(self.texto.split(spl)))
        fragments = dict.fromkeys(
            self.rotate_compare(tok, tok2)
            for tok in chunks
            for tok2 in chunks
            if tok != tok2
        )
        return list(fragments)

    def tokenize(self, spliter_optimo):
        """Mark the longest known sub-token inside each chunk of the text.

        Args:
            spliter_optimo: Separator used to cut the text into chunks
                (typically the value returned by ``magic_split``).

        Returns:
            A dict mapping each chunk that contains a sub-token to the string
            ``" <before>-<token>-<after>"``, split at the first occurrence of
            the token.  Chunks containing no sub-token are omitted.
        """
        tokens = [tok for tok in self.get_subTokens(spliter_optimo) if tok != ""]
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                # Longest token strictly shorter than the chunk wins; among
                # equal lengths the first one found is kept.
                if len(best_split) < len(tok) < len(txt) and tok in txt:
                    best_split = tok
            if best_split:
                # partition keeps everything after the first occurrence, so
                # repeated tokens in a chunk do not lose text.
                before, _, after = txt.partition(best_split)
                tokenized_sentence[txt] = " " + before + "-" + best_split + "-" + after
        return tokenized_sentence
# Example usage: run the script with the text to analyse as its first argument.
def main():
    """Analyse the text given on the command line and print the results.

    Prints the guessed separator, the symbol counts with the entropy, and the
    tokenization.  Exits with a usage message when no text argument is given.
    """
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaced as a bare IndexError.
        sys.exit(f"Usage: {sys.argv[0]} TEXT")
    texto_ejemplo = sys.argv[1]
    text_processor = TextProcessor(texto_ejemplo)
    spliter_optimo = text_processor.magic_split()
    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))


# Guard so that importing this module does not run the example.
if __name__ == "__main__":
    main()