Spaces:

torahCodes
/

Torah_Codes

Runtime error

File size: 3,995 Bytes

103c053

import sys
import math

class TextProcessor:
    """Heuristic, statistics-driven tokenizer for a single text.

    The text is cut into chunks at a separator symbol (``magic_split`` can
    propose one); ``tokenize`` then marks, inside each chunk, the longest
    substring that the chunk shares with a rotation of another chunk.
    """

    def __init__(self, texto):
        # Raw text that every method of the instance operates on.
        self.texto = texto

    def entropy(self):
        """Return the symbol counts and the Shannon entropy of the text.

        Returns:
            A ``(counts, entropy)`` tuple. ``counts`` maps each character to
            its number of occurrences; ``entropy`` is in bits per character.
            An empty text yields ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest contiguous substring shared by two strings.

        When several matches have the same length, the one starting earliest
        in ``cadena1`` wins. Returns ``''`` when the strings share nothing.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        for i in range(longitud1):
            # No match starting at or after i can be longer than what is
            # left of cadena1, so stop once the current best cannot be beaten.
            if longitud1 - i <= len(comun):
                break
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict ">" keeps the first of several equally long matches.
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun

    def magic_split(self):
        """Choose a separator symbol from the spacing of repeated symbols.

        For every symbol that occurs at least twice, the gaps between
        consecutive occurrences are measured and the symbol's *variation* is
        ``max(gap) - min(gap)``. Symbols with a variation of 0 or 1 are
        discarded; the symbol with the smallest remaining variation is
        returned. Ties are resolved in favour of the symbol that appears
        first in the text, so the result is reproducible across runs.

        Raises:
            ValueError: if no symbol has a variation greater than 1 (for
                example when the text is empty or very short).
        """
        ultimo_indice = {}   # symbol -> index of its latest occurrence
        rango_saltos = {}    # symbol -> (smallest gap, largest gap)
        for indice, simbolo in enumerate(self.texto):
            if simbolo in ultimo_indice:
                salto = indice - ultimo_indice[simbolo]
                if simbolo in rango_saltos:
                    menor, mayor = rango_saltos[simbolo]
                    rango_saltos[simbolo] = (min(menor, salto), max(mayor, salto))
                else:
                    rango_saltos[simbolo] = (salto, salto)
            ultimo_indice[simbolo] = indice

        mins = {}
        # ultimo_indice is keyed in first-appearance order, which makes the
        # tie-break of min() below deterministic.
        for simbolo in ultimo_indice:
            if simbolo in rango_saltos:
                menor, mayor = rango_saltos[simbolo]
                if mayor - menor > 1:
                    mins[simbolo] = mayor - menor

        if not mins:
            raise ValueError(
                "magic_split: no repeated symbol has a gap variation greater "
                "than 1, so no splitter can be chosen"
            )

        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions.

        ``n`` is taken modulo the length of the string; an empty string is
        returned unchanged.
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Return the longest substring shared by ``tokiB``/``tokiA`` rotations.

        The lexicographically greater of the two tokens is rotated left by
        ``0 .. len(tokiA) - 1`` positions and each rotation is compared with
        the other token via ``common_string``. The first longest match whose
        length is greater than 1 and smaller than ``len(tokiA)`` is returned,
        or ``''`` when there is none.
        """
        # NOTE(review): ">=" on strings is a lexicographic comparison, not a
        # length comparison, and the rotation bound below is len(tokiA)
        # whichever token ends up being rotated. Both are preserved as-is;
        # confirm whether "rotate the longer token" was the intent.
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokiA)

        best_r = ""
        for i in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, i), tokB)
            if 1 < len(rot) < ltokA and len(rot) > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Return the distinct shared substrings between chunks of the text.

        The text is split on ``spl`` and ``rotate_compare`` is applied to
        every ordered pair of different chunks. The distinct results are
        returned in order of first appearance (the list may contain ``''``
        when some pair shares nothing usable).
        """
        # dict.fromkeys de-duplicates while keeping order (dicts preserve
        # insertion order), unlike set(), whose order varies between runs.
        sub_tokens = list(dict.fromkeys(self.texto.split(spl)))
        toks = {}
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks[self.rotate_compare(tok, tok2)] = None

        return list(toks)

    def tokenize(self, spliter_optimo):
        """Mark the longest known sub-token inside each chunk of the text.

        The text is split on ``spliter_optimo``. For each chunk, the longest
        token from ``get_subTokens`` that occurs in the chunk and is shorter
        than it is selected, and the chunk is rendered as
        ``" <before>-<token>-<after>"`` around the first occurrence.

        Returns:
            A dict mapping each chunk that contains such a token to its
            rendered form. Chunks without a matching token are omitted.
        """
        tokens = [tok for tok in self.get_subTokens(spliter_optimo) if tok != ""]
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if not (len(best_split) < len(tok) < len(txt)):
                    continue
                # partition() cuts at the first occurrence only, so the whole
                # remainder is kept even when the token occurs more than once.
                antes, encontrado, despues = txt.partition(tok)
                if encontrado:
                    best_split = tok
                    tokenized_sentence[txt] = " " + antes + "-" + tok + "-" + despues
        return tokenized_sentence


def main(argv=None):
    """Command-line entry point: analyse the text given as first argument.

    Prints the chosen splitter symbol, the ``entropy()`` result and the
    ``tokenize()`` result for that text.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when the text argument is
        missing, 1 when no splitter symbol can be determined.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print("usage: " + sys.argv[0] + " TEXT", file=sys.stderr)
        return 2

    texto_ejemplo = args[0]
    text_processor = TextProcessor(texto_ejemplo)
    try:
        spliter_optimo = text_processor.magic_split()
    except ValueError as exc:
        # magic_split raises ValueError when no symbol qualifies as splitter.
        print("error:", exc, file=sys.stderr)
        return 1

    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))
    return 0


# Run the example only when executed as a script, so that importing this
# module has no side effects and does not depend on sys.argv.
if __name__ == "__main__":
    sys.exit(main())