Spaces:

vnosri
/

sophia_ai_robot_prophet

Running

App Files Files Community

sophia_ai_robot_prophet / lib /entropy.py

vnosri

sophia

8332c01 about 2 months ago

raw

history blame contribute delete

No virus

3.74 kB

	import sys
	import math

	class TextProcessor:
	    """Character-level statistics and a heuristic sub-word tokenizer for one text.

	    The instance stores the text in ``self.texto``; ``entropy``,
	    ``magic_split``, ``get_subTokens`` and ``tokenize`` read it, while the
	    remaining methods are pure helpers operating on their arguments.
	    """

	    def __init__(self, texto):
	        """Store *texto*, the string that the analysis methods operate on."""
	        self.texto = texto

	    def entropy(self):
	        """Count the symbols of the text and compute its Shannon entropy.

	        Returns:
	            A ``(counts, entropy)`` tuple: ``counts`` maps each character to
	            its number of occurrences and ``entropy`` is the base-2 entropy
	            of that distribution. An empty text yields ``({}, 0)``.
	        """
	        counts = {}
	        for symbol in self.texto:
	            counts[symbol] = counts.get(symbol, 0) + 1

	        total = len(self.texto)
	        entropia = 0
	        for count in counts.values():
	            probability = count / total
	            entropia -= probability * math.log2(probability)

	        return counts, entropia

	    def common_string(self, cadena1, cadena2):
	        """Return the longest substring shared by *cadena1* and *cadena2*.

	        When several shared substrings have the maximal length, the one that
	        starts earliest in *cadena1* is returned. Returns ``''`` when the
	        strings share no character.
	        """
	        best_len = 0
	        best_end = 0  # exclusive end index of the best match inside cadena1
	        width = len(cadena2) + 1
	        # previous[j] = length of the common run ending at cadena1[i-2] and
	        # cadena2[j-1]; only one row is kept instead of every substring.
	        previous = [0] * width
	        for i, char1 in enumerate(cadena1, start=1):
	            current = [0] * width
	            for j, char2 in enumerate(cadena2, start=1):
	                if char1 == char2:
	                    run = previous[j - 1] + 1
	                    current[j] = run
	                    # Strict comparison keeps the earliest match on ties.
	                    if run > best_len:
	                        best_len = run
	                        best_end = i
	            previous = current

	        return cadena1[best_end - best_len:best_end]

	    def magic_split(self):
	        """Pick the symbol whose spacing in the text is the most regular.

	        For every symbol occurring at least twice, the distances between
	        consecutive occurrences are collected and their spread
	        (``max - min``) is computed. Symbols with a spread of 0 or 1 are
	        ignored; among the rest the smallest spread wins, ties going to the
	        symbol that appears first in the text.

	        Returns:
	            The selected symbol (a one-character string).

	        Raises:
	            ValueError: if no symbol has a spread greater than 1.
	        """
	        last_index = {}  # insertion order == order of first appearance
	        gaps = {}
	        for index, char in enumerate(self.texto):
	            if char in last_index:
	                gaps.setdefault(char, []).append(index - last_index[char])
	            last_index[char] = index

	        spreads = {}
	        for symbol in last_index:
	            distances = gaps.get(symbol)
	            if distances:
	                spread = max(distances) - min(distances)
	                if spread > 1:
	                    spreads[symbol] = spread

	        if not spreads:
	            raise ValueError(
	                "magic_split: no symbol with a distance spread greater than 1"
	            )

	        # min() returns the first minimal key, i.e. the earliest symbol.
	        return min(spreads, key=spreads.get)

	    def rotate_string(self, string, n):
	        """Return *string* rotated left by *n* positions (modulo its length).

	        An empty string is returned unchanged instead of dividing by zero.
	        """
	        if not string:
	            return string
	        offset = n % len(string)
	        return string[offset:] + string[:offset]

	    def rotate_compare(self, tokiA, tokiB):
	        """Find the longest substring shared by a rotated token and the other.

	        The lexicographically greater token is rotated and each rotation is
	        compared with the other token through ``common_string``. A candidate
	        is kept only if it is longer than one character, shorter than
	        ``len(tokiA)`` and strictly longer than the best found so far.

	        NOTE(review): the tokens are ordered with ``>=`` on the strings
	        (lexicographic, not by length) and the rotation/length limit is
	        always ``len(tokiA)``, so the result may differ when the arguments
	        are swapped. This looks unintended, but it is preserved because
	        ``tokenize`` output depends on it -- confirm before changing.

	        Returns:
	            The selected common substring, or ``''`` if none qualifies.
	        """
	        if tokiA >= tokiB:
	            rotated_source, other = tokiA, tokiB
	        else:
	            rotated_source, other = tokiB, tokiA
	        limit = len(tokiA)

	        best = ""
	        # Rotations repeat after len(rotated_source) steps, so going further
	        # cannot produce a new candidate.
	        for shift in range(min(limit, len(rotated_source))):
	            rotated = self.rotate_string(rotated_source, shift)
	            candidate = self.common_string(rotated, other)
	            if 1 < len(candidate) < limit and len(candidate) > len(best):
	                best = candidate

	        return best

	    def get_subTokens(self, spl):
	        """Collect the substrings shared between the pieces of the text.

	        The text is split on *spl* and every ordered pair of distinct pieces
	        is passed to ``rotate_compare``.

	        Returns:
	            The distinct results in first-seen order (deterministic across
	            runs). The list may contain ``''`` when a pair shares nothing.
	        """
	        # Duplicate pieces would only repeat the same comparisons.
	        pieces = list(dict.fromkeys(self.texto.split(spl)))
	        found = {}
	        for left in pieces:
	            for right in pieces:
	                if left != right:
	                    found.setdefault(self.rotate_compare(left, right), None)

	        return list(found)

	    def tokenize(self, spliter_optimo):
	        """Mark, in each piece of the text, the longest sub-token it contains.

	        The text is split on *spliter_optimo*. For every piece, the longest
	        token from ``get_subTokens`` that occurs in the piece and is shorter
	        than it is selected, and the piece is rendered as
	        ``" <before>-<token>-<after>"`` around the first occurrence of that
	        token. Everything after the first occurrence is kept in ``<after>``,
	        including further occurrences of the token.

	        Returns:
	            A dict mapping each piece that contains a usable token to its
	            rendered form; pieces without one are omitted.
	        """
	        tokens = [tok for tok in self.get_subTokens(spliter_optimo) if tok]
	        tokenized_sentence = {}
	        for txt in self.texto.split(spliter_optimo):
	            best_len = 0
	            for tok in tokens:
	                if best_len < len(tok) < len(txt) and tok in txt:
	                    head, _, tail = txt.partition(tok)
	                    best_len = len(tok)
	                    tokenized_sentence[txt] = " " + head + "-" + tok + "-" + tail
	        return tokenized_sentence