cryptocalypse committed on
Commit
103c053
1 Parent(s): e865108

libs entropy and read files

Browse files
Files changed (2) hide show
  1. lib/entropy.py +131 -0
  2. lib/files.py +31 -0
lib/entropy.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import math
3
+
4
class TextProcessor:
    """Entropy measurement and heuristic sub-token discovery for one text.

    The instance keeps the text in ``self.texto``. The helpers guess a
    separator symbol, split the text on it, and look for substrings that
    the resulting chunks share (also under rotation of one of the chunks).
    """

    def __init__(self, texto):
        """Store the text to analyse.

        Args:
            texto: The string every text-level method operates on.
        """
        self.texto = texto

    def entropy(self):
        """Count symbols and compute the Shannon entropy of the text.

        Returns:
            A tuple ``(simbolos, entropia)``: ``simbolos`` maps each
            character of the text to its number of occurrences, and
            ``entropia`` is the entropy in bits per symbol (base-2
            logarithm). For an empty text the result is ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by two strings.

        When several common substrings have the maximum length, the one
        found first while scanning ``cadena1`` left to right (and, for a
        fixed start, ``cadena2`` left to right) is returned.

        Args:
            cadena1: First string.
            cadena2: Second string.

        Returns:
            The longest common substring, or ``''`` if there is none.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        for i in range(longitud1):
            # No substring starting at i can beat the current best.
            if longitud1 - i <= len(comun):
                break
            for j in range(longitud2):
                if longitud2 - j <= len(comun):
                    break
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict comparison keeps the first match among ties.
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun

    def magic_split(self):
        """Pick the symbol whose occurrences are most regularly spaced.

        For every symbol that occurs at least twice, the distances between
        consecutive occurrences are computed; the symbol's variation is
        ``max(distance) - min(distance)``. Symbols with a variation of 0
        or 1 are ignored, and the symbol with the smallest remaining
        variation is returned. Ties go to the symbol that appears first in
        the text, so the result does not depend on hash ordering.

        Returns:
            The chosen separator symbol (a one-character string).

        Raises:
            ValueError: If no symbol has a variation greater than 1.
        """
        # Insertion order of this dict is the order of first appearance.
        positions = {}
        for index, char in enumerate(self.texto):
            positions.setdefault(char, []).append(index)

        mins = {}
        for symbol, indices in positions.items():
            if len(indices) < 2:
                continue
            distances = [b - a for a, b in zip(indices, indices[1:])]
            variation = max(distances) - min(distances)
            # Variation is never negative, so this is "not 0 and not 1".
            if variation > 1:
                mins[symbol] = variation

        if not mins:
            raise ValueError(
                "magic_split: no symbol with a spacing variation greater "
                "than 1 was found in the text"
            )

        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Rotate ``string`` to the left by ``n`` positions (cyclically).

        Args:
            string: The string to rotate.
            n: Number of positions; reduced modulo ``len(string)``.

        Returns:
            The rotated string. An empty string is returned unchanged.
        """
        if not string:
            # Avoid the modulo-by-zero below.
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Find the longest substring shared by a rotated token and another.

        The lexicographically greater token is rotated; each rotation is
        compared with the other token via ``common_string``. A candidate is
        kept only if it is longer than one character, shorter than
        ``len(tokiA)``, and strictly longer than the best one so far.

        NOTE(review): the ordering is lexicographic (not by length) and
        both the rotation count and the length bound come from ``tokiA``
        whichever token is rotated, so the result depends on argument
        order. This is kept as-is because ``get_subTokens`` calls the
        method with both orders; confirm whether a length-based ordering
        was intended.

        Args:
            tokiA: First token; its length bounds the search.
            tokiB: Second token.

        Returns:
            The best common substring, or ``""`` if none qualifies.
        """
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        limite = len(tokiA)

        best_r = ""
        for i in range(limite):
            rot = self.common_string(self.rotate_string(tokA, i), tokB)
            if 1 < len(rot) < limite and len(rot) > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Collect the sub-tokens shared between chunks of the text.

        The text is split on ``spl`` and every ordered pair of distinct
        chunks is passed to ``rotate_compare``.

        Args:
            spl: Separator passed to ``str.split``.

        Returns:
            The distinct results in order of first discovery. The empty
            string is included when some pair shares no qualifying
            substring.
        """
        # Repeated chunks would only repeat identical comparisons.
        sub_tokens = list(dict.fromkeys(self.texto.split(spl)))

        # A dict gives de-duplication with a deterministic order.
        toks = {}
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks[self.rotate_compare(tok, tok2)] = None

        return list(toks)

    def tokenize(self, spliter_optimo):
        """Split each chunk of the text around its longest known sub-token.

        Args:
            spliter_optimo: Separator used to cut the text into chunks
                (typically the result of ``magic_split``).

        Returns:
            A dict mapping a chunk to ``" <head>-<token>-<tail>"``, where
            ``token`` is the longest sub-token that occurs in the chunk and
            is shorter than it, ``head`` is the text before its first
            occurrence and ``tail`` is everything after it. Chunks that
            contain no sub-token are omitted.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if tok == "":
                    continue
                # partition keeps the whole remainder in the tail, even if
                # the token occurs more than once in the chunk.
                head, found, tail = txt.partition(tok)
                if not found:
                    continue
                if len(best_split) < len(tok) < len(txt):
                    best_split = tok
                    tokenized_sentence[txt] = " " + head + "-" + tok + "-" + tail
        return tokenized_sentence
121
+
122
+
123
# Example usage:
def main(argv=None):
    """Run the demo on the text given as the first command-line argument.

    Prints the separator chosen by ``magic_split``, the result of
    ``entropy`` and the result of ``tokenize``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the text argument is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "entropy.py"
        print(f"Usage: {prog} <text>", file=sys.stderr)
        return 1

    texto_ejemplo = args[1]

    text_processor = TextProcessor(texto_ejemplo)
    spliter_optimo = text_processor.magic_split()
    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))
    return 0


# Only run the demo when executed as a script, so importing the module
# neither reads sys.argv nor prints anything.
if __name__ == "__main__":
    sys.exit(main())
131
+
lib/files.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
class TextFinder:
    """Search the files directly inside a folder for sentences containing a text."""

    def __init__(self, folder):
        """Remember the folder to search.

        Args:
            folder: Path of the directory whose regular files are scanned
                (sub-directories are not descended into).
        """
        self.folder = folder

    def find_matches(self, text):
        """Return the sentence around every occurrence of ``text``.

        A sentence is the stretch of content delimited by the nearest
        newline or ``.`` before the occurrence and the nearest newline or
        ``.`` after it; the start and end of the file also act as
        delimiters. Files are read as UTF-8 and visited in sorted name
        order; files that are not valid UTF-8 are skipped.

        Args:
            text: The substring to look for. An empty string matches
                nothing.

        Returns:
            A list with one stripped sentence per occurrence, in file
            order and then position order.
        """
        matches = []
        if not text:
            return matches

        # os.listdir returns entries in arbitrary order; sort for stable output.
        for name in sorted(os.listdir(self.folder)):
            file_path = os.path.join(self.folder, name)
            if not os.path.isfile(file_path):
                continue
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not a UTF-8 text file (e.g. binary data): nothing to search.
                continue

            index = content.find(text)
            while index != -1:
                # rfind yields -1 when no delimiter precedes the match, so
                # start + 1 == 0 and the sentence begins at the file start.
                start = max(content.rfind('\n', 0, index), content.rfind('.', 0, index))
                # Look for the closing delimiter after the match itself so a
                # '.' or newline inside ``text`` does not cut the sentence.
                after = index + len(text)
                candidates = [
                    pos
                    for pos in (content.find('\n', after), content.find('.', after))
                    if pos != -1
                ]
                end = min(candidates) if candidates else len(content)
                matches.append(content[start + 1:end].strip())
                index = content.find(text, index + 1)

        return matches
25
+
26
# Example usage: when run as a script, print every sentence found in the
# files of 'example_folder' that contains the phrase 'text_to_find'.
if __name__ == "__main__":
    print(TextFinder('example_folder').find_matches('text_to_find'))
31
+