asafd60 committed on
Commit
c2f9838
1 Parent(s): 2938575

Upload 3 files

Browse files
Files changed (3) hide show
  1. SpaceGen_preprocessing.py +159 -0
  2. app.py +28 -0
  3. utils.py +71 -0
SpaceGen_preprocessing.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import OneHotEncoder
4
+
5
class SpaceGen_preprocessing:
    '''
    Helpers for preparing text for a space-correction model.

    Texts are handled as lists of UTF-8 byte values; 32 is the space byte.
    A "decision vector" holds one label per element of the wrong text:
    'K' (keep it), 'I' (insert a space before it) or 'D' (delete it).
    '''

    def __init__(self, content = "helloworld", size = 10, past_capacity = 5, future_capacity = 5):
        '''
        Stores the configuration.

        content: source text, truncated to the first `size` characters
        size: maximum number of characters kept from `content`
        past_capacity: number of preceding elements used as context
        future_capacity: number of following elements used as context
        '''
        self.size = size
        self.content = content[:self.size]
        self.past_capacity = past_capacity
        self.future_capacity = future_capacity
        # One feature per past element, one per future element, one for the letter itself.
        self.num_features = self.past_capacity + self.future_capacity + 1
        self.vocabulary = []

    def create_vocabulary(self, correct_txt):
        '''
        Stores in self.vocabulary the sorted unique UTF-8 byte values
        of the given text plus -1 (the sliding-window padding value).
        Returns None.
        '''
        self.vocabulary = sorted(set(correct_txt.encode('utf-8')) | {-1})
        return None

    @staticmethod
    def create_decision_vector(W: list, C: list):
        '''
        Returns the Decision Vector(D),
        given Wrong Vector(W) and Correct Vector(C)

        D has one entry per element of W:
          'K' - the element matches the current element of C
          'D' - the element is a space (32) that C does not have here
          'I' - C has a space (32) here that W is missing
        When C is exhausted before W, the remaining spaces of W are
        labelled 'D' and any other remaining element is labelled 'K',
        instead of failing with an IndexError.
        '''
        SPACE = 32
        D = []
        w_i = 0
        c_i = 0
        while w_i < len(W):
            if c_i >= len(C):
                # Nothing left in C to align with (e.g. trailing space in W).
                D.append('D' if W[w_i] == SPACE else 'K')
                w_i += 1
                continue
            w, c = W[w_i], C[c_i]
            if w == c:
                D.append('K')
                w_i += 1
                c_i += 1
            elif w == SPACE:
                # Extra space in W; C stays put.
                D.append('D')
                w_i += 1
            elif c == SPACE:
                # Space missing from W; it is recorded on the current element of W.
                D.append('I')
                c_i += 1
                w_i += 1
            else:
                # Two different non-space values: advance C only to re-align.
                c_i += 1
        return D

    @staticmethod
    def to_correct(W, D):
        '''
        Returns the correct text,
        given Wrong Vector(W) and Decision Vector(D)

        'K' copies W[i], 'I' emits a space followed by W[i], 'D' drops W[i].
        Any other label is ignored.
        '''
        output_vec = []
        for i, decision in enumerate(D):
            if decision == 'K':
                output_vec.append(W[i])
            elif decision == 'I':
                output_vec.append(32)
                output_vec.append(W[i])
        return bytes(output_vec).decode()

    @staticmethod
    def to_bytes_list(text: str, encoding = 'UTF-8'):
        '''
        Returns the bytes list of a given text
        '''
        return list(bytes(text, encoding))

    @staticmethod
    def to_one_hot_df(wrong_txt, D):
        '''
        Returns the one hot encoded dataframe,
        given Wrong Vector(W) and Decision Vector(D)

        The result has a 'letter' column plus one integer 0/1 column per
        distinct decision, named by the encoder ('decision_<label>').
        '''
        df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
        encoding = OneHotEncoder()
        y_matrix = encoding.fit_transform(df[['decision']])
        onehot_df = pd.DataFrame(
            y_matrix.toarray(),
            columns = encoding.get_feature_names_out(['decision']),
        ).astype('int')
        example_df = pd.concat([df, onehot_df], axis=1)
        return example_df.drop(['decision'], axis=1)

    @staticmethod
    def decode_vec(arr):
        '''
        Returns the decoded text,
        given the bytes list
        '''
        return bytes(arr).decode()

    @staticmethod
    def sliding_window_past(arr, window_size = 5):
        '''
        Returns the past sliding window of the given array and window size

        Entry i holds the `window_size` elements that precede position i,
        left-padded with -1 where there are not enough of them.
        '''
        arr = list(arr)
        new_arr = []
        for i in range(len(arr)):
            tmp_seq = arr[max(0, i - window_size):i]
            new_arr.append([-1] * (window_size - len(tmp_seq)) + tmp_seq)
        return new_arr

    @staticmethod
    def sliding_window_future(arr, window_size = 5):
        '''
        Returns the future sliding window of the given array and window size

        Entry i holds the `window_size` elements that follow position i,
        right-padded with -1 where there are not enough of them.
        '''
        arr = list(arr)
        seq = []
        for i in range(len(arr)):
            p = arr[i + 1:i + window_size + 1]
            seq.append(p + [-1] * (window_size - len(p)))
        return seq

    @staticmethod
    def insert_random_spaces(text, percent = .25):
        '''
        Returns the text with random spaces inserted

        round(len(text) * percent) positions are drawn with
        np.random.randint (repeats are possible, so fewer spaces may be
        inserted). Leading and trailing whitespace is stripped from the result.
        '''
        l = list(text)
        n = len(l)
        rand_indices = np.random.randint(0, n + 1, int(np.round(n * percent)))
        chosen = set(rand_indices.tolist())
        # t tracks how far earlier insertions have shifted the list.
        t = 1
        for i in range(n + 1):
            if i in chosen:
                l.insert(i + t, ' ')
                t += 1
        return ''.join(l).strip()

    @staticmethod
    def prob_to_decision(a):
        '''
        Return I or K given probability vector

        Returns 'I' when a[0] is strictly greater than a[1], otherwise 'K'.
        '''
        return 'I' if a[0] > a[1] else 'K'
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tensorflow as tf
3
+ from SpaceGen_preprocessing import *
4
+ from utils import *
5
+
6
+
7
# Path to your Keras model
model_path = "SpaceGen_Large.keras"

# Load the model
# This runs once at import time; fix_space() below reuses the loaded
# `model` object for every request.
model = tf.keras.models.load_model(model_path)
12
+
13
+
14
def fix_space(text):
    """Return `text` re-spaced according to the model's predictions.

    The input is cleaned with clean_sentence(), encoded with text_to_X()
    and passed to the module-level `model`. For the first (only) item of
    the prediction batch, every position whose second output exceeds 0.5
    is labelled 1; a space is inserted before each such position in the
    cleaned text with all of its original spaces removed.
    """
    cleaned = clean_sentence(text)
    probabilities = model.predict(text_to_X(cleaned), verbose=0)
    labels = [1 if step[1] > .5 else 0 for step in probabilities[0]]
    space_positions = find_indices(labels)
    return insert_spaces(cleaned.replace(' ', ''), space_positions)
23
+
24
# Example input with misplaced spaces, pre-filled in the input textbox.
default_text = "T hel ittlegi rlra nthro ughth epa rkc has ing abut terfly."
# Gradio UI: one text input wired to fix_space, plain-text output.
demo = gr.Interface(fn=fix_space,
                    inputs=gr.Textbox(label="Input Text", value=default_text),
                    outputs="text")
# Starts the app as soon as this module is executed.
demo.launch()
utils.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import string
4
+ import re
5
+ from SpaceGen_preprocessing import SpaceGen_preprocessing as sp
6
+
7
# Length that text_to_X() pads every input to.
# NOTE(review): presumably the sequence length the model was trained with - confirm.
max_len = 853
8
+
9
def text_to_X(text):
    """Encode `text` as a one-item batch of one-hot vectors.

    Spaces are removed, the result is right-padded with '#' up to
    `max_len` characters (longer inputs are left unpadded and are not
    truncated) and one-hot encoded with one_hot_encode().

    Returns:
        Integer numpy array of shape (1, sequence_length, vocabulary_size).

    Raises:
        ValueError: if the space-free text is longer than 1000 UTF-8
            bytes, or contains a character outside the one-hot vocabulary.
    """
    stripped = text.replace(' ', '')
    # Same 1000-byte limit as before, but reported explicitly instead of
    # surfacing as an opaque np.stack error on an empty batch.
    if len(stripped.encode('utf-8')) > 1000:
        raise ValueError("text is longer than 1000 bytes once spaces are removed")
    # A non-positive repeat count yields '', so over-long text stays as is.
    padded = stripped + '#' * (max_len - len(stripped))
    return np.stack([one_hot_encode(padded)])
30
+
31
def find_indices(lst):
    """Return the positions in `lst` whose value equals 1, in ascending order."""
    return [position for position, item in enumerate(lst) if item == 1]
37
+
38
def insert_spaces(text, indices):
    """Return `text` with a space inserted before each listed position.

    Args:
        text: the string to re-space.
        indices: positions (0-based) of the characters that should be
            preceded by a space. Positions outside the text are ignored.
    """
    # Build the lookup once: membership in a set is O(1), whereas testing
    # against the original list made the loop quadratic.
    positions = set(indices)
    pieces = []
    for i, char in enumerate(text):
        if i in positions:
            pieces.append(" ")
        pieces.append(char)
    return "".join(pieces)
45
+
46
+
47
def clean_sentence(sentence):
    """Return `sentence` with every character removed except ASCII letters,
    '#', '.', apostrophe, '!', ',' and the space."""
    disallowed = re.compile(r'[^A-Za-z#.\'!, ]')
    return disallowed.sub('', sentence)
50
+
51
+ import numpy as np
52
+
53
def one_hot_encode(text):
    """One-hot encode `text` character by character.

    Returns an integer array of shape (len(text), vocabulary_size) with a
    single 1 per row, in the column of that row's character.

    Raises:
        ValueError: if a character is not part of the vocabulary.
    """
    # Fixed vocabulary: lower-case, upper-case, then '#', '.', "'", '!', ','.
    alphabet = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,')
    column_of = dict(zip(alphabet, range(len(alphabet))))

    encoded = np.zeros((len(text), len(alphabet)), dtype=int)
    for row, symbol in enumerate(text):
        column = column_of.get(symbol)
        if column is None:
            raise ValueError(f"Character '{symbol}' not in vocabulary")
        encoded[row, column] = 1

    return encoded