asafd60 committed on
Commit
b7937ac
·
verified ·
1 Parent(s): 36bb0a8

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/old.jpg filter=lfs diff=lfs merge=lfs -text
37
+ images/retro-space.jpg filter=lfs diff=lfs merge=lfs -text
38
+ SpaceGen/SpaceGen_Large.keras filter=lfs diff=lfs merge=lfs -text
SpaceGen/SpaceGen_Large.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:094bdd0ba837d99e33de2e8f03bf1d387e2102f57f95ff50e1ba6bd4f325e4cb
3
+ size 27206663
SpaceGen/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .preprocessor import Preprocessor
2
+ from .utils import *
SpaceGen/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:163e5fec68f52254f5c0b7b3001058c5672a6290f030b134b5d5b41266e6bfa9
3
+ size 9091744
SpaceGen/model.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ from .preprocessor import *
3
+ from .utils import *
4
+
5
+
6
+ # Path to your Keras model
7
+ MODEL_PATH = "SpaceGen/SpaceGen_Large.keras"
8
+
9
+
10
+ # Compatibility shim: accept and ignore deprecated/unknown 'time_major'
11
class LSTMCompat(tf.keras.layers.LSTM):
    """LSTM layer that accepts a ``time_major`` argument and ignores it.

    Everything else is forwarded untouched to ``tf.keras.layers.LSTM``.
    """

    def __init__(self, *args, time_major=None, **kwargs):
        # The value is deliberately discarded; it is only accepted so that
        # configs carrying the key can still construct the layer.
        del time_major
        super(LSTMCompat, self).__init__(*args, **kwargs)
14
+
15
+
16
def _load_model(path: str):
    """Load the Keras model stored at *path*.

    A plain load is attempted first. If it fails with a ``ValueError`` whose
    message mentions ``time_major``, the load is retried with the ``LSTM``
    layer mapped to ``LSTMCompat``. Any other ``ValueError`` is re-raised.
    """
    loader = tf.keras.models.load_model
    try:
        loaded = loader(path)
    except ValueError as err:
        if "time_major" not in str(err):
            raise
        # Retry with the shim standing in for the stock LSTM layer.
        loaded = loader(path, custom_objects={"LSTM": LSTMCompat})
    return loaded
24
+
25
+
26
class SpaceGenModel:
    """Wrapper around the loaded Keras model that re-spaces text."""

    def __init__(self, model_path: str = MODEL_PATH):
        """Load the model found at *model_path*."""
        self.model = _load_model(model_path)

    def fix_space(self, text: str) -> str:
        """Return *text* with its spaces re-predicted by the model.

        The text is cleaned, encoded, and run through the model. Each
        position whose second output value exceeds 0.5 is labelled 1, and a
        space is inserted in front of the character at every labelled
        position of the cleaned, space-free text.
        """
        cleaned = clean_sentence(text)
        features = text_to_X(cleaned)
        # Only the first (and only) sequence of the batch is used.
        per_position = self.model.predict(features, verbose=0)[0]
        labels = []
        for scores in per_position:
            labels.append(1 if scores[1] > 0.5 else 0)
        unspaced = cleaned.replace(" ", "")
        space_positions = find_indices(labels)
        return insert_spaces(unspaced, space_positions)
SpaceGen/preprocessor.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import OneHotEncoder
4
+
5
class Preprocessor:
    """Helpers for preparing space-correction data.

    Text is handled as lists of byte values. A "decision vector" holds one
    letter per element of the wrong text: 'K' (keep), 'I' (insert a space
    before it) or 'D' (delete it).
    """

    # Byte value of the ASCII space character.
    _SPACE_BYTE = 32

    def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
        """Store the configuration; *content* is truncated to *size* characters."""
        self.size = size
        self.content = content[:self.size]
        self.past_capacity = past_capacity
        self.future_capacity = future_capacity
        # One feature per past slot, one per future slot, plus the letter itself.
        self.num_features = self.past_capacity + self.future_capacity + 1
        self.vocabulary = []

    def create_vocabulary(self, correct_txt):
        '''
        Store in ``self.vocabulary`` the sorted unique UTF-8 byte values of
        the given text plus the value -1. Returns None.
        '''
        unique_bytes = set(bytes(correct_txt, 'utf-8'))
        unique_bytes.add(-1)
        self.vocabulary = sorted(unique_bytes)
        return None

    @staticmethod
    def create_decision_vector(W: list, C: list):
        '''
        Returns the Decision Vector(D),
        given Wrong Vector(W) and Correct Vector(C).

        D has one entry per element of W that was consumed:
        'K' when the elements match, 'D' when W has a space that C lacks,
        'I' when C has a space that W lacks.

        Spaces left over in W after C has been fully consumed are marked 'D'.
        Raises IndexError if C runs out while W still holds a non-space
        element, i.e. the two vectors differ in more than spacing.
        '''
        space = Preprocessor._SPACE_BYTE
        D = []
        w_i = 0
        c_i = 0
        while w_i < len(W):
            if c_i >= len(C):
                # C is exhausted: a remaining space in W is simply surplus.
                if W[w_i] == space:
                    D.append('D')
                    w_i += 1
                    continue
                raise IndexError(
                    "correct vector exhausted before wrong vector; "
                    "the inputs differ in more than spacing"
                )
            if W[w_i] == C[c_i]:
                D.append('K')
                w_i += 1
                c_i += 1
            elif W[w_i] == space and C[c_i] != space:
                D.append('D')
                w_i += 1
            elif C[c_i] == space and W[w_i] != space:
                D.append('I')
                c_i += 1
                w_i += 1
            else:
                # No decision for this pair; advance C to resynchronise.
                c_i += 1
        return D

    @staticmethod
    def to_correct(W, D):
        '''
        Returns the correct text,
        given Wrong Vector(W) and Decision Vector(D)
        '''
        output_vec = []
        for i, decision in enumerate(D):
            if decision == 'K':
                output_vec.append(W[i])
            elif decision == 'I':
                # Insert a space in front of the current element.
                output_vec.append(Preprocessor._SPACE_BYTE)
                output_vec.append(W[i])
            # 'D' (and anything else) contributes nothing to the output.
        return bytes(output_vec).decode()

    @staticmethod
    def to_bytes_list(text: str, encoding='UTF-8'):
        '''
        Returns the bytes list of a given text
        '''
        return list(bytes(text, encoding))

    @staticmethod
    def to_one_hot_df(wrong_txt, D):
        '''
        Returns a dataframe with one row per element of wrong_txt: a
        'letter' column followed by the one-hot encoded columns of the
        Decision Vector(D). The raw 'decision' column is dropped.
        '''
        df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
        encoding = OneHotEncoder()
        y_matrix = encoding.fit_transform(df[['decision']])
        onehot_df = pd.DataFrame(
            y_matrix.toarray(),
            columns=encoding.get_feature_names_out(['decision']),
        )
        onehot_df = onehot_df.astype('int')
        example_df = pd.concat([df, onehot_df], axis=1)
        example_df = example_df.drop(['decision'], axis=1)
        return example_df

    @staticmethod
    def decode_vec(arr):
        '''
        Returns the decoded text,
        given the bytes list
        '''
        return bytes(arr).decode()

    @staticmethod
    def sliding_window_past(arr, window_size=5):
        '''
        Returns, for every position, the window_size elements that precede
        it, left-padded with -1 where fewer are available.
        '''
        arr = list(arr)
        windows = []
        for i in range(len(arr)):
            window = arr[max(0, i - window_size):i]
            padding = [-1] * (window_size - len(window))
            windows.append(padding + window)
        return windows

    @staticmethod
    def sliding_window_future(arr, window_size=5):
        '''
        Returns, for every position, the window_size elements that follow
        it, right-padded with -1 where fewer are available.
        '''
        arr = list(arr)
        windows = []
        for i in range(len(arr)):
            window = arr[i + 1:i + window_size + 1]
            padding = [-1] * (window_size - len(window))
            windows.append(window + padding)
        return windows

    @staticmethod
    def insert_random_spaces(text, percent=.25):
        '''
        Returns the text with random spaces inserted.

        round(len(text) * percent) positions are drawn with np.random.randint;
        repeated draws of the same position insert only one space. Leading and
        trailing whitespace is stripped from the result.
        '''
        chars = list(text)
        original_len = len(chars)
        drawn = np.random.randint(0, original_len + 1, int(np.round(original_len * percent)))
        # A set gives O(1) membership tests and collapses duplicate draws.
        chosen = set(drawn.tolist())
        offset = 1
        for i in range(original_len + 1):
            if i in chosen:
                # offset accounts for the spaces already inserted before i.
                chars.insert(i + offset, ' ')
                offset += 1
        return ''.join(chars).strip()

    @staticmethod
    def prob_to_decision(a):
        '''
        Return I or K given probability vector:
        'I' when a[0] is strictly greater than a[1], otherwise 'K'.
        '''
        return 'I' if a[0] > a[1] else 'K'
SpaceGen/utils.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import string
4
+ import re
5
+ from .preprocessor import Preprocessor as sp
6
+
7
# Length that shorter inputs are padded to (with '#') before encoding.
max_len = 853

# Inputs longer than this, after spaces are removed, are rejected.
_MAX_INPUT_CHARS = 1000


def text_to_X(text):
    """Encode *text* as a one-sample batch of one-hot vectors.

    Spaces are removed, the remainder is right-padded with '#' up to
    ``max_len`` characters, and the padded string is one-hot encoded with
    ``one_hot_encode``.

    Returns an integer array of shape (1, sequence_length, vocab_size).
    Inputs between ``max_len`` and 1000 characters are not padded or
    truncated, so their sequence length is the input length.

    Raises ValueError if the space-free text is longer than 1000 characters
    or contains a character outside the one-hot vocabulary.
    """
    unspaced = text.replace(' ', '')
    if len(unspaced) > _MAX_INPUT_CHARS:
        raise ValueError(
            f"Text is too long: {len(unspaced)} characters without spaces "
            f"(maximum {_MAX_INPUT_CHARS})"
        )
    # A negative repeat count yields '', so over-length text is left as is.
    padded = unspaced + '#' * (max_len - len(unspaced))
    return np.stack([one_hot_encode(padded)])
30
+
31
def find_indices(lst):
    """Return the positions in *lst* whose value equals 1, in ascending order."""
    return [position for position, item in enumerate(lst) if item == 1]
37
+
38
def insert_spaces(text, indices):
    """Return *text* with a space inserted before each character whose
    position appears in *indices*.

    Positions beyond the end of *text* are ignored; duplicates insert a
    single space.
    """
    # Set membership keeps the loop linear instead of scanning the list
    # of indices once per character.
    positions = set(indices)
    pieces = []
    for i, char in enumerate(text):
        if i in positions:
            pieces.append(" ")
        pieces.append(char)
    return "".join(pieces)
45
+
46
+
47
def clean_sentence(sentence):
    """Strip every character that is not an ASCII letter, '#', '.', an
    apostrophe, '!', ',' or a space from *sentence*."""
    disallowed = re.compile(r'[^A-Za-z#.\'!, ]')
    return disallowed.sub('', sentence)
50
+
51
+ import numpy as np
52
+
53
def one_hot_encode(text):
    """One-hot encode *text* character by character.

    Returns an integer array of shape (len(text), vocab_size) where row i
    has a single 1 in the column of text[i]. The vocabulary is the ASCII
    letters (lowercase first) followed by '#', '.', apostrophe, '!' and ','.

    Raises ValueError on the first character outside the vocabulary.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,'
    column_of = dict(zip(alphabet, range(len(alphabet))))

    matrix = np.zeros((len(text), len(alphabet)), dtype=int)
    for row, symbol in enumerate(text):
        column = column_of.get(symbol)
        if column is None:
            raise ValueError(f"Character '{symbol}' not in vocabulary")
        matrix[row, column] = 1
    return matrix
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, send_from_directory
2
+ from SpaceGen.model import SpaceGenModel
3
+ from flask_cors import CORS
4
+
5
# NOTE(review): static_folder='.' together with static_url_path='' serves
# files from the application directory at the URL root. That is what makes
# script.js, style98.css and images/ reachable, but it also appears to expose
# app.py and the model files -- consider moving assets to a dedicated folder.
app = Flask(__name__, static_folder='.', static_url_path='')
CORS(app)
MODEL_PATH = "SpaceGen/SpaceGen_Large.keras"


# Loaded once at import time and shared by every request.
model = SpaceGenModel(model_path=MODEL_PATH)


@app.route('/api/data', methods=['POST'])
def space_text():
    """Re-space the text sent by the client.

    Expects a JSON object with a string field ``corrupted_text`` and answers
    with ``{"spaced_text": ...}``. A body that is not a JSON object, or a
    missing/non-string ``corrupted_text``, yields a 400 response with an
    ``error`` message instead of an unhandled server error.
    """
    payload = request.get_json(silent=True) if request.is_json else None
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    corrupted_text = payload.get('corrupted_text')
    if not isinstance(corrupted_text, str):
        return jsonify({'error': "'corrupted_text' must be a string"}), 400

    spaced_text = model.fix_space(corrupted_text)
    return jsonify({'spaced_text': spaced_text})
19
+
20
@app.get('/')
def index():
    """Serve main.html as the landing page."""
    page = send_from_directory('.', 'main.html')
    return page
23
+
24
+ import os
25
+
26
if __name__ == '__main__':
    # Listen on the port named by the PORT environment variable, or 5000.
    listen_port = int(os.environ.get("PORT", 5000))
    app.run(host="0.0.0.0", port=listen_port, debug=False)
images/logo.png ADDED
images/old.jpg ADDED

Git LFS Details

  • SHA256: fa0ad8cdf892577fb8ccb69a32f0bfa33771a0d7ebb00a760a8f34134612ac23
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
images/retro-space.jpg ADDED

Git LFS Details

  • SHA256: 72d0e520db3fd1c070359cfd07b1c4a0ed9d022541289871fe10b2cb9200f53c
  • Pointer size: 131 Bytes
  • Size of remote file: 414 kB
main.html ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>SpaceGen</title>
5
+ <link rel="stylesheet" type="text/css" href="style98.css">
6
+ <script src="script.js" defer></script>
7
+ </head>
8
+ <body>
9
+ <h1>SpaceGen</h1>
10
+ <h6>Don't mess with the SpaceGen</h6>
11
+ <div class="button-container">
12
+ <button id="aboutButton">About</button>
13
+ </div>
14
+ <div class="input-container">
15
+ <textarea id="userInput" rows="12" cols="50" placeholder="Enter corrupted text here">T hel ittlegi rlra nthro ughth epa rkc has ing abut terfly.</textarea>
16
+ </div>
17
+ <div class="button-container">
18
+ <button id="generateButton">Fix Space</button>
19
+ </div>
20
+ <div class="output-container">
21
+ <textarea id="outputText" rows="12" cols="50" placeholder="Gently spaced text will be outputted here..." readonly></textarea>
22
+ </div>
23
+
24
+ </body>
25
+ </html>
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ setuptools>=42
2
+ wheel
3
+ fastapi==0.116.1
4
+ Flask==3.1.2
5
+ flask-cors==6.0.1
6
+ scikit-learn==1.6.1
7
+ tensorflow==2.15.0
8
+ pandas
9
+ numpy
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.9.18
script.js ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Send the contents of #userInput to /api/data and show the re-spaced
 * text in #outputText. Network failures, non-2xx responses and malformed
 * replies are logged and reported in the output box instead of leaving it
 * stale or showing "undefined".
 */
async function space_text(){
    const corrupted_text = document.getElementById('userInput').value;
    const result = document.getElementById('outputText');

    try {
        const response = await fetch('/api/data', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ corrupted_text: corrupted_text })
        });

        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status}`);
        }

        const data = await response.json();
        if (typeof data.spaced_text !== 'string') {
            throw new Error('Response did not contain spaced_text');
        }
        result.value = data.spaced_text;
    } catch (err) {
        console.error(err);
        result.value = 'Could not fix the text. Please try again.';
    }
}
15
+
16
// Run the model on the current input when "Fix Space" is clicked.
// (addEventListener returns undefined, so its result is not stored.)
document.getElementById('generateButton').addEventListener('click', space_text);

// "About" loads a deliberately mis-spaced project description into the
// input box so it can be fixed like any other text.
document.getElementById("aboutButton").addEventListener("click", () => {
    const aboutText = `S pacege nis ana cad emicpro jec t d e vel oped b y As af De lme di g o an d R omi Zar chid uri ng the irgrad uates tudiesin N eu ro sc ience and D ata Sci ence. I t d emons trat es the use of a Long Short-Term Memory arti ficial ne ural net work for the au toma tic det ection an d corre ction of miss ing and mis placed sp aces i n t ext.`;
    document.getElementById("userInput").value = aboutText;
});
style98.css ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ background-image: url('images/retro-space.jpg');
3
+ background-size: cover;
4
+ background-repeat: no-repeat;
5
+ font-family: Arial, sans-serif;
6
+ color: white;
7
+ text-align: center;
8
+ }
9
+
10
+ h1 {
11
+ margin-bottom: 1px;
12
+ font-family: 'Courier New', Courier, monospace;
13
+ font-size: 42px;
14
+ padding-top: 1px;
15
+ }
16
+
17
+ h6 {
18
+ margin-bottom: 90px;
19
+ font-family: 'Courier New', Courier, monospace;
20
+ font-size: 14px;
21
+ }
22
+
23
+ .button-container {
24
+ margin-bottom: 35px;
25
+ }
26
+
27
+ button {
28
+ background-color: rgba(47, 28, 113, 0.5);
29
+ color: white;
30
+ font-size: 14px;
31
+ }
32
+
33
+ textarea {
34
+ padding-top: 5px;
35
+ margin-bottom: 25px;
36
+ font-size: 14px;
37
+ border-radius: 5px;
38
+ border: 1px solid #ccc;
39
+ background-color: rgba(0, 0, 0, 0.5);
40
+ color: white;
41
+ width: 100%;
42
+ max-width: 500px; /* Limit maximum width if needed */
43
+ box-sizing: border-box; /* Ensures padding does not affect width */
44
+ resize: vertical; /* Allows user to resize vertically */
45
+ }