Spaces:

asafd60
/

SpaceGen

Sleeping

App Files Files Community

asafd60 committed on Sep 13

Commit

b7937ac

verified ·

1 Parent(s): 36bb0a8

Upload 15 files

Browse files

Files changed (16) hide show

.gitattributes +3 -0
SpaceGen/SpaceGen_Large.keras +3 -0
SpaceGen/__init__.py +2 -0
SpaceGen/model.pth +3 -0
SpaceGen/model.py +36 -0
SpaceGen/preprocessor.py +159 -0
SpaceGen/utils.py +72 -0
app.py +28 -0
images/logo.png +0 -0
images/old.jpg +3 -0
images/retro-space.jpg +3 -0
main.html +25 -0
requirements.txt +9 -0
runtime.txt +1 -0
script.js +21 -0
style98.css +45 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/old.jpg filter=lfs diff=lfs merge=lfs -text
+images/retro-space.jpg filter=lfs diff=lfs merge=lfs -text
+SpaceGen/SpaceGen_Large.keras filter=lfs diff=lfs merge=lfs -text

SpaceGen/SpaceGen_Large.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:094bdd0ba837d99e33de2e8f03bf1d387e2102f57f95ff50e1ba6bd4f325e4cb
+size 27206663

SpaceGen/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .preprocessor import Preprocessor
2	+ from .utils import *

SpaceGen/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:163e5fec68f52254f5c0b7b3001058c5672a6290f030b134b5d5b41266e6bfa9
+size 9091744

SpaceGen/model.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import tensorflow as tf
+from .preprocessor import *
+from .utils import *
+# Default path of the serialized Keras model. It is a relative path, so it is
+# resolved against the working directory the process is started from.
+MODEL_PATH = "SpaceGen/SpaceGen_Large.keras"
+# Compatibility shim: accept and ignore deprecated/unknown 'time_major'
+class LSTMCompat(tf.keras.layers.LSTM):
+    def __init__(self, *args, time_major=None, **kwargs):
+        super().__init__(*args, **kwargs)
+def _load_model(path: str):
+    """Load a Keras model, tolerating legacy 'time_major' in LSTM configs."""
+    try:
+        return tf.keras.models.load_model(path)
+    except ValueError as e:
+        if "time_major" in str(e):
+            return tf.keras.models.load_model(path, custom_objects={"LSTM": LSTMCompat})
+        raise
+class SpaceGenModel:
+    def __init__(self, model_path: str = MODEL_PATH):
+        self.model = _load_model(model_path)
+    def fix_space(self, text: str) -> str:
+        text = clean_sentence(text)
+        X = text_to_X(text)
+        predictions = self.model.predict(X, verbose=0)
+        predicted_labels = [1 if pred[1] > 0.5 else 0 for pred in predictions[0]]
+        fixed_text = insert_spaces(text.replace(" ", ""), find_indices(predicted_labels))
+        return fixed_text

SpaceGen/preprocessor.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+class Preprocessor:
+  def __init__(self, content = "helloworld", size= 10, past_capacity = 5 , future_capacity = 5):
+    self.size = size
+    self.content = content[:self.size]
+    self.past_capacity = past_capacity
+    self.future_capacity = future_capacity
+    self.num_features = self.past_capacity + self.future_capacity + 1 # 1 for letter
+    self.vocabulary = []
+  def create_vocabulary(self, correct_txt):
+    '''
+    Returns the unique letters of the given text + '-1'
+    '''
+    vocabulary = list({b for b in bytes(correct_txt, 'utf-8')})
+    vocabulary.append(-1)
+    vocabulary = sorted(vocabulary)
+    self.vocabulary = vocabulary
+    return None
+  @staticmethod
+  def create_decision_vector(W: list, C: list):
+    '''
+    Returns the Decision Vector(D),
+    given Wrong Vector(W) and Correct Vector(C)
+    '''
+    D = []
+    w_i = 0
+    c_i = 0
+    while w_i < len(W):
+      if W[w_i] == C[c_i]:
+          D.append('K')
+          w_i += 1
+          c_i += 1
+      elif W[w_i] == 32 and C[c_i] != 32 :
+          D.append('D')
+          w_i += 1
+      elif C[c_i] == 32 and W[w_i] != 32:
+          D.append('I')
+          c_i += 1
+          w_i += 1
+      else:
+          c_i += 1
+    return D
+  @staticmethod
+  def to_correct(W, D):
+      '''
+      Returns the correct text,
+      given Wrong Vector(W) and Decision Vector(D)
+      '''
+      output_vec = []
+      for i in range(0, len(D)):
+        if D[i] == 'K':
+          output_vec.append(W[i])
+        elif D[i] == 'I':
+          output_vec.append(32)
+          output_vec.append(W[i])
+        elif D[i] == 'D':
+          pass
+      decoded_text = bytes(output_vec).decode()
+      return decoded_text
+  @staticmethod
+  def to_bytes_list(text: str, encoding = 'UTF-8'):
+      '''
+      Returns the bytes list of a given text
+      '''
+      return [b for b in bytes(text, encoding)]
+  @staticmethod
+  def to_one_hot_df(wrong_txt, D):
+    '''
+    Returns the one hot encoded dataframe,
+    given Wrong Vector(W) and Decision Vector(D)
+    '''
+    df = pd.DataFrame({'letter':[l for l in wrong_txt],'decision':D})
+    encoding =  OneHotEncoder()
+    y_matrix =  encoding.fit_transform(df[['decision']])
+    onehot_df = pd.DataFrame(y_matrix.toarray(), columns = encoding.get_feature_names_out(['decision']) )
+    onehot_df = onehot_df.astype('int')
+    example_df = pd.concat([df, onehot_df], axis=1)
+    example_df =example_df.drop(['decision'], axis=1)
+    return example_df
+  @staticmethod
+  def decode_vec(arr):
+    '''
+    Returns the decoded text,
+    given the bytes list
+    '''
+    return bytes(arr).decode()
+  @staticmethod
+  def sliding_window_past(arr, window_size = 5):
+    '''
+    Returns the past sliding window of the given array and window size
+    '''
+    arr = list(arr)
+    new_arr = []
+    for i in range(len(arr)):
+      start_window = max(0, i- window_size)
+      tmp_seq = arr[start_window:i]
+      if window_size - len(tmp_seq) ==0:
+        new_arr.append(tmp_seq)
+      else:
+        new_arr.append([-1] * (window_size - len(tmp_seq)) + tmp_seq)
+    return new_arr
+  @staticmethod
+  def sliding_window_future(arr, window_size = 5):
+    '''
+    Returns the future sliding window of the given array and window size
+    '''
+    arr = list(arr)
+    seq = []
+    for i in range(len(arr)):
+      p = arr[i+1:i+window_size+1]
+      if window_size - len(p) ==0:
+        seq.append(p)
+      else:
+        seq.append(p + [-1] * (window_size - len(p)))
+    return seq
+  @staticmethod
+  def insert_random_spaces(text, percent = .25):
+    '''
+    Returns the text with random spaces inserted
+    '''
+    l = list(text)
+    rand_indices = np.random.randint(0, len(l)+1, int(np.round(len(l) * percent)))
+    print(rand_indices)
+    t = 1
+    for i in range(len(l)+1):
+      if i in rand_indices:
+          l.insert(i + t, ' ')
+          t+=1
+    new_txt = ''.join(l).strip()
+    return new_txt
+  @staticmethod
+  def prob_to_decision(a):
+    '''
+    Return I or K given probability vector
+    '''
+    if a[0] > a[1]:
+      return 'I'
+    else:
+      return 'K'

SpaceGen/utils.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import numpy as np
+import pandas as pd
+import string
+import re
+from .preprocessor import Preprocessor as sp
+# Sequence length that text_to_X pads each input to (with '#') before encoding.
+# NOTE(review): presumably the input length the model was trained with -- confirm.
+max_len = 853
+def text_to_X(text):
+    test_text = text.replace(' ', '')
+    data = pd.DataFrame([test_text], columns=["correct_sentence"])
+    data['wrong_sentence'] = data['correct_sentence'].apply(lambda text: text.replace(' ',''))
+    data['bytes_correct'] = data['correct_sentence'].apply(lambda text: sp.to_bytes_list(text))
+    data['bytes_wrong'] = data['wrong_sentence'].apply(lambda text: sp.to_bytes_list(text))
+    data['decision'] = data[['bytes_wrong','bytes_correct']].apply(lambda row: sp.create_decision_vector(row['bytes_wrong'], row['bytes_correct']), axis=1)
+    dec_dict = {'K': 0, 'I': 1}
+    data['decision'] = data['decision'].apply(lambda dec: [dec_dict[d] for d in dec])
+    data = data[data.bytes_wrong.apply(lambda bytes_wrong: len(bytes_wrong) <= 1000)]
+    lngths = [len(bytes_wrong) for bytes_wrong in data.bytes_wrong.tolist()]
+    data['bytes_wrong_padded'] = data['bytes_wrong'].apply(lambda bytes_wrong: bytes_wrong + [0]*(max_len-len(bytes_wrong)))
+    data['decision_padded'] = data['decision'].apply(lambda decision: decision + [0]*(max_len-len(decision)))
+    data['bytes_wrong_padded'] = data['bytes_wrong_padded'].apply(lambda bytes_wrong: np.array(bytes_wrong))
+    data['decision_padded'] = data['decision_padded'].apply(lambda decision: np.array(decision))
+    data['wrong_sentence_padded'] = data['wrong_sentence'].apply(lambda wrong_sentence: wrong_sentence + '#'*(max_len-len(wrong_sentence)))
+    data['bytes_wrong_one_hot'] = data['wrong_sentence_padded'].apply(one_hot_encode)
+    data['bytes_wrong_one_hot'] = data['bytes_wrong_one_hot'].apply(lambda bytes_wrong: np.array(bytes_wrong))
+    X = np.stack(data.bytes_wrong_one_hot)
+    return X
+def find_indices(lst):
+    indices = []
+    for idx, value in enumerate(lst):
+        if value == 1:
+            indices.append(idx)
+    return indices
+def insert_spaces(text, indices):
+    result = []
+    for i, char in enumerate(text):
+        if i in indices:
+            result.append(" ")
+        result.append(char)
+    return "".join(result)
+def clean_sentence(sentence):
+  pattern = r'[^A-Za-z#.\'!, ]'
+  return re.sub(pattern, '', sentence)
+import numpy as np
+def one_hot_encode(text):
+    # Define the vocabulary
+    vocab = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,')
+    vocab_size = len(vocab)
+    # Create a mapping from character to index
+    char_to_index = {char: idx for idx, char in enumerate(vocab)}
+    # Initialize the one-hot encoded array
+    one_hot_encoded = np.zeros((len(text), vocab_size), dtype=int)
+    # Convert each character to one-hot encoded vector
+    for i, char in enumerate(text):
+        if char in char_to_index:  # Ensure character is in the vocabulary
+            one_hot_encoded[i, char_to_index[char]] = 1
+        else:
+            raise ValueError(f"Character '{char}' not in vocabulary")
+    return one_hot_encoded

app.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from flask import Flask, request, jsonify, send_from_directory
+from SpaceGen.model import SpaceGenModel
+from flask_cors import CORS
+# static_folder='.' together with static_url_path='' serves files from the
+# application directory at the site root, which is how style98.css, script.js
+# and images/ are reached.
+# NOTE(review): this presumably also exposes app.py and the model files over
+# HTTP -- confirm that is intended.
+app = Flask(__name__, static_folder='.', static_url_path='')
+# Enable CORS on the app with flask-cors defaults.
+CORS(app)
+MODEL_PATH = "SpaceGen/SpaceGen_Large.keras"
+# The model is loaded once at import time and shared by every request.
+model = SpaceGenModel(model_path=MODEL_PATH)
+@app.route('/api/data', methods=['POST'])
+def space_text():
+    if request.is_json:
+        data = request.get_json()
+        corrupted_text = data.get('corrupted_text')
+        spaced_text = model.fix_space(corrupted_text)
+        return jsonify({'spaced_text': spaced_text})
+@app.get('/')
+def index():
+    return send_from_directory('.', 'main.html')
+import os
+# Start Flask's built-in server only when this file is run directly, not when it is imported.
+if __name__ == '__main__':
+    # Port comes from the PORT environment variable, defaulting to 5000.
+    port = int(os.environ.get("PORT", 5000))
+    # 0.0.0.0 listens on every network interface rather than only on localhost.
+    app.run(host="0.0.0.0", port=port, debug=False)

images/logo.png ADDED Viewed

images/old.jpg ADDED Viewed

Git LFS Details

SHA256: fa0ad8cdf892577fb8ccb69a32f0bfa33771a0d7ebb00a760a8f34134612ac23
Pointer size: 131 Bytes
Size of remote file: 190 kB

images/retro-space.jpg ADDED Viewed

Git LFS Details

SHA256: 72d0e520db3fd1c070359cfd07b1c4a0ed9d022541289871fe10b2cb9200f53c
Pointer size: 131 Bytes
Size of remote file: 414 kB

main.html ADDED Viewed

	@@ -0,0 +1,25 @@

+<!DOCTYPE html>
+<html>
+<head>
+    <title>SpaceGen</title>
+    <link rel="stylesheet" type="text/css" href="style98.css">
+    <script src="script.js" defer></script>
+</head>
+<body>
+    <h1>SpaceGen</h1>
+    <h6>Don't mess with the SpaceGen</h6>
+    <div class="button-container">
+        <button id="aboutButton">About</button>
+    </div>
+    <div class="input-container">
+        <textarea id="userInput" rows="12" cols="50" placeholder="Enter corrupted text here">T hel ittlegi rlra nthro ughth epa rkc has ing abut terfly.</textarea>
+    </div>
+    <div class="button-container">
+        <button id="generateButton">Fix Space</button>
+    </div>
+    <div class="output-container">
+        <textarea id="outputText" rows="12" cols="50" placeholder="Gently spaced text will be outputted here..." readonly></textarea>
+    </div>
+</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+setuptools>=42
+wheel
+fastapi==0.116.1
+Flask==3.1.2
+flask-cors==6.0.1
+scikit-learn==1.6.1
+tensorflow==2.15.0
+pandas
+numpy

runtime.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-3.9.18

script.js ADDED Viewed

	@@ -0,0 +1,21 @@

+async function space_text(){
+    let corrupted_text = document.getElementById('userInput').value;
+    const response = await fetch('/api/data', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({ corrupted_text: corrupted_text })
+    });
+    const data = await response.json();
+    let result = document.getElementById('outputText');
+    result.value = data.spaced_text;
+}
+let btn = document.getElementById('generateButton').addEventListener('click', space_text);
+document.getElementById("aboutButton").addEventListener("click", () => {
+    const aboutText = `S pacege nis ana cad emicpro jec t d e       vel oped b y As af De lme di g o an d R omi Zar chid uri ng the irgrad uates tudiesin N eu ro sc ience  and  D ata Sci ence. I t d emons trat es the use of a Long Short-Term Memory arti ficial ne ural net work for the au toma tic det ection an d corre ction of miss ing and mis placed sp aces i n t ext.`;
+    document.getElementById("userInput").value = aboutText;
+}); s

style98.css ADDED Viewed

	@@ -0,0 +1,45 @@

+body {
+    background-image: url('images/retro-space.jpg');
+    background-size: cover;
+    background-repeat: no-repeat;
+    font-family: Arial, sans-serif;
+    color: white;
+    text-align: center;
+}
+h1 {
+    margin-bottom: 1px;
+    font-family: 'Courier New', Courier, monospace;
+    font-size: 42px;
+    padding-top: 1px;
+}
+h6 {
+    margin-bottom: 90px;
+    font-family: 'Courier New', Courier, monospace;
+    font-size: 14px;
+}
+.button-container {
+    margin-bottom: 35px;
+}
+button {
+    background-color: rgba(47, 28, 113, 0.5);
+    color: white;
+    font-size: 14px;
+}
+textarea {
+    padding-top: 5px;
+    margin-bottom: 25px;
+    font-size: 14px;
+    border-radius: 5px;
+    border: 1px solid #ccc;
+    background-color: rgba(0, 0, 0, 0.5);
+    color: white;
+    width: 100%;
+    max-width: 500px; /* Limit maximum width if needed */
+    box-sizing: border-box; /* Ensures padding does not affect width */
+    resize: vertical; /* Allows user to resize vertically */
+}