Upload 3 files
Browse files
- SpaceGen_preprocessing.py +159 -0
- app.py +28 -0
- utils.py +71 -0
SpaceGen_preprocessing.py
ADDED
@@ -0,0 +1,159 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder


class SpaceGen_preprocessing:
    def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
        self.size = size
        self.content = content[:self.size]
        self.past_capacity = past_capacity
        self.future_capacity = future_capacity
        self.num_features = self.past_capacity + self.future_capacity + 1  # +1 for the letter itself
        self.vocabulary = []

    def create_vocabulary(self, correct_txt):
        '''
        Stores the sorted unique byte values of the given text, plus -1
        as a padding token.
        '''
        vocabulary = list({b for b in bytes(correct_txt, 'utf-8')})
        vocabulary.append(-1)
        self.vocabulary = sorted(vocabulary)
        return None

    @staticmethod
    def create_decision_vector(W: list, C: list):
        '''
        Returns the decision vector D, given the wrong vector W and the
        correct vector C. One decision per byte of W: 'K' = keep,
        'D' = delete a spurious space, 'I' = insert a missing space
        (32 is the byte value of a space).
        '''
        D = []
        w_i = 0
        c_i = 0
        while w_i < len(W):
            if W[w_i] == C[c_i]:
                D.append('K')
                w_i += 1
                c_i += 1
            elif W[w_i] == 32 and C[c_i] != 32:
                D.append('D')
                w_i += 1
            elif C[c_i] == 32 and W[w_i] != 32:
                D.append('I')
                c_i += 1
                w_i += 1
            else:
                c_i += 1
        return D

    @staticmethod
    def to_correct(W, D):
        '''
        Returns the corrected text, given the wrong vector W and the
        decision vector D.
        '''
        output_vec = []
        for i in range(len(D)):
            if D[i] == 'K':
                output_vec.append(W[i])
            elif D[i] == 'I':
                output_vec.append(32)
                output_vec.append(W[i])
            elif D[i] == 'D':
                pass  # drop the spurious space
        decoded_text = bytes(output_vec).decode()
        return decoded_text

    @staticmethod
    def to_bytes_list(text: str, encoding='UTF-8'):
        '''
        Returns the list of byte values of the given text.
        '''
        return [b for b in bytes(text, encoding)]

    @staticmethod
    def to_one_hot_df(wrong_txt, D):
        '''
        Returns a one-hot-encoded dataframe, given the wrong text and the
        decision vector D.
        '''
        df = pd.DataFrame({'letter': [l for l in wrong_txt], 'decision': D})
        encoding = OneHotEncoder()
        y_matrix = encoding.fit_transform(df[['decision']])
        onehot_df = pd.DataFrame(y_matrix.toarray(), columns=encoding.get_feature_names_out(['decision']))
        onehot_df = onehot_df.astype('int')
        example_df = pd.concat([df, onehot_df], axis=1)
        example_df = example_df.drop(['decision'], axis=1)
        return example_df

    @staticmethod
    def decode_vec(arr):
        '''
        Returns the decoded text, given a list of byte values.
        '''
        return bytes(arr).decode()

    @staticmethod
    def sliding_window_past(arr, window_size=5):
        '''
        Returns, for each position, the window of the previous `window_size`
        elements, left-padded with -1 near the start of the array.
        '''
        arr = list(arr)
        new_arr = []
        for i in range(len(arr)):
            start_window = max(0, i - window_size)
            tmp_seq = arr[start_window:i]
            if window_size - len(tmp_seq) == 0:
                new_arr.append(tmp_seq)
            else:
                new_arr.append([-1] * (window_size - len(tmp_seq)) + tmp_seq)
        return new_arr

    @staticmethod
    def sliding_window_future(arr, window_size=5):
        '''
        Returns, for each position, the window of the next `window_size`
        elements, right-padded with -1 near the end of the array.
        '''
        arr = list(arr)
        seq = []
        for i in range(len(arr)):
            p = arr[i + 1:i + window_size + 1]
            if window_size - len(p) == 0:
                seq.append(p)
            else:
                seq.append(p + [-1] * (window_size - len(p)))
        return seq

    @staticmethod
    def insert_random_spaces(text, percent=.25):
        '''
        Returns the text with spaces inserted at random positions;
        `percent` controls how many insertions are made relative to
        the text length.
        '''
        l = list(text)
        rand_indices = np.random.randint(0, len(l) + 1, int(np.round(len(l) * percent)))
        t = 1
        for i in range(len(l) + 1):
            if i in rand_indices:
                l.insert(i + t, ' ')
                t += 1
        new_txt = ''.join(l).strip()
        return new_txt

    @staticmethod
    def prob_to_decision(a):
        '''
        Returns 'I' or 'K', given a two-element probability vector.
        '''
        if a[0] > a[1]:
            return 'I'
        else:
            return 'K'
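The core of this file is the decision-vector encoding: aligning the wrong byte sequence W against the correct one C yields one decision per byte of W, and to_correct replays those decisions to recover the original text. A minimal round-trip sketch (the example strings here are made up for illustration):

from SpaceGen_preprocessing import SpaceGen_preprocessing as sp

C = sp.to_bytes_list('hello world')   # correct text
W = sp.to_bytes_list('hel loworld')   # extra space after 'hel', missing space before 'world'
D = sp.create_decision_vector(W, C)
print(D)                              # ['K', 'K', 'K', 'D', 'K', 'K', 'I', 'K', 'K', 'K', 'K']
assert sp.to_correct(W, D) == 'hello world'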
app.py
ADDED
@@ -0,0 +1,28 @@
import gradio as gr
import tensorflow as tf
from SpaceGen_preprocessing import *
from utils import *


# Path to your Keras model
model_path = "SpaceGen_Large.keras"

# Load the model
model = tf.keras.models.load_model(model_path)


def fix_space(text):
    text = clean_sentence(text)
    X = text_to_X(text)
    predictions = model.predict(X, verbose=0)
    predicted_labels = []
    for pred in predictions[0]:
        predicted_labels.append(1 if pred[1] > .5 else 0)
    fixed_text = insert_spaces(text.replace(' ', ''), find_indices(predicted_labels))
    return fixed_text


default_text = "T hel ittlegi rlra nthro ughth epa rkc has ing abut terfly."
demo = gr.Interface(fn=fix_space,
                    inputs=gr.Textbox(label="Input Text", value=default_text),
                    outputs="text")
demo.launch()
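The post-processing half of fix_space can be exercised without loading the model: the network is only needed to produce the per-position labels, where 1 means "insert a space before this character". A hand-made sketch (the labels below stand in for real predictions):

from utils import find_indices, insert_spaces

text = 'helloworld'
labels = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]  # 1 = a space goes before this character
assert insert_spaces(text, find_indices(labels)) == 'hello world'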
utils.py
ADDED
@@ -0,0 +1,71 @@
import numpy as np
import pandas as pd
import string
import re
from SpaceGen_preprocessing import SpaceGen_preprocessing as sp

max_len = 853

def text_to_X(text):
    test_text = text.replace(' ', '')
    data = pd.DataFrame([test_text], columns=["correct_sentence"])
    data['wrong_sentence'] = data['correct_sentence'].apply(lambda text: text.replace(' ', ''))
    data['bytes_correct'] = data['correct_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['bytes_wrong'] = data['wrong_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['decision'] = data[['bytes_wrong', 'bytes_correct']].apply(lambda row: sp.create_decision_vector(row['bytes_wrong'], row['bytes_correct']), axis=1)
    dec_dict = {'K': 0, 'I': 1}
    data['decision'] = data['decision'].apply(lambda dec: [dec_dict[d] for d in dec])
    data = data[data.bytes_wrong.apply(lambda bytes_wrong: len(bytes_wrong) <= 1000)]

    data['bytes_wrong_padded'] = data['bytes_wrong'].apply(lambda bytes_wrong: bytes_wrong + [0] * (max_len - len(bytes_wrong)))
    data['decision_padded'] = data['decision'].apply(lambda decision: decision + [0] * (max_len - len(decision)))
    data['bytes_wrong_padded'] = data['bytes_wrong_padded'].apply(lambda bytes_wrong: np.array(bytes_wrong))
    data['decision_padded'] = data['decision_padded'].apply(lambda decision: np.array(decision))
    data['wrong_sentence_padded'] = data['wrong_sentence'].apply(lambda wrong_sentence: wrong_sentence + '#' * (max_len - len(wrong_sentence)))
    data['bytes_wrong_one_hot'] = data['wrong_sentence_padded'].apply(one_hot_encode)
    data['bytes_wrong_one_hot'] = data['bytes_wrong_one_hot'].apply(lambda bytes_wrong: np.array(bytes_wrong))
    X = np.stack(data.bytes_wrong_one_hot)
    return X

def find_indices(lst):
    indices = []
    for idx, value in enumerate(lst):
        if value == 1:
            indices.append(idx)
    return indices

def insert_spaces(text, indices):
    result = []
    for i, char in enumerate(text):
        if i in indices:
            result.append(" ")
        result.append(char)
    return "".join(result)


def clean_sentence(sentence):
    pattern = r'[^A-Za-z#.\'!, ]'
    return re.sub(pattern, '', sentence)


def one_hot_encode(text):
    # Define the vocabulary
    vocab = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,')
    vocab_size = len(vocab)

    # Create a mapping from character to index
    char_to_index = {char: idx for idx, char in enumerate(vocab)}

    # Initialize the one-hot encoded array
    one_hot_encoded = np.zeros((len(text), vocab_size), dtype=int)

    # Convert each character to a one-hot encoded vector
    for i, char in enumerate(text):
        if char in char_to_index:  # Ensure the character is in the vocabulary
            one_hot_encoded[i, char_to_index[char]] = 1
        else:
            raise ValueError(f"Character '{char}' not in vocabulary")

    return one_hot_encoded
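As a quick check of the encoder, each character of the padded sentence maps to one row of a binary matrix with one column per vocabulary symbol (57 in total, counting both letter cases, '#', and the four punctuation marks):

from utils import one_hot_encode

m = one_hot_encode('ab#')
print(m.shape)        # (3, 57): one row per character, one column per vocab symbol
print(m[0].argmax())  # 0, since 'a' is the first vocabulary entry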