|
import numpy as np
|
|
import pandas as pd
|
|
import string
|
|
import re
|
|
from SpaceGen_preprocessing import SpaceGen_preprocessing as sp
|
|
|
|
max_len = 853
|
|
|
|
def text_to_X(text):
    """Build the one-hot encoded network input for a single sentence.

    Spaces are stripped from ``text``, the remaining characters are padded
    to ``max_len`` with the '#' padding symbol, and each character is
    one-hot encoded via ``one_hot_encode``.

    Parameters
    ----------
    text : str
        Sentence to encode.  After space removal it must be at most
        ``max_len`` characters long and contain only vocabulary
        characters (letters, '#', '.', "'", '!', ',').

    Returns
    -------
    np.ndarray
        Array of shape (1, max_len, vocab_size) ready to feed the model.

    Raises
    ------
    ValueError
        If the stripped sentence is longer than ``max_len`` (nothing is
        left to stack) or contains out-of-vocabulary characters.
    """
    # Spaces are stripped up front, so 'correct_sentence' and
    # 'wrong_sentence' end up identical here; the decision columns are
    # kept only to mirror the training-time preprocessing pipeline.
    test_text = text.replace(' ', '')

    data = pd.DataFrame([test_text], columns=["correct_sentence"])

    data['wrong_sentence'] = data['correct_sentence'].apply(lambda text: text.replace(' ', ''))
    data['bytes_correct'] = data['correct_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['bytes_wrong'] = data['wrong_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['decision'] = data[['bytes_wrong', 'bytes_correct']].apply(
        lambda row: sp.create_decision_vector(row['bytes_wrong'], row['bytes_correct']), axis=1)

    # Map the keep/insert decision characters to integer class labels.
    dec_dict = {'K': 0, 'I': 1}
    data['decision'] = data['decision'].apply(lambda dec: [dec_dict[d] for d in dec])

    # BUG FIX: the bound here was previously 1000, but the padding below
    # assumes sequences no longer than max_len (853).  Any sequence of
    # length 854-1000 passed the old filter, received no padding
    # ([0] * negative == []), and broke np.stack with ragged arrays.
    data = data[data.bytes_wrong.apply(lambda bytes_wrong: len(bytes_wrong) <= max_len)]

    # Zero-pad the byte and decision sequences to a fixed length.
    data['bytes_wrong_padded'] = data['bytes_wrong'].apply(
        lambda bytes_wrong: bytes_wrong + [0] * (max_len - len(bytes_wrong)))
    data['decision_padded'] = data['decision'].apply(
        lambda decision: decision + [0] * (max_len - len(decision)))
    data['bytes_wrong_padded'] = data['bytes_wrong_padded'].apply(lambda bytes_wrong: np.array(bytes_wrong))
    data['decision_padded'] = data['decision_padded'].apply(lambda decision: np.array(decision))

    # Pad the text itself with '#' (the vocabulary's padding symbol)
    # before one-hot encoding.
    data['wrong_sentence_padded'] = data['wrong_sentence'].apply(
        lambda wrong_sentence: wrong_sentence + '#' * (max_len - len(wrong_sentence)))

    data['bytes_wrong_one_hot'] = data['wrong_sentence_padded'].apply(one_hot_encode)
    data['bytes_wrong_one_hot'] = data['bytes_wrong_one_hot'].apply(lambda bytes_wrong: np.array(bytes_wrong))

    X = np.stack(data.bytes_wrong_one_hot)

    return X
|
|
|
|
def find_indices(lst):
    """Return the positions in *lst* whose value equals 1.

    Used to turn a binary decision vector into the list of character
    positions where a space should be inserted.
    """
    return [position for position, item in enumerate(lst) if item == 1]
|
|
|
|
def insert_spaces(text, indices):
    """Return *text* with a space inserted before each position listed in *indices*."""
    pieces = (
        " " + char if position in indices else char
        for position, char in enumerate(text)
    )
    return "".join(pieces)
|
|
|
|
|
|
def clean_sentence(sentence):
    """Strip every character not in the model vocabulary.

    Keeps ASCII letters, '#', '.', apostrophe, '!', ',' and the space;
    everything else is removed.
    """
    disallowed = r'[^A-Za-z#.\'!, ]'
    return re.sub(disallowed, '', sentence)
|
|
|
|
# NOTE(review): numpy is already imported at the top of the file; this
# duplicate import is harmless but redundant.
import numpy as np
|
|
|
|
def one_hot_encode(text):
    """One-hot encode *text* character by character.

    The vocabulary is the 52 ASCII letters followed by '#', '.', "'",
    '!' and ',' (57 symbols total).

    Parameters
    ----------
    text : str
        String whose characters must all be in the vocabulary.

    Returns
    -------
    np.ndarray
        Integer array of shape (len(text), 57) with exactly one 1 per row.

    Raises
    ------
    ValueError
        If any character of *text* is not in the vocabulary.
    """
    vocab = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,')
    lookup = {char: position for position, char in enumerate(vocab)}

    encoded = np.zeros((len(text), len(vocab)), dtype=int)
    for row, char in enumerate(text):
        if char not in lookup:
            raise ValueError(f"Character '{char}' not in vocabulary")
        encoded[row, lookup[char]] = 1

    return encoded