|
import numpy as np
|
|
import pandas as pd
|
|
import string
|
|
import re
|
|
from SpaceGen_preprocessing import SpaceGen_preprocessing as sp
|
|
|
|
max_len = 853
|
|
|
|
def text_to_X(text):
    """Build the one-hot encoded network input for a single sentence.

    Spaces are stripped from ``text``, the remaining characters are padded
    to ``max_len`` with the '#' padding symbol, and each character is
    one-hot encoded via ``one_hot_encode``.

    Parameters
    ----------
    text : str
        Sentence to encode.  After space removal it must be at most
        ``max_len`` characters long and contain only vocabulary
        characters (letters, '#', '.', "'", '!', ',').

    Returns
    -------
    np.ndarray
        Array of shape (1, max_len, vocab_size) ready to feed the model.

    Raises
    ------
    ValueError
        If the stripped sentence is longer than ``max_len`` (nothing is
        left to stack) or contains out-of-vocabulary characters.
    """
    # Spaces are stripped up front, so 'correct_sentence' and
    # 'wrong_sentence' end up identical here; the decision columns are
    # kept only to mirror the training-time preprocessing pipeline.
    test_text = text.replace(' ', '')

    data = pd.DataFrame([test_text], columns=["correct_sentence"])

    data['wrong_sentence'] = data['correct_sentence'].apply(lambda text: text.replace(' ', ''))
    data['bytes_correct'] = data['correct_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['bytes_wrong'] = data['wrong_sentence'].apply(lambda text: sp.to_bytes_list(text))
    data['decision'] = data[['bytes_wrong', 'bytes_correct']].apply(
        lambda row: sp.create_decision_vector(row['bytes_wrong'], row['bytes_correct']), axis=1)

    # Map the keep/insert decision characters to integer class labels.
    dec_dict = {'K': 0, 'I': 1}
    data['decision'] = data['decision'].apply(lambda dec: [dec_dict[d] for d in dec])

    # BUG FIX: the bound here was previously 1000, but the padding below
    # assumes sequences no longer than max_len (853).  Any sequence of
    # length 854-1000 passed the old filter, received no padding
    # ([0] * negative == []), and broke np.stack with ragged arrays.
    data = data[data.bytes_wrong.apply(lambda bytes_wrong: len(bytes_wrong) <= max_len)]

    # Zero-pad the byte and decision sequences to a fixed length.
    data['bytes_wrong_padded'] = data['bytes_wrong'].apply(
        lambda bytes_wrong: bytes_wrong + [0] * (max_len - len(bytes_wrong)))
    data['decision_padded'] = data['decision'].apply(
        lambda decision: decision + [0] * (max_len - len(decision)))
    data['bytes_wrong_padded'] = data['bytes_wrong_padded'].apply(lambda bytes_wrong: np.array(bytes_wrong))
    data['decision_padded'] = data['decision_padded'].apply(lambda decision: np.array(decision))

    # Pad the text itself with '#' (the vocabulary's padding symbol)
    # before one-hot encoding.
    data['wrong_sentence_padded'] = data['wrong_sentence'].apply(
        lambda wrong_sentence: wrong_sentence + '#' * (max_len - len(wrong_sentence)))

    data['bytes_wrong_one_hot'] = data['wrong_sentence_padded'].apply(one_hot_encode)
    data['bytes_wrong_one_hot'] = data['bytes_wrong_one_hot'].apply(lambda bytes_wrong: np.array(bytes_wrong))

    X = np.stack(data.bytes_wrong_one_hot)

    return X
|
|
|
|
def find_indices(lst):
    """Return the positions in *lst* whose value equals 1.

    Used to turn a binary decision vector into the list of character
    positions where a space should be inserted.
    """
    return [position for position, item in enumerate(lst) if item == 1]
|
|
|
|
def insert_spaces(text, indices):
    """Return *text* with a space inserted before each position listed in *indices*."""
    pieces = (
        " " + char if position in indices else char
        for position, char in enumerate(text)
    )
    return "".join(pieces)
|
|
|
|
|
|
def clean_sentence(sentence):
    """Strip every character not in the model vocabulary.

    Keeps ASCII letters, '#', '.', apostrophe, '!', ',' and the space;
    everything else is removed.
    """
    disallowed = r'[^A-Za-z#.\'!, ]'
    return re.sub(disallowed, '', sentence)
|
|
|
|
# NOTE(review): numpy is already imported at the top of the file; this
# duplicate import is harmless but redundant.
import numpy as np
|
|
|
|
def one_hot_encode(text):
    """One-hot encode *text* character by character.

    The vocabulary is the 52 ASCII letters followed by '#', '.', "'",
    '!' and ',' (57 symbols total).

    Parameters
    ----------
    text : str
        String whose characters must all be in the vocabulary.

    Returns
    -------
    np.ndarray
        Integer array of shape (len(text), 57) with exactly one 1 per row.

    Raises
    ------
    ValueError
        If any character of *text* is not in the vocabulary.
    """
    vocab = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,')
    lookup = {char: position for position, char in enumerate(vocab)}

    encoded = np.zeros((len(text), len(vocab)), dtype=int)
    for row, char in enumerate(text):
        if char not in lookup:
            raise ValueError(f"Character '{char}' not in vocabulary")
        encoded[row, lookup[char]] = 1

    return encoded