# data_preprocessing.py
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Read the data
def read_data(path):
    try:
        df = pd.read_csv(path)
        if df.empty:
            print("The file is empty.")
            return None
        return df
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
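# Usage sketch for read_data (hypothetical file name, not from this repo):
#   df = read_data("transactions.csv")
#   if df is not None:
#       print(df.head())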
# Cleaning the text
def clean_text(text):
    text = text.lower()                        # Lowercase the text
    text = re.sub(r"\d+", " ", text)           # Replace digits with spaces
    text = re.sub(r"[^\w\s]", " ", text)       # Replace punctuation with spaces
    text = re.sub(r"\s+", " ", text).strip()   # Collapse runs of whitespace and trim
    return text
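# Illustrative example (assumed input, not from the dataset):
#   clean_text("Paid $12.50 at STARBUCKS #4021!") -> "paid at starbucks"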
# Main preprocessing function
def preprocess_data(file_path, max_len=10, vocab_size=250):
    # Read the data
    df = read_data(file_path)
    if df is None:
        print("Data loading failed.")
        return None, None, None, None

    # Drop rows with missing values so clean_text never receives a NaN float
    df = df.dropna(subset=['Transaction Description', 'Category'])

    # Clean the text
    df['Transaction Description'] = df['Transaction Description'].apply(clean_text)

    # Initialize the tokenizer
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(df['Transaction Description'])

    # Convert texts to sequences and pad them
    sequences = tokenizer.texts_to_sequences(df['Transaction Description'])
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    # Initialize LabelEncoder and encode the labels
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['Category'])

    return padded_sequences, labels, tokenizer, label_encoder
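# Inference-time sketch (hypothetical; assumes a trained model already produced
# predicted_id) showing how the returned tokenizer and label_encoder are reused:
#   seq = tokenizer.texts_to_sequences([clean_text("upi payment to grocery store")])
#   padded = pad_sequences(seq, maxlen=10, padding='post', truncating='post')
#   category = label_encoder.inverse_transform([predicted_id])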
# Train-test split function
def split_data(sequences, labels, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        sequences, labels, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
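# If the categories are imbalanced, a stratified split preserves class ratios;
# a sketch using the same scikit-learn API:
#   X_train, X_test, y_train, y_test = train_test_split(
#       sequences, labels, test_size=0.2, random_state=42, stratify=labels)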
# Main function to execute preprocessing
def main():
    # Path to your data file
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    # Preprocess the data
    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    # Check if preprocessing succeeded
    if sequences is not None:
        print("Data preprocessing successful!")

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = split_data(sequences, labels)
        print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
        print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
    else:
        print("Data preprocessing failed.")
# Execute the main function
if __name__ == "__main__":
    main()