|
|
|
import numpy as np |
|
import pandas as pd |
|
import re |
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.model_selection import train_test_split |
|
from tensorflow.keras.preprocessing.text import Tokenizer |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
|
|
|
def read_data(path):
    """Load a CSV file into a DataFrame.

    Args:
        path: Filesystem path to the CSV file.

    Returns:
        A pandas DataFrame, or None if the file is missing, empty,
        or unreadable (errors are reported via print, not raised).
    """
    try:
        df = pd.read_csv(path)
        if df.empty:
            # File parsed but contains no rows.
            print("The file is empty.")
            return None
        return df
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise before the df.empty
        # check can run, so handle it explicitly with the same message.
        print("The file is empty.")
        return None
    except Exception as e:
        # Top-level boundary: report anything else and signal failure.
        print(f"An error occurred: {e}")
        return None
|
|
|
|
|
def clean_text(text):
    """Normalize a transaction description for tokenization.

    Lowercases the text, replaces digits and punctuation with spaces,
    collapses runs of whitespace, and strips the ends.

    Args:
        text: The raw description. Non-string values (e.g. NaN from a
            CSV with missing cells) are coerced via str() instead of
            crashing on .lower().

    Returns:
        The cleaned, single-spaced, lowercase string.
    """
    text = str(text).lower()
    text = re.sub(r"\d+", " ", text)       # drop digits (amounts, dates, ids)
    text = re.sub(r"[^\w\s]", " ", text)   # drop punctuation/symbols
    text = re.sub(r"\s+", " ", text)       # collapse the gaps left behind
    return text.strip()
|
|
|
|
|
def preprocess_data(file_path, max_len=10, vocab_size=250):
    """Load the transaction CSV and turn it into model-ready arrays.

    Args:
        file_path: Path to the CSV; must contain 'Transaction Description'
            and 'Category' columns.
        max_len: Length every token sequence is padded/truncated to.
        vocab_size: Maximum vocabulary size for the tokenizer.

    Returns:
        (padded_sequences, labels, tokenizer, label_encoder), or
        (None, None, None, None) when the file could not be loaded.
    """
    frame = read_data(file_path)
    if frame is None:
        print("Data loading failed.")
        return None, None, None, None

    # Clean descriptions in place so downstream callers see the
    # normalized text as well.
    descriptions = frame['Transaction Description'].apply(clean_text)
    frame['Transaction Description'] = descriptions

    # Build the vocabulary, then encode and pad every description.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(descriptions)
    padded = pad_sequences(
        tokenizer.texts_to_sequences(descriptions),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

    # Map category names to integer class ids.
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(frame['Category'])

    return padded, encoded_labels, tokenizer, encoder
|
|
|
|
|
def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Split features and labels into train/test partitions.

    Args:
        sequences: Feature array (one row per transaction).
        labels: Corresponding label array.
        test_size: Fraction of samples held out for testing.
        random_state: Seed for a reproducible shuffle.

    Returns:
        (X_train, X_test, y_train, y_test) as a tuple.
    """
    parts = train_test_split(
        sequences,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test
|
|
|
|
|
def main():
    """Run the preprocessing pipeline end to end and report split shapes."""
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    # Guard clause: bail out early if loading/preprocessing failed.
    if sequences is None:
        print("Data preprocessing failed.")
        return

    print("Data preprocessing successful!")
    X_train, X_test, y_train, y_test = split_data(sequences, labels)
    print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
|
|
|
|
|
# Script entry point: only run the pipeline when executed directly,
# not when this module is imported.
if __name__ == "__main__":

    main()
|
|