"""Train a CNN malware-image classifier and save it to disk.

Cleans the dataset directory (removing corrupted images and empty class
folders), builds an 80/20 train/validation split with ImageDataGenerator,
trains a small 3-conv-block network, and saves the trained model as
``malware_classifier_lime.h5``.
"""
import os
import pickle
import shutil

import tensorflow as tf
from PIL import Image
from tensorflow.keras.layers import (
    Activation,
    Conv2D,
    Dense,
    Dropout,
    Flatten,
    MaxPooling2D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def clean_directory(directory, cache_file="cache.pkl"):
    """Remove corrupted images and empty class folders under *directory*.

    Walks the tree, verifies every image with PIL, deletes files that fail
    verification, and removes leaf directories left with no valid images.

    Args:
        directory: Root of the dataset (one leaf sub-directory per class).
        cache_file: Pickle file used to memoize the result across runs.

    Returns:
        int: Number of leaf (class) directories remaining after cleaning.

    NOTE(review): the cache is never invalidated — if the dataset changes,
    ``cache_file`` must be deleted manually to force a re-scan.
    """
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            num_classes = pickle.load(f)
        print("Loaded cached results.")
        return num_classes

    num_classes = 0
    for subdir, dirs, files in os.walk(directory):
        if not dirs:  # a leaf directory corresponds to one class folder
            num_classes += 1
            valid_files = []
            for file in files:
                file_path = os.path.join(subdir, file)
                try:
                    # BUG FIX: use a context manager so the file handle is
                    # closed before os.remove (the original leaked it until
                    # garbage collection, which can break deletion on Windows).
                    with Image.open(file_path) as img:
                        img.verify()  # Raises if the image is corrupted
                    valid_files.append(file)
                except (IOError, SyntaxError):
                    print(f"Removing corrupted file: {file_path}")
                    os.remove(file_path)

            # Drop class folders that ended up with no valid images at all.
            if not valid_files:
                print(f"Removing empty directory: {subdir}")
                shutil.rmtree(subdir)
                num_classes -= 1

    # Persist the result so subsequent runs skip the (slow) full scan.
    with open(cache_file, "wb") as f:
        pickle.dump(num_classes, f)
    print("Saved results to cache.")
    return num_classes


def _build_model(num_classes, image_size):
    """Return a compiled 3-conv-block CNN with a *num_classes*-way softmax head."""
    model = Sequential()

    # First convolution block
    model.add(Conv2D(64, (3, 3), input_shape=(*image_size, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Second convolution block
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Third convolution block
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Fully connected layers
    model.add(Flatten())
    model.add(Dense(128))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))

    # Output layer sized to the actual dataset.
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


def main():
    """Clean the dataset, train the classifier, and save the model."""
    data_dir = 'Malign/extract'
    clean_directory(data_dir)

    # Parameters
    batch_size = 32
    epochs = 50
    image_size = (200, 200)  # Desired input image size for the model

    # Data preprocessing: rescale pixels and hold out 20% for validation.
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=0.2,
    )
    train_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        subset='training',
    )
    validation_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        subset='validation',
    )

    # BUG FIX: the output layer was hard-coded to Dense(119) even though the
    # class count is computed from the dataset. Derive it from the generator
    # so the softmax head always matches the number of discovered classes.
    model = _build_model(train_generator.num_classes, image_size)
    model.summary()

    # Model training
    model.fit(
        train_generator,
        epochs=epochs,
        validation_data=validation_generator,
    )

    # Save the trained model
    model.save("malware_classifier_lime.h5")


if __name__ == "__main__":
    main()