import re
import pickle

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load stopwords and initialize the lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    url = url.lower()                          # Convert to lowercase
    url = re.sub(r'https?://', '', url)        # Remove http or https
    url = re.sub(r'www\.', '', url)            # Remove www
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)    # Remove special characters
    url = re.sub(r'\s+', ' ', url).strip()     # Remove extra spaces
    tokens = word_tokenize(url)                # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)

# Function to clean and preprocess HTML data
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)       # Remove HTML tags
    html = html.lower()                        # Convert to lowercase
    html = re.sub(r'https?://', '', html)      # Remove http or https
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Remove special characters
    html = re.sub(r'\s+', ' ', html).strip()   # Remove extra spaces
    tokens = word_tokenize(html)               # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatize
    return ' '.join(tokens)

# Load the trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Define maximum sequence lengths and vocabulary size
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)
with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length,
                                   padding='post', truncating='post')

    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length,
                                    padding='post', truncating='post')

    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.6).astype(int)  # Adjust threshold if needed

    # inverse_transform expects a 1-D array, so flatten the (1, 1) prediction
    predicted_category = label_encoder.inverse_transform(new_predictions.ravel())[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"

    return predicted_category.capitalize(), predicted_probability

# Define a function to handle API calls
def api_handler(url, html):
    predicted_category, predicted_probability = predict_phishing(url, html)
    return {
        'predicted_category': predicted_category,
        'predicted_probability': predicted_probability
    }

# Create the Gradio interface for the API
interface = gr.Interface(
    fn=api_handler,
    inputs=[gr.Textbox(label="URL"), gr.Textbox(label="HTML content", lines=10)],
    outputs=gr.JSON(),
    live=False  # No need for live updates
)

# Launch the Gradio interface in API mode
interface.launch(server_name="0.0.0.0", server_port=7860, share=True)
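
# Example client call (sketch): assuming the server above is reachable at
# http://127.0.0.1:7860/ and uses the default gr.Interface endpoint name
# "/predict", the API can be exercised from a separate process with the
# official gradio_client package. The URL and HTML values are placeholders.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       "http://example.com/login",              # value for the "URL" textbox
#       "<html><body>Sign in here</body></html>",  # value for the "HTML content" textbox
#       api_name="/predict"
#   )
#   print(result)  # e.g. {'predicted_category': ..., 'predicted_probability': ...}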