import os
import re
import socket
import json
import traceback
import math
import logging
from datetime import datetime, timedelta
from urllib.parse import urlparse
from collections import Counter
import requests
import numpy as np
import tensorflow as tf
import pickle
import h5py # For working with H5 files
from flask import Flask, jsonify, request, render_template, session, flash, redirect, url_for, send_file
from werkzeug.middleware.proxy_fix import ProxyFix
import ssl
from sklearn.preprocessing import StandardScaler
from typing import Dict, List, Tuple, Optional, Union, Any
from difflib import SequenceMatcher
import sys
import flask
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Try to import whois for domain registration data
try:
import whois
whois_available = True
logger.info("python-whois is available for domain registration checks")
except ImportError:
whois_available = False
logger.warning("python-whois not available, domain age features will be limited")
# Import model service - using direct path instead of package import
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_service import get_model, get_scaler, get_status, predict
# Add Beautiful Soup import
try:
from bs4 import BeautifulSoup
BeautifulSoup_available = True
logger.info("BeautifulSoup is available for HTML analysis")
except ImportError:
BeautifulSoup_available = False
logger.warning("BeautifulSoup not available, HTML security checks will be limited")
BeautifulSoup = None
# Initialize Flask app
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_port=1)
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'default-secret-key')
# Global variables for model and scaler access
def get_model_instance():
return get_model()
def get_scaler_instance():
return get_scaler()
def is_ip(domain):
"""
Check if the domain is an IP address
Args:
domain (str): Domain to check
Returns:
bool: True if the domain is an IP address, False otherwise
"""
# IPv4 pattern
pattern = r"^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$"
match = re.match(pattern, domain)
if not match:
return False
# Check that each octet is valid (0-255)
for i in range(1, 5):
octet = int(match.group(i))
if octet < 0 or octet > 255:
return False
return True
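# Illustrative behavior (follows directly from the checks above): is_ip("192.168.1.1")
# returns True, while is_ip("example.com") and is_ip("999.1.1.1") both return False.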
def calculate_entropy(string):
"""
Calculate the Shannon entropy of a string to measure randomness
Args:
string (str): Input string
Returns:
float: Shannon entropy value
"""
if not string:
return 0
# Count character occurrences
counts = Counter(string)
# Calculate frequencies
frequencies = [count/len(string) for count in counts.values()]
# Calculate entropy
entropy = -sum(f * math.log2(f) for f in frequencies)
return entropy
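# Illustrative values: calculate_entropy("aaaa") is 0.0 (one repeated character), while
# calculate_entropy("x7k9qz2f") is log2(8) = 3.0 because all eight characters are distinct.
# High domain entropy is later used as a signal of random, auto-generated domain names.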
def check_suspicious_patterns(url):
"""Check for suspicious patterns in a URL that may indicate phishing"""
suspicious_patterns = []
try:
# Parse URL
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
path = parsed_url.path.lower()
query = parsed_url.query.lower()
# Check for HTTP instead of HTTPS
if parsed_url.scheme == 'http':
suspicious_patterns.append({
"pattern": "Insecure HTTP protocol",
"severity": "high",
"explanation": "The site uses HTTP instead of HTTPS, which means the connection is not encrypted.",
"risk_score": 15
})
# Check for suspicious TLDs
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq', 'top', 'xyz', 'online', 'site', 'club', 'icu', 'pw', 'rest', 'zip']
tld = domain.split('.')[-1] if '.' in domain else ''
if tld in suspicious_tlds:
suspicious_patterns.append({
"pattern": f"Suspicious TLD: '{tld}'",
"severity": "medium",
"explanation": f"The domain uses a TLD ('{tld}') that is commonly associated with free domains and frequently used in phishing attacks.",
"risk_score": 10
})
# Check for numeric subdomain or long subdomain
subdomain_parts = domain.split('.')
if len(subdomain_parts) > 2:
subdomain = '.'.join(subdomain_parts[:-2])
if subdomain.isdigit() or re.match(r'^\d+-\d+-\d+', subdomain):
suspicious_patterns.append({
"pattern": "Numeric subdomain pattern",
"severity": "medium",
"explanation": "The URL uses a numeric pattern in the subdomain, which is often seen in automatically generated phishing domains.",
"risk_score": 10
})
elif len(subdomain) > 20:
suspicious_patterns.append({
"pattern": "Unusually long subdomain",
"severity": "medium",
"explanation": "The subdomain is unusually long, which is often a characteristic of phishing URLs trying to obscure their true nature.",
"risk_score": 5
})
# Check for URL shortening services
shortening_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'is.gd',
'buff.ly', 'ow.ly', 'rebrand.ly', 'tr.im']
# Modified check to prevent false positives
is_shortener = False
domain_parts = domain.split('.')
base_domain = '.'.join(domain_parts[-2:]) if len(domain_parts) > 1 else domain
# First check exact domain match
if any(base_domain == service for service in shortening_services):
is_shortener = True
# Then check subdomain match (e.g., sub.bit.ly)
elif any(domain.endswith('.' + service) for service in shortening_services):
is_shortener = True
if is_shortener:
suspicious_patterns.append({
"pattern": "URL shortening service",
"severity": "medium",
"explanation": "The URL uses a shortening service, which can hide the actual destination.",
"risk_score": 8
})
# Check for suspicious words in URL
suspicious_words = ['login', 'signin', 'verify', 'secure', 'account', 'update', 'confirm',
'password', 'credential', 'wallet', 'authenticate', 'verification',
'banking', 'security', 'alert', 'suspended', 'unusual']
found_words = [word for word in suspicious_words if word in url.lower()]
if found_words:
words_str = ', '.join(found_words)
suspicious_patterns.append({
"pattern": f"Suspicious keywords: {words_str}",
"severity": "medium",
"explanation": f"The URL contains words often associated with phishing attempts that try to create urgency or request credentials.",
"risk_score": 12
})
# Check for IP address as domain
if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', domain):
suspicious_patterns.append({
"pattern": "IP address used as domain",
"severity": "high",
"explanation": "The URL uses an IP address instead of a domain name, which is rarely done for legitimate websites and often indicates phishing.",
"risk_score": 25
})
# Check for excessive number of dots in domain
if domain.count('.') > 3:
suspicious_patterns.append({
"pattern": "Excessive subdomains",
"severity": "medium",
"explanation": "The URL contains an unusually high number of subdomains, which can be an attempt to confuse users.",
"risk_score": 8
})
# Check for excessive URL length
if len(url) > 100:
suspicious_patterns.append({
"pattern": "Excessively long URL",
"severity": "medium",
"explanation": "The URL is unusually long, which can be an attempt to hide suspicious elements.",
"risk_score": 5
})
# Check for presence of @ symbol in URL
if '@' in url:
suspicious_patterns.append({
"pattern": "@ symbol in URL",
"severity": "high",
"explanation": "The URL contains an @ symbol, which can be used to trick users by hiding the actual destination.",
"risk_score": 20
})
# Check for excessive number of special characters
special_char_count = sum(c in '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/' for c in url)
if special_char_count > 15:
suspicious_patterns.append({
"pattern": "Excessive special characters",
"severity": "medium",
"explanation": "The URL contains an unusually high number of special characters, which can be an attempt to obfuscate malicious content.",
"risk_score": 10
})
# If no patterns were found but domain can't be resolved
if not suspicious_patterns:
try:
socket.gethostbyname(domain)
except Exception:
suspicious_patterns.append({
"pattern": "Domain does not resolve",
"severity": "high",
"explanation": "The domain cannot be resolved to an IP address, which means it may not exist or may be newly registered for phishing.",
"risk_score": 20
})
logger.info(f"Suspicious patterns found for {url}: {len(suspicious_patterns)}")
return suspicious_patterns
except Exception as e:
logger.error(f"Error checking suspicious patterns: {e}")
return []
def rule_based_prediction(url, scaled_features=None):
"""
Rule-based prediction when model is unavailable
Args:
url: URL to analyze
scaled_features: Optional feature array
Returns:
float: Risk score (0-100)
"""
try:
# Parse the URL
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
path = parsed_url.path.lower()
# Initialize risk score
risk_score = 0
risk_factors = {}
# 1. Basic protocol check (part of URL features - 40%)
if parsed_url.scheme != 'https':
risk_score += 20
risk_factors["insecure_protocol"] = {
"description": "The site uses HTTP instead of HTTPS",
"impact": "high",
"contribution": 20
}
# 2. Domain-based checks (part of URL features - 40%)
if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', domain):
# IP address as domain
risk_score += 25
risk_factors["ip_as_domain"] = {
"description": "IP address used as domain instead of a domain name",
"impact": "high",
"contribution": 25
}
# Check for suspicious TLDs
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq', 'top', 'xyz', 'online', 'site']
tld = domain.split('.')[-1] if '.' in domain else ''
if tld in suspicious_tlds:
risk_score += 15
risk_factors["suspicious_tld"] = {
"description": f"Domain uses suspicious TLD (.{tld})",
"impact": "medium",
"contribution": 15
}
# Check domain length
if len(domain) > 30:
risk_score += 10
risk_factors["long_domain"] = {
"description": "Unusually long domain name",
"impact": "medium",
"contribution": 10
}
# Check for excessive subdomains
if domain.count('.') > 3:
risk_score += 15
risk_factors["excessive_subdomains"] = {
"description": f"Domain has {domain.count('.')} subdomains",
"impact": "medium",
"contribution": 15
}
# 3. URL structure checks (part of URL features - 40%)
if len(url) > 100:
risk_score += 10
risk_factors["long_url"] = {
"description": "Excessively long URL",
"impact": "medium",
"contribution": 10
}
# Check for suspicious keywords
suspicious_words = ['login', 'signin', 'verify', 'secure', 'account', 'update', 'confirm',
'password', 'credential', 'wallet', 'authenticate', 'verification']
keyword_count = 0
for word in suspicious_words:
if word in url.lower():
keyword_count += 1
risk_score += 5
# Cap the total keyword penalty (not the overall risk score) at 30
if keyword_count * 5 >= 30:
break
if keyword_count > 0:
risk_factors["suspicious_keywords"] = {
"description": f"URL contains {keyword_count} suspicious keywords",
"impact": "medium",
"contribution": min(keyword_count * 5, 30)
}
# Check special characters
special_char_count = sum(c in '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/' for c in url)
risk_score += min(special_char_count, 15)
if special_char_count > 5:
risk_factors["special_chars"] = {
"description": f"URL contains {special_char_count} special characters",
"impact": "low" if special_char_count < 10 else "medium",
"contribution": min(special_char_count, 15)
}
# Check for URL shortening services
shortening_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'is.gd']
if any(service in domain for service in shortening_services):
risk_score += 15
risk_factors["url_shortener"] = {
"description": "Uses URL shortening service",
"impact": "medium",
"contribution": 15
}
# 4. Check if trusted domain
if is_trusted_domain(url):
risk_score = max(0, risk_score - 40) # Significant reduction for trusted domains
risk_factors["trusted_domain"] = {
"description": "Domain is in trusted list",
"impact": "positive",
"contribution": -40
}
# 5. Add results from suspicious patterns check (30%)
suspicious_patterns = check_suspicious_patterns(url)
pattern_risk = sum(p.get("risk_score", 0) for p in suspicious_patterns)
risk_score += pattern_risk
if pattern_risk > 0:
risk_factors["suspicious_patterns"] = {
"description": f"Found {len(suspicious_patterns)} suspicious patterns",
"impact": "high" if pattern_risk > 20 else "medium",
"contribution": pattern_risk
}
# 6. Try to resolve domain (part of domain information - 10%)
domain_info = get_domain_info(url)
domain_penalty = 0
if domain_info.get("ip_address") == "Could not resolve":
# Domain cannot be resolved, apply significant penalty
domain_penalty = 10 # 10% of total score as penalty
risk_score += domain_penalty
risk_factors["unresolvable_domain"] = {
"description": "Domain could not be resolved to an IP address",
"impact": "high",
"contribution": domain_penalty
}
else:
# Check country risk if domain could be resolved
high_risk_countries = ["RU", "CN", "IR", "KP", "NG"]
country = domain_info.get("country", "Unknown")
if country in high_risk_countries:
country_penalty = 5
risk_score += country_penalty
risk_factors["high_risk_country"] = {
"description": f"Domain hosted in high-risk country ({country})",
"impact": "medium",
"contribution": country_penalty
}
# 7. Consider HTML content risk if available (20%)
try:
html_security = check_html_security(url)
html_risk = html_security.get("content_score", 0) / 5 # Scale down from 0-100 to 0-20
risk_score += html_risk
if html_risk > 0:
risk_factors["html_content"] = {
"description": f"HTML content has suspicious elements",
"impact": "high" if html_risk > 10 else "medium",
"contribution": html_risk
}
except Exception as e:
logger.error(f"Error checking HTML security: {e}")
# Ensure final score is within 0-100 range
final_score = max(0, min(100, risk_score))
# Create the result dictionary
result = {
"status": "success",
"url": url,
"score": final_score,
"risk_level": get_risk_level(final_score),
"risk_factors": risk_factors,
"using_fallback": True,
"domain_info": domain_info,
"suspicious_patterns": suspicious_patterns
}
return result
except Exception as e:
logger.error(f"Error in rule_based_prediction: {e}")
# Default moderate risk on error
return {
"status": "error",
"url": url,
"score": 50, # Default moderate risk
"risk_level": get_risk_level(50),
"using_fallback": True,
"error": str(e)
}
def is_trusted_domain(url):
"""
Check if a URL belongs to a trusted domain
Args:
url (str): URL to check
Returns:
bool: True if the domain is trusted, False otherwise
"""
try:
# Parse the URL to extract the domain
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
# Remove www. prefix if present
if domain.startswith('www.'):
domain = domain[4:]
# List of trusted domains
trusted_domains = [
'google.com', 'gmail.com', 'youtube.com',
'facebook.com', 'instagram.com', 'twitter.com', 'x.com',
'microsoft.com', 'office.com', 'outlook.com', 'linkedin.com',
'apple.com', 'icloud.com', 'amazon.com', 'paypal.com',
'github.com', 'dropbox.com', 'netflix.com', 'spotify.com',
'wikipedia.org', 'adobe.com', 'cloudflare.com',
'wordpress.com', 'yahoo.com', 'twitch.tv',
'reddit.com', 'pinterest.com', 'ebay.com',
'zoom.us', 'slack.com', 'shopify.com'
]
# Check if domain ends with any trusted domain
return any(domain == td or domain.endswith('.' + td) for td in trusted_domains)
except Exception as e:
logger.error(f"Error in is_trusted_domain: {e}")
return False
# Create a custom InputLayer that can handle batch_shape
class CompatibleInputLayer(tf.keras.layers.InputLayer):
def __init__(self, **kwargs):
# Handle the batch_shape case
if 'batch_shape' in kwargs:
input_shape = kwargs.pop('batch_shape')
if input_shape is not None and len(input_shape) > 1:
kwargs['input_shape'] = input_shape[1:]
super().__init__(**kwargs)
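# Background: models saved by newer Keras versions serialize InputLayer with a
# `batch_shape` argument that older tf.keras builds do not accept; this subclass
# translates it to `input_shape`. A minimal sketch of how it could be passed when
# loading such a model (the filename here is illustrative):
#
#   model = tf.keras.models.load_model(
#       "model.h5",
#       custom_objects={"InputLayer": CompatibleInputLayer},
#       compile=False,
#   )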
def tld_risk_score(tld: str) -> float:
"""
Calculate risk score for top-level domains.
Some TLDs are more associated with fraudulent activity than others.
Args:
tld: Top-level domain (e.g., 'com', 'org')
Returns:
float: Risk score between 0 and 1
"""
risky_tlds = {
'xyz': 0.7, 'top': 0.65, 'loan': 0.85, 'bid': 0.8,
'online': 0.75, 'site': 0.7, 'club': 0.65, 'stream': 0.8,
'icu': 0.75, 'live': 0.6, 'vip': 0.7, 'fit': 0.6,
'tk': 0.8, 'ml': 0.75, 'ga': 0.75, 'cf': 0.7
}
return risky_tlds.get(tld.lower(), 0.2)
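# Illustrative lookups from the table above: tld_risk_score("loan") -> 0.85,
# tld_risk_score("COM") -> 0.2 (unlisted TLDs fall back to the 0.2 default; the
# lookup is case-insensitive via .lower()).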
def extract_features(url: str):
"""
Extract features from a URL for machine learning prediction
Args:
url: URL to analyze
Returns:
tuple: (feature_dict, feature_array)
"""
logger.info(f"Extracting features for URL: {url}")
try:
# Parse the URL
parsed_url = urlparse(url)
# Basic URL components
domain = parsed_url.netloc.lower()
path = parsed_url.path.lower()
query = parsed_url.query.lower()
fragment = parsed_url.fragment.lower()
# Basic Feature extraction (original features)
# Length-based features
url_length = len(url)
domain_length = len(domain)
path_length = len(path)
query_length = len(query)
fragment_length = len(fragment)
# Domain-based features
subdomain_count = domain.count('.') - 1 if '.' in domain else 0
subdomain_count = max(0, subdomain_count) # Ensure non-negative
# Path-based features
path_depth = path.count('/') if path else 0
# Get TLD risk score
tld = domain.split('.')[-1] if '.' in domain else ''
tld_score = tld_risk_score(tld)
# Calculate entropy as a measure of randomness
domain_entropy = calculate_entropy(domain)
# Security features
https_present = 1 if parsed_url.scheme == 'https' else 0
# Character-based features
special_char_count = sum(c in '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/' for c in url)
digit_count = sum(c.isdigit() for c in url)
letter_count = sum(c.isalpha() for c in url)
digit_percentage = (digit_count / len(url)) * 100 if len(url) > 0 else 0
letter_percentage = (letter_count / len(url)) * 100 if len(url) > 0 else 0
# Check if path is all numeric
numeric_path = 1 if path and all(c.isdigit() or c == '/' for c in path) else 0
# Suspicious patterns
ip_url = 1 if re.match(r'\d+\.\d+\.\d+\.\d+', domain) else 0
# Looking for suspicious keywords
suspicious_keywords = ['login', 'signin', 'account', 'secure', 'update', 'verify',
'confirm', 'banking', 'payment', 'wallet', 'ebay', 'paypal']
keyword_count = sum(1 for keyword in suspicious_keywords if keyword in url.lower())
# Create a dictionary of basic feature names and values
basic_features = {
"url_length": url_length,
"domain_length": domain_length,
"path_length": path_length,
"query_length": query_length,
"fragment_length": fragment_length,
"subdomain_count": subdomain_count,
"path_depth": path_depth,
"tld_score": tld_score,
"domain_entropy": domain_entropy,
"https_present": https_present,
"special_char_count": special_char_count,
"digit_percentage": digit_percentage,
"letter_percentage": letter_percentage,
"numeric_path": numeric_path,
"ip_url": ip_url,
"keyword_count": keyword_count
}
# Get domain information for additional features
domain_info = get_domain_info(url)
ip_address = domain_info.get("ip_address", "Unknown")
# NEW: Extract enhanced features
whois_features = extract_whois_features(domain)
nlp_features = extract_nlp_features(domain)
reputation_features = extract_reputation_features(domain, ip_address)
# Try to get content features - might fail if site is down
content_features = {}
html_security = {}
try:
# Reuse HTML content if we can get it once
response = requests.get(url, timeout=10,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
html_content = response.text
# Extract content features from the HTML
content_features = extract_content_features(url, html_content)
# Get HTML security data
if BeautifulSoup_available:
html_security_data = check_html_security(url, html_content)
html_security = {
"security_score": html_security_data.get("content_score", 0),
"risk_factor_count": len(html_security_data.get("risk_factors", [])),
"has_password_field": 1 if any("password" in rf.lower() for rf in html_security_data.get("risk_factors", [])) else 0,
"has_obfuscated_js": 1 if any("obfuscated" in rf.lower() for rf in html_security_data.get("risk_factors", [])) else 0
}
except Exception as content_error:
logger.warning(f"Could not extract content features: {content_error}")
content_features = extract_content_features(url) # Falls back to zeroed defaults if the page cannot be fetched
html_security = {"security_score": 0, "risk_factor_count": 0, "has_password_field": 0, "has_obfuscated_js": 0}
# Try to get certificate transparency log features
ct_features = extract_ct_log_features(domain)
# Combine all features into a single dictionary
all_features = {**basic_features}
# Add new feature groups with prefixes to avoid name collisions
for key, value in whois_features.items():
all_features[f"whois_{key}"] = value
for key, value in nlp_features.items():
all_features[f"nlp_{key}"] = value
for key, value in reputation_features.items():
all_features[f"rep_{key}"] = value
for key, value in content_features.items():
all_features[f"content_{key}"] = value
for key, value in html_security.items():
all_features[f"html_{key}"] = value
for key, value in ct_features.items():
all_features[f"ct_{key}"] = value
# Add additional domain info features
all_features["geo_suspicious_country"] = 1 if domain_info.get("country") in ["RU", "CN", "IR", "KP"] else 0
# Convert feature dictionary to array for the model
# Extract values in a stable order for the model
basic_feature_values = list(basic_features.values())
# Create a list of values for the additional features
additional_values = []
for key in sorted(all_features.keys()):
if key not in basic_features:
additional_values.append(all_features[key])
# Full feature array - basic features plus new features
full_features = basic_feature_values + additional_values
# Convert to numpy array
base_features = np.array(full_features, dtype=np.float32)
# Pad to expected size for model compatibility (should be 96 for your model)
# Adjust padding as needed based on your model's expectations
padding_size = max(0, 96 - len(full_features))
if padding_size > 0:
padding = np.zeros(padding_size, dtype=np.float32)
feature_array = np.concatenate([base_features, padding])
else:
# If we have more features than expected, truncate to 96
feature_array = base_features[:96]
# Log feature count
logger.info(f"Extracted {len(full_features)} features, adjusted to {len(feature_array)} for model compatibility")
return all_features, feature_array
except Exception as e:
logger.error(f"Error extracting features: {e}")
logger.error(traceback.format_exc())
# Return default values in case of error
feature_dict = {"error": str(e)}
feature_array = np.zeros(96, dtype=np.float32)
return feature_dict, feature_array
def check_html_security(url, html_content=None):
"""
Check HTML content for suspicious or malicious patterns
Args:
url: URL to analyze
html_content: Optional pre-fetched HTML content
Returns:
dict: Dictionary with security information
"""
if not BeautifulSoup_available:
return {
"error": "BeautifulSoup not available",
"content_score": 0,
"risk_factors": ["Unable to analyze HTML content - BeautifulSoup not installed"]
}
try:
# Get the HTML content if not provided
if html_content is None:
response = requests.get(url, timeout=10,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
html_content = response.text
# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Initialize security data
security_data = {
"content_score": 0, # 0-100 scale, higher means more risky
"risk_factors": [], # List of risk factors found
"security_checks": [] # List of security checks passed
}
# Check 1: Forms without HTTPS action
forms = soup.find_all("form")
insecure_forms = [f for f in forms if f.get('action') and f.get('action').startswith('http://')]
if insecure_forms:
security_data["content_score"] += 30
security_data["risk_factors"].append(f"Found {len(insecure_forms)} form(s) submitting to insecure HTTP")
# Check 2: Password inputs
password_inputs = soup.find_all("input", {"type": "password"})
if password_inputs:
security_data["content_score"] += 15
security_data["risk_factors"].append(f"Found {len(password_inputs)} password input(s)")
# Check if password input is in an insecure form
for p_input in password_inputs:
parent_form = p_input.find_parent("form")
if parent_form and parent_form.get('action') and parent_form.get('action').startswith('http://'):
security_data["content_score"] += 25
security_data["risk_factors"].append("Password being submitted over insecure HTTP")
break
# Check 3: Hidden inputs with suspicious names
suspicious_hidden = soup.find_all("input", {"type": "hidden", "name": re.compile(r'user|email|account|pass|auth|token|id|login', re.I)})
if suspicious_hidden:
security_data["content_score"] += 10
security_data["risk_factors"].append(f"Found {len(suspicious_hidden)} hidden fields with suspicious names")
# Check 4: Scripts with suspicious URLs or obfuscated code
scripts = soup.find_all("script")
obfuscated_scripts = 0
suspicious_urls = 0
for script in scripts:
if script.string:
# Check for obfuscated code patterns
if re.search(r'eval\(', script.string) or re.search(r'\\x[0-9a-f]{2}', script.string):
obfuscated_scripts += 1
# Check for suspicious URLs in scripts
if re.search(r'(https?://[^\'"]+\.(xyz|tk|ml|ga|cf|gq|top))', script.string):
suspicious_urls += 1
if obfuscated_scripts > 0:
security_data["content_score"] += 20
security_data["risk_factors"].append(f"Found {obfuscated_scripts} script(s) with potentially obfuscated code")
if suspicious_urls > 0:
security_data["content_score"] += 15
security_data["risk_factors"].append(f"Found {suspicious_urls} script(s) with suspicious URLs")
# Check 5: Excessive use of iframes
iframes = soup.find_all("iframe")
if len(iframes) > 3:
security_data["content_score"] += 10
security_data["risk_factors"].append(f"Excessive use of iframes ({len(iframes)} found)")
# Add passed security checks
if not insecure_forms:
security_data["security_checks"].append("No insecure forms found")
if not password_inputs:
security_data["security_checks"].append("No password inputs found")
if security_data["content_score"] < 20:
security_data["security_checks"].append("Low-risk HTML content")
# Add HTTPS security check if URL uses HTTPS
if url.startswith("https://"):
security_data["security_checks"].append("HTTPS protocol used")
return security_data
except Exception as e:
logger.error(f"Error checking HTML security: {e}")
return {
"error": str(e),
"content_score": 0,
"risk_factors": [f"Error analyzing HTML content: {str(e)}"]
}
def predict_with_model(url, features=None):
"""
Make a prediction using the loaded model.
If model is not available, falls back to rule-based prediction.
Args:
url: URL to predict
features: Optional pre-computed features
Returns:
dict: Prediction result with risk score and details
"""
try:
logger.info(f"Making prediction for URL: {url}")
# Extract features if not provided
if features is None:
logger.info("No features provided, extracting new features")
features, feature_vector = extract_features(url)
else:
logger.info(f"Features provided, type: {type(features)}")
# The feature parameter might be just the dictionary without the feature_vector
# Always re-extract to get the proper numpy array
_, feature_vector = extract_features(url)
# Initialize response
result = {
"status": "success",
"url": url,
"score": 0,
"risk_level": "Unknown",
"feature_contributions": [],
"risk_factors": {},
"domain_info": {},
"suspicious_patterns": []
}
# Check if model is available
if get_model_instance() is not None and get_scaler_instance() is not None:
try:
# Ensure feature_vector is a numpy array before reshaping
if not isinstance(feature_vector, np.ndarray):
logger.error(f"feature_vector is not a numpy array: {type(feature_vector)}")
# Fall back to rule-based prediction
return rule_based_prediction(url, features)
# Prepare feature vector for prediction
features_reshaped = feature_vector.reshape(1, -1)
logger.info(f"Feature shape: {features_reshaped.shape}")
# Scale features if scaler is available
scaled_features = get_scaler_instance().transform(features_reshaped)
# Make prediction
prediction = get_model_instance().predict(scaled_features)
raw_score = float(prediction[0][0]) if hasattr(prediction, 'shape') else float(prediction)
score = raw_score * 100 # Convert to percentage
logger.info(f"Model prediction raw score: {raw_score}, scaled: {score}")
# Set result fields
result["score"] = score
result["raw_score"] = raw_score
result["risk_level"] = get_risk_level(score)
# Handle unresolvable domains - apply domain information penalty (10% of total score)
domain_info = get_domain_info(url)
if domain_info.get("ip_address") == "Could not resolve":
# Apply domain information penalty (add up to 10 points to the risk score)
domain_penalty = 10.0 # Maximum penalty for unresolvable domains (10% of total score)
original_score = score
score = min(100, score + domain_penalty) # Cap at 100
result["score"] = score
logger.info(f"Domain could not be resolved, applying penalty: {original_score} -> {score}")
# Add a risk factor for unresolvable domain
if "risk_factors" not in result:
result["risk_factors"] = {}
result["risk_factors"]["unresolvable_domain"] = {
"description": "Domain could not be resolved to an IP address",
"impact": "high",
"contribution": domain_penalty
}
# Feature name mapping to user-friendly names
feature_name_map = {
"url_length": "URL Length",
"domain_length": "Domain Length",
"path_length": "Path Length",
"query_length": "Query Parameters Length",
"fragment_length": "Fragment Length",
"subdomain_count": "Number of Subdomains",
"path_depth": "Path Depth",
"tld_score": "Risky TLD Score",
"domain_entropy": "Domain Entropy",
"https_present": "Security Weights",
"special_char_count": "Special Characters",
"digit_percentage": "Digit Percentage",
"letter_percentage": "Letter Percentage",
"numeric_path": "Numeric Path Present",
"ip_url": "IP as Domain",
"keyword_count": "Suspicious Keywords",
# Content feature friendly names
"content_page_size_bytes": "Page Size",
"content_external_resources_count": "External Resource Count",
"content_form_count": "Form Count",
"content_password_field_count": "Password Fields",
"content_js_to_html_ratio": "JavaScript to HTML Ratio",
"content_title_brand_mismatch": "Title-Domain Mismatch",
"content_favicon_exists": "Favicon Present",
"content_similar_domain_redirect": "Similar Domain Redirect",
# HTML security feature friendly names
"html_security_score": "HTML Security Score",
"html_risk_factor_count": "Security Risk Factor Count",
"html_has_password_field": "Contains Password Field",
"html_has_obfuscated_js": "Contains Obfuscated JavaScript",
# SSL certificate feature friendly names
"ct_suspicious_cert_pattern": "Suspicious Certificate Pattern",
# Geographic feature friendly names
"geo_suspicious_country": "Suspicious Country"
}
# Add feature contributions
result["feature_contributions"] = []
if isinstance(features, dict):
for name, value in features.items():
if name != "error":
# Estimate contribution based on feature value and type
contribution = 0.0
section = "Key Risk Factors" # Default section
# ====== Core URL & Domain Features (High Impact) ======
if name == "url_length" and value > 50:
contribution = 0.1 * (value / 100)
section = "Key Risk Factors"
elif name == "domain_length" and value > 15:
contribution = 0.15 * (value / 30)
section = "Key Risk Factors"
elif name == "domain_entropy" and value > 0:
contribution = 0.1 * min(value / 3.0, 1.0)
section = "Key Risk Factors"
elif name == "special_char_count" and value > 3:
contribution = 0.1 * (value / 10)
section = "Key Risk Factors"
elif name == "tld_score" and value > 0:
contribution = 0.15 * value / 0.5 # Scale based on value
section = "Key Risk Factors"
elif name == "https_present" and value < 1:
contribution = 24.6 # Fixed percentage for consistency
section = "Key Risk Factors"
# ====== Domain Reputation & WHOIS Features (Important) ======
elif name == "rep_domain_age_category" and value < 2:
contribution = 0.15 * (2 - value) / 2 # Newer domains are riskier
section = "Domain Information"
elif name == "rep_suspicious_tld_category" and value > 0:
contribution = 0.15 * value # TLD category risk
section = "Domain Information"
elif name == "rep_suspicious_country" and value > 0:
contribution = 0.15 # Suspicious country
section = "Domain Information"
elif name == "whois_recently_registered" and value > 0:
contribution = 0.2 # Recently registered domains are highly suspicious
section = "Domain Information"
# ====== Critical HTML Content Features (Highest Impact) ======
elif name == "content_form_count" and value > 0:
contribution = 0.15 * min(value / 2, 1.0) # Forms are key phishing indicators
section = "Suspicious Patterns"
elif name == "content_password_field_count" and value > 0:
contribution = 0.3 * min(value / 2.0, 1.0) # Password fields are critical for phishing
section = "Suspicious Patterns"
elif name == "content_external_resources_count" and value > 3:
contribution = 0.12 * min(value / 15, 1.0) # External resources
section = "Suspicious Patterns"
elif name == "content_js_to_html_ratio" and value > 0.3:
contribution = 0.15 * min(value / 0.5, 1.0) # High JS ratio can indicate obfuscation
section = "Suspicious Patterns"
elif name == "content_title_brand_mismatch" and value > 0:
contribution = 0.2 # Title not matching domain is suspicious
section = "Suspicious Patterns"
elif name == "content_similar_domain_redirect" and value > 0:
contribution = 0.35 # Redirects to similar domains are highly suspicious
section = "Suspicious Patterns"
elif name == "content_favicon_exists" and value < 1:
contribution = 0.08 # Missing favicon often indicates phishing
section = "Key Risk Factors"
# ====== HTML Security Metrics (High Impact) ======
elif name == "html_security_score" and value > 0:
contribution = 0.2 * min(value / 50, 1.0) # Overall security score
section = "Suspicious Patterns"
elif name == "html_risk_factor_count" and value > 0:
contribution = 0.15 * min(value / 3, 1.0) # Number of risks found
section = "Suspicious Patterns"
elif name == "html_has_password_field" and value > 0:
contribution = 0.25 # Password fields in HTML are suspicious
section = "Suspicious Patterns"
elif name == "html_has_obfuscated_js" and value > 0:
contribution = 0.3 # Obfuscated JavaScript is highly suspicious
section = "Suspicious Patterns"
# ====== SSL Certificate Features (Medium Impact) ======
elif name == "ct_suspicious_cert_pattern" and value > 0:
contribution = 0.15 # Suspicious certificate patterns
section = "Domain Information"
# ====== Geographic Features (Medium Impact) ======
elif name == "geo_suspicious_country" and value > 0:
contribution = 0.15 # Suspicious country
section = "Domain Information"
# Use friendly name if available
display_name = feature_name_map.get(name, name.replace("_", " ").title())
# Determine color based on contribution
color_class = "success" # Default green
if contribution > 60:
color_class = "danger" # Red for high risk
elif contribution > 20:
color_class = "warning" # Orange for medium risk
result["feature_contributions"].append({
"name": name,
"value": value,
"contribution": contribution,
"direction": "increases" if contribution > 0 else "decreases",
"percentage": contribution, # No need to convert for HTTPS present
"feature_name": display_name,
"color_class": color_class,
"section": section # Add section to each feature
})
# Normalize contributions to match total risk score, but preserve HTTPS percentage
if result["feature_contributions"]:
# Sort by contribution (descending)
result["feature_contributions"].sort(key=lambda x: -x["percentage"])
# Get total of all contributions
total_contribution = sum(item["percentage"] for item in result["feature_contributions"]
if item["name"] != "https_present")
https_contribution = next((item["percentage"] for item in result["feature_contributions"]
if item["name"] == "https_present"), 0)
# Calculate what's left for other features
remaining_score = max(0, score - https_contribution)
# If total is > 0, normalize the remaining features
if total_contribution > 0 and remaining_score > 0:
normalization_factor = remaining_score / total_contribution
for item in result["feature_contributions"]:
if item["name"] != "https_present":
item["percentage"] = round(item["percentage"] * normalization_factor, 1)
# Calculate section totals based on fixed weights
# URL features (40%), Domain info (10%), Suspicious patterns (50%)
section_weights = {
"Key Risk Factors": 40.0, # URL features (40%)
"Domain Information": 10.0, # Domain information (10%)
"Suspicious Patterns": 50.0 # Suspicious patterns (50%)
}
# Use fixed weights but distribute actual feature contributions within them
total_feature_impact = sum(item["percentage"] for item in result["feature_contributions"])
if total_feature_impact > 0:
# Normalize all feature impacts to a 0-100 scale
normalization_factor = score / total_feature_impact
for item in result["feature_contributions"]:
item["percentage"] = round(item["percentage"] * normalization_factor, 1)
# Calculate actual section distribution based on feature categorization
actual_section_totals = {
"Key Risk Factors": 0,
"Domain Information": 0,
"Suspicious Patterns": 0
}
for item in result["feature_contributions"]:
section = item["section"]
if section in actual_section_totals:
actual_section_totals[section] += item["percentage"]
# Ensure the overall risk score is preserved
result["section_totals"] = {
# Use fixed weights but make sure they sum to the overall score
"Key Risk Factors": round((section_weights["Key Risk Factors"] / 100) * score, 1),
"Domain Information": round((section_weights["Domain Information"] / 100) * score, 1),
"Suspicious Patterns": round((section_weights["Suspicious Patterns"] / 100) * score, 1)
}
# Get suspicious patterns
suspicious_patterns = check_suspicious_patterns(url)
result["suspicious_patterns"] = suspicious_patterns
# Get domain information with more detail
domain_info = get_domain_info(url)
# Try to enhance domain info with more details if possible
try:
# Parse URL to get domain
parsed_url = urlparse(url)
domain = parsed_url.netloc
# Try to get more domain info using socket
if not domain_info.get("organization"):
try:
ip = socket.gethostbyname(domain)
domain_info["ip_address"] = ip
# Try to determine organization and location from IP
# Note: In a real implementation, you'd use a GeoIP service here
domain_info["organization"] = "Unknown Organization"
domain_info["country"] = "Unknown Country"
domain_info["city"] = "Unknown City"
except Exception as e:
logger.warning(f"Could not enhance domain info: {e}")
except Exception as e:
logger.warning(f"Error enhancing domain info: {e}")
result["domain_info"] = domain_info
# Add HTML security data if available
html_security = None
try:
html_security = check_html_security(url)
result["html_security"] = html_security
except Exception as e:
logger.error(f"Error checking HTML security: {e}")
# Explicitly add feature_table for UI
result['feature_table'] = []
# Process features and organize by category
for key, value in features.items():
if key != "error":
# Find the corresponding contribution
impact = 0.0
color_class = "success"
for contrib in result["feature_contributions"]:
if contrib["name"] == key:
impact = contrib["percentage"]
color_class = contrib["color_class"]
break
# Use friendly name if available
display_name = feature_name_map.get(key, key.replace("_", " ").title())
# Always include HTTPS with fixed impact
if key == "https_present" and value < 1:
result['feature_table'].append({
'feature': "Security Weights",
'value': "No" if value < 1 else "Yes",
'impact': 24.6, # Fixed percentage
'color_class': "danger"
})
# Only include features with significant impact or specifically important ones
elif impact > 3 or key in ["tld_score", "content_password_field_count",
"content_form_count", "html_security_score",
"domain_entropy", "content_favicon_exists",
"rep_domain_age_category"]:
# Format value based on type
formatted_value = value # Default value
if isinstance(value, bool) or (isinstance(value, (int, float)) and value in [0, 1]):
formatted_value = "No" if value == 0 or value is False else "Yes"
elif isinstance(value, float) and value < 1:
formatted_value = round(value, 2)
# Append to feature table
result['feature_table'].append({
'feature': display_name,
'value': formatted_value,
'impact': impact,
'color_class': color_class
})
# Sort feature_table by impact (descending)
result['feature_table'] = sorted(
result['feature_table'],
key=lambda x: -x['impact']
)
return result
except Exception as e:
logger.error(f"Error making prediction with model: {e}")
logger.error(traceback.format_exc())
# Fall back to rule-based prediction
logger.info("Using rule-based prediction as fallback")
return rule_based_prediction(url, features)
except Exception as e:
logger.error(f"Unexpected error in predict_with_model: {e}")
logger.error(traceback.format_exc())
return {
"status": "error",
"url": url,
"message": f"Error making prediction: {str(e)}",
"using_fallback": True,
"score": 50, # Default moderate risk
"risk_level": "moderate",
"domain_info": get_domain_info(url),
"suspicious_patterns": check_suspicious_patterns(url)
}
def get_risk_level(score):
"""
Convert numerical risk score to categorical risk level
Args:
score: Numerical risk score (0-100)
Returns:
str: Risk level category
"""
if score < 20:
return "low"
elif score < 50:
return "moderate"
elif score < 75:
return "high"
else:
return "critical"
def get_domain_info(url):
"""
Get information about a domain
Args:
url: URL to get domain info for
Returns:
dict: Domain information including IP, organization, location
"""
try:
# Parse the URL to extract domain
parsed_url = urlparse(url)
domain = parsed_url.netloc
# Extract domain without port if present
if ':' in domain:
domain = domain.split(':')[0]
# Initialize domain info
domain_info = {
"domain": domain,
"ip_address": "Unknown",
"organization": "Unknown",
"country": "Unknown",
"city": "Unknown",
"created": "Unknown",
"expires": "Unknown",
"latitude": 0,
"longitude": 0
}
# Try to get IP address
try:
ip_address = socket.gethostbyname(domain)
domain_info["ip_address"] = ip_address
# Use ip-api.com for geolocation data
try:
geo_response = requests.get(f"http://ip-api.com/json/{ip_address}", timeout=5)
if geo_response.status_code == 200:
geo_data = geo_response.json()
if geo_data.get("status") == "success":
domain_info["country"] = geo_data.get("country", "Unknown")
domain_info["city"] = geo_data.get("city", "Unknown")
domain_info["latitude"] = geo_data.get("lat", 0)
domain_info["longitude"] = geo_data.get("lon", 0)
domain_info["organization"] = geo_data.get("org", "Unknown") or geo_data.get("isp", "Unknown")
domain_info["region"] = geo_data.get("regionName", "Unknown")
domain_info["timezone"] = geo_data.get("timezone", "Unknown")
domain_info["as"] = geo_data.get("as", "Unknown")
logger.info(f"Retrieved geolocation data for {ip_address}: {geo_data}")
else:
logger.warning(f"Failed to get geolocation data: {geo_data}")
# Fall back to default coordinates if geolocation fails
domain_info["latitude"] = 40.7128 # Default latitude (New York)
domain_info["longitude"] = -74.0060 # Default longitude (New York)
else:
logger.warning(f"Failed to get geolocation data, status code: {geo_response.status_code}")
# Fall back to default coordinates if geolocation fails
domain_info["latitude"] = 40.7128
domain_info["longitude"] = -74.0060
except Exception as geo_error:
logger.error(f"Error getting geolocation data: {geo_error}")
# Fall back to default coordinates if geolocation fails
domain_info["latitude"] = 40.7128
domain_info["longitude"] = -74.0060
except socket.gaierror:
domain_info["ip_address"] = "Could not resolve"
return domain_info
except Exception as e:
logger.error(f"Error getting domain info: {e}")
return {
"domain": urlparse(url).netloc,
"error": str(e),
"ip_address": "Error",
"organization": "Unknown",
"country": "Unknown",
"latitude": 0,
"longitude": 0
}
def check_ssl_certificate(domain):
"""
Check SSL certificate information for a domain
Args:
domain: Domain to check SSL for
Returns:
dict: SSL certificate information
"""
ssl_info = {
"has_ssl": False,
"issuer": "Unknown",
"valid_from": "Unknown",
"valid_until": "Unknown",
"days_until_expiry": 0
}
try:
# Try to connect with TLS/SSL
context = ssl.create_default_context()
with socket.create_connection((domain, 443), timeout=5) as sock:
with context.wrap_socket(sock, server_hostname=domain) as ssock:
# Get certificate
cert = ssock.getpeercert()
ssl_info["has_ssl"] = True
# Extract certificate details
if cert:
# Get issuer
issuer = dict(x[0] for x in cert['issuer'])
ssl_info["issuer"] = issuer.get('organizationName', 'Unknown')
# Get validity dates
ssl_info["valid_from"] = cert.get('notBefore', 'Unknown')
ssl_info["valid_until"] = cert.get('notAfter', 'Unknown')
# Calculate days until expiry
if ssl_info["valid_until"] != 'Unknown':
expiry_date = datetime.strptime(ssl_info["valid_until"], '%b %d %H:%M:%S %Y %Z')
days_until_expiry = (expiry_date - datetime.now()).days
ssl_info["days_until_expiry"] = max(0, days_until_expiry)
except Exception as e:
ssl_info["error"] = str(e)
return ssl_info
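# Shape of a successful result (values are illustrative, not real certificate data):
#   {"has_ssl": True, "issuer": "Let's Encrypt", "valid_from": "Jan  1 00:00:00 2025 GMT",
#    "valid_until": "Apr  1 00:00:00 2025 GMT", "days_until_expiry": 60}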
def extract_whois_features(domain):
"""Extract features from WHOIS data for a domain"""
whois_features = {
"domain_age_days": 0,
"expiration_remaining_days": 0,
"recently_registered": 0,
"privacy_protected": 0,
"suspicious_registrar": 0
}
if not whois_available:
return whois_features
try:
w = whois.whois(domain)
# Calculate domain age
if w.creation_date:
creation_date = w.creation_date
if isinstance(creation_date, list):
creation_date = creation_date[0]
domain_age = (datetime.now() - creation_date).days
whois_features["domain_age_days"] = domain_age
whois_features["recently_registered"] = 1 if domain_age < 60 else 0
# Calculate expiration time
if w.expiration_date:
expiry_date = w.expiration_date
if isinstance(expiry_date, list):
expiry_date = expiry_date[0]
days_until_expiry = (expiry_date - datetime.now()).days
whois_features["expiration_remaining_days"] = max(0, days_until_expiry)
# Check for privacy protection
if w.registrar and "privacy" in str(w.registrar).lower():
whois_features["privacy_protected"] = 1
# Check for suspicious registrars
suspicious_registrars = ["namecheap", "namesilo", "porkbun"]
if w.registrar and any(r in str(w.registrar).lower() for r in suspicious_registrars):
whois_features["suspicious_registrar"] = 1
return whois_features
except Exception as e:
logger.error(f"Error getting WHOIS data: {e}")
return whois_features
def extract_ct_log_features(domain):
"""Extract features from Certificate Transparency logs"""
ct_features = {
"cert_count": 0,
"recent_cert_count": 0,
"suspicious_cert_pattern": 0
}
try:
# Use crt.sh API to check certificate history
response = requests.get(f"https://crt.sh/?q={domain}&output=json", timeout=5)
if response.status_code == 200:
try:
certs = response.json()
# Total certificates
ct_features["cert_count"] = len(certs)
# Recent certificates (last 30 days)
thirty_days_ago = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
recent_certs = [c for c in certs if c.get("not_before", "") > thirty_days_ago]
ct_features["recent_cert_count"] = len(recent_certs)
# Check for suspicious patterns in certificate names
for cert in certs:
common_name = cert.get("common_name", "").lower()
if any(p in common_name for p in ["secure", "login", "banking", "verify"]):
ct_features["suspicious_cert_pattern"] = 1
break
except json.JSONDecodeError:
logger.warning("Failed to parse certificate data as JSON")
return ct_features
except Exception as e:
logger.error(f"Error getting certificate data: {e}")
return ct_features
def extract_content_features(url, html_content=None):
"""Extract features from webpage content"""
content_features = {
"page_size_bytes": 0,
"external_resources_count": 0,
"form_count": 0,
"password_field_count": 0,
"js_to_html_ratio": 0,
"title_brand_mismatch": 0,
"favicon_exists": 0,
"similar_domain_redirect": 0
}
if not BeautifulSoup_available:
return content_features
try:
# Get the HTML content if not provided
if html_content is None:
try:
response = requests.get(url, timeout=10,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
html_content = response.text
content_features["page_size_bytes"] = len(html_content)
except Exception as req_error:
logger.error(f"Error fetching HTML content: {req_error}")
return content_features
# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Count forms and password fields
content_features["form_count"] = len(soup.find_all("form"))
content_features["password_field_count"] = len(soup.find_all("input", {"type": "password"}))
# External resources
external_resources = 0
parsed_url = urlparse(url)
base_domain = parsed_url.netloc
for tag in soup.find_all(["script", "img", "iframe", "link"], src=True):
src = tag.get("src", "")
if src and not src.startswith(('/', '#', 'data:')):
if base_domain not in src:
external_resources += 1
for tag in soup.find_all("link", href=True):
href = tag.get("href", "")
if href and not href.startswith(('/', '#', 'data:')):
if base_domain not in href:
external_resources += 1
content_features["external_resources_count"] = external_resources
# JS to HTML ratio
js_content = 0
for script in soup.find_all("script"):
if script.string:
js_content += len(script.string)
if len(html_content) > 0:
content_features["js_to_html_ratio"] = js_content / len(html_content)
# Title brand mismatch
if soup.title and soup.title.string:
title = soup.title.string.lower()
domain_parts = base_domain.lower().split(".")
brand_name = domain_parts[0] if domain_parts[0] != "www" else domain_parts[1]
if title and brand_name not in title:
content_features["title_brand_mismatch"] = 1
# Check for favicon
if soup.find("link", rel="icon") or soup.find("link", rel="shortcut icon"):
content_features["favicon_exists"] = 1
# Check for redirects to similar domains
meta_refresh = soup.find("meta", {"http-equiv": "refresh"})
if meta_refresh and "content" in meta_refresh.attrs:
content = meta_refresh["content"]
if "url=" in content.lower():
redirect_url = content.split("url=")[1].strip()
redirect_domain = urlparse(redirect_url).netloc
# Check if redirect domain is similar but different
similarity = SequenceMatcher(None, base_domain, redirect_domain).ratio()
if 0.6 < similarity < 0.9: # Similar but not identical
content_features["similar_domain_redirect"] = 1
return content_features
except Exception as e:
logger.error(f"Error extracting content features: {e}")
return content_features
def extract_nlp_features(domain):
"""Extract NLP-based features from the domain name"""
nlp_features = {
"character_distribution": 0,
"vowel_consonant_ratio": 0,
"contains_digits": 0,
"contains_repeated_chars": 0,
"ngram_score": 0,
"word_length_avg": 0
}
try:
# Remove TLD for analysis
domain_parts = domain.split('.')
domain_without_tld = '.'.join(domain_parts[:-1]) if len(domain_parts) > 1 else domain_parts[0]
# Character distribution (normalized entropy)
entropy = calculate_entropy(domain_without_tld)
nlp_features["character_distribution"] = entropy / 4.7 # Normalize, 4.7 is max entropy for English text
# Vowel to consonant ratio
vowels = sum(c.lower() in 'aeiou' for c in domain_without_tld)
consonants = sum(c.lower() in 'bcdfghjklmnpqrstvwxyz' for c in domain_without_tld)
nlp_features["vowel_consonant_ratio"] = vowels / consonants if consonants > 0 else 0
# Contains digits
nlp_features["contains_digits"] = 1 if any(c.isdigit() for c in domain_without_tld) else 0
# Contains repeated characters (3 or more)
if re.search(r'(.)\1{2,}', domain_without_tld):
nlp_features["contains_repeated_chars"] = 1
# N-gram probability score (approximated)
common_english_bigrams = ["th", "he", "in", "er", "an", "re", "on", "at", "en", "nd", "ti", "es", "or"]
bigram_count = sum(domain_without_tld.lower().count(bigram) for bigram in common_english_bigrams)
domain_length = len(domain_without_tld)
nlp_features["ngram_score"] = bigram_count / (domain_length - 1) if domain_length > 1 else 0
# Average word length if domain has words
words = re.findall(r'[a-zA-Z]+', domain_without_tld)
if words:
avg_word_length = sum(len(word) for word in words) / len(words)
nlp_features["word_length_avg"] = avg_word_length
return nlp_features
except Exception as e:
logger.error(f"Error extracting NLP features: {e}")
return nlp_features
def extract_reputation_features(domain, ip_address):
"""Extract reputation-based features from various sources"""
reputation_features = {
"domain_age_category": 0, # 0: unknown, 1: new, 2: medium, 3: established
"ip_blacklisted": 0,
"domain_blacklisted": 0,
"suspicious_tld_category": 0,
"suspicious_country": 0
}
try:
# Domain age categorization (if whois is available)
if whois_available:
try:
w = whois.whois(domain)
if w.creation_date:
creation_date = w.creation_date
if isinstance(creation_date, list):
creation_date = creation_date[0]
domain_age_days = (datetime.now() - creation_date).days
if domain_age_days < 30:
reputation_features["domain_age_category"] = 1 # New
elif domain_age_days < 180:
reputation_features["domain_age_category"] = 2 # Medium
else:
reputation_features["domain_age_category"] = 3 # Established
except Exception as whois_error:
logger.warning(f"Whois error for reputation features: {whois_error}")
# Check for blacklisted IP (simplified - would use an actual API)
high_risk_countries = ["RU", "CN", "IR", "KP", "NG"]
suspicious_asn_orgs = ["Cloudflare", "OVH", "DigitalOcean", "Amazon"]
# Get IP geolocation
if ip_address and ip_address != "Unknown" and ip_address != "Could not resolve":
try:
geo_response = requests.get(f"http://ip-api.com/json/{ip_address}", timeout=5)
if geo_response.status_code == 200:
geo_data = geo_response.json()
if geo_data.get("status") == "success":
# Check country risk
if geo_data.get("countryCode") in high_risk_countries:
reputation_features["suspicious_country"] = 1
# Check ASN risk
asn_org = geo_data.get("org", "").lower()
if any(org.lower() in asn_org for org in suspicious_asn_orgs):
reputation_features["ip_blacklisted"] = 0.5 # Partial flag
except Exception as geo_error:
logger.warning(f"Error getting geolocation for reputation: {geo_error}")
# Check TLD risk category
tld = domain.split('.')[-1] if '.' in domain else ''
high_risk_tlds = ['tk', 'ml', 'ga', 'cf', 'gq', 'xyz', 'top', 'icu', 'rest', 'zip']
medium_risk_tlds = ['online', 'site', 'club', 'live', 'vip', 'fit', 'pw']
if tld in high_risk_tlds:
reputation_features["suspicious_tld_category"] = 2
elif tld in medium_risk_tlds:
reputation_features["suspicious_tld_category"] = 1
return reputation_features
except Exception as e:
logger.error(f"Error extracting reputation features: {e}")
return reputation_features
def analyze_url(url):
"""
Comprehensive URL analysis function that combines multiple checks
Args:
url: URL to analyze
Returns:
dict: Comprehensive analysis result
"""
logger.info(f"Analyzing URL: {url}")
# Ensure URL has a scheme
if not url.startswith(('http://', 'https://')):
url = 'http://' + url
logger.info(f"Added scheme to URL: {url}")
try:
# Extract features and make prediction
features, feature_vector = extract_features(url)
prediction_result = predict_with_model(url)
# Get suspicious patterns
suspicious_patterns = check_suspicious_patterns(url)
# Check HTML security
html_security = check_html_security(url)
# Parse URL components for display
parsed_url = urlparse(url)
domain = parsed_url.netloc
scheme = parsed_url.scheme
# Get domain information if available
domain_info = get_domain_info(url)
# Create comprehensive analysis result
result = {
"status": "success",
"url": url,
"domain": domain,
"protocol": scheme,
"analysis_date": datetime.now().isoformat(),
"score": prediction_result.get("score", 0),
"fraud_score": prediction_result.get("score", 0), # Duplicate for UI compatibility
"risk_level": prediction_result.get("risk_level", "unknown"),
"is_suspicious": prediction_result.get("score", 0) > 50,
"suspicious_patterns": suspicious_patterns,
"html_security": html_security,
"risk_factors": prediction_result.get("risk_factors", {}),
"feature_values": features,
"domain_info": domain_info,
"feature_contributions": prediction_result.get("feature_contributions", []),
"feature_table": prediction_result.get("feature_table", []),
"section_totals": prediction_result.get("section_totals", {})
}
# Ensure section totals are set using fixed weights if missing
if not result["section_totals"]:
score = result["score"]
result["section_totals"] = {
"Key Risk Factors": round(0.4 * score, 1), # URL features (40%)
"Domain Information": round(0.1 * score, 1), # Domain information (10%)
"Suspicious Patterns": round(0.5 * score, 1) # Suspicious patterns + HTML content (50%)
}
# Special handling for trusted domains - reduce Suspicious Patterns section score
# when no actual suspicious patterns were found
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
# If no suspicious patterns were found, set that section to 0%
# regardless of whether it's a trusted domain or not
if not suspicious_patterns:
# Set Suspicious Patterns to 0% since none were found
original_suspicious_patterns_score = result["section_totals"]["Suspicious Patterns"]
result["section_totals"]["Suspicious Patterns"] = 0.0
# Recalculate overall score by removing the suspicious patterns contribution
original_score = result["score"]
# When Suspicious Patterns is set to 0, recalculate the total score
# by considering only the remaining sections (Key Risk Factors + Domain Information)
key_risk_score = result["section_totals"]["Key Risk Factors"]
domain_info_score = result["section_totals"]["Domain Information"]
# Set the adjusted score to be just the sum of the remaining sections
adjusted_score = key_risk_score + domain_info_score
# Update the overall score
result["score"] = adjusted_score
result["fraud_score"] = adjusted_score
result["risk_level"] = get_risk_level(adjusted_score)
logger.info(f"Adjusted score due to no suspicious patterns: {original_score} -> {adjusted_score}")
# Add SSL info if available
try:
ssl_info = check_ssl_certificate(domain)
result["ssl_info"] = ssl_info
except Exception as e:
logger.warning(f"Unable to check SSL certificate: {str(e)}")
result["ssl_info"] = {"error": str(e)}
logger.info(f"Analysis complete for {url} - Risk score: {result['score']}")
return result
except Exception as e:
logger.error(f"Error analyzing URL: {str(e)}")
logger.error(traceback.format_exc())
return {
"status": "error",
"url": url,
"message": f"Error analyzing URL: {str(e)}",
"error": str(e),
"traceback": traceback.format_exc(),
"domain_info": get_domain_info(url),
"suspicious_patterns": check_suspicious_patterns(url)
}
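# Example call (illustrative only; field values depend on the model and on network access):
# analyze_url("example.com") returns a dict with keys such as "score", "risk_level",
# "suspicious_patterns", "html_security", "domain_info", "section_totals" and "ssl_info".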
@app.route("/")
def home():
logger.info("Home route accessed")
try:
return render_template("index.html")
except Exception as e:
logger.error(f"Error rendering index.html: {e}")
return f"Error: {str(e)}", 500
@app.route("/about")
def about():
return render_template("about.html")
@app.route("/features")
def features():
return render_template("features.html")
@app.route("/health-check")
def health_check():
"""Health check endpoint for the integrated application"""
return jsonify({
"status": "healthy",
"message": "Integrated Flask app is running",
"model_loaded": get_model_instance() is not None,
"scaler_loaded": get_scaler_instance() is not None
})
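# Example check (assuming the default port 7860 used at the bottom of this file):
# curl http://localhost:7860/health-check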
@app.route("/predict", methods=["POST", "OPTIONS"])
def predict():
# Handle CORS preflight requests
if request.method == 'OPTIONS':
response = jsonify({'status': 'success'})
response.headers.add('Access-Control-Allow-Origin', '*')
response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
response.headers.add('Access-Control-Allow-Methods', 'POST,OPTIONS')
return response
if request.method == 'POST':
try:
# Log request headers for debugging
logger.info(f"Request headers: {dict(request.headers)}")
logger.info(f"Request content type: {request.content_type}")
logger.info(f"Raw request data: {request.data.decode('utf-8', errors='replace') if request.data else 'None'}")
# Extract URL from request
url = None
# Try different methods to extract the URL
if request.is_json:
data = request.get_json(force=True)
logger.info(f"JSON data: {data}")
url = data.get('url', '')
elif request.form:
logger.info(f"Form data: {dict(request.form)}")
url = request.form.get('url', '')
elif request.data:
try:
data = json.loads(request.data.decode('utf-8'))
logger.info(f"Parsed JSON from raw data: {data}")
url = data.get('url', '')
except json.JSONDecodeError as e:
logger.error(f"Failed to parse raw data as JSON: {e}")
logger.info(f"Extracted URL: {url}")
if not url or not url.strip():
logger.error("No URL provided in request")
return jsonify({
"status": "error",
"message": "No URL provided",
"details": "Please enter a valid URL to analyze"
}), 400
# Ensure URL has a scheme
if not url.startswith(('http://', 'https://')):
url = 'http://' + url
logger.info(f"Added http:// prefix to URL: {url}")
# Process the URL directly without backend API call
logger.info("Processing prediction request directly")
# Extract features
features, feature_vector = extract_features(url)
# Get prediction
result = predict_with_model(url, features)
# For debugging the feature display issue
logger.info(f"Feature contributions: {result.get('feature_contributions', [])}")
# Explicitly add this field for the UI
if 'feature_table' not in result:
result['feature_table'] = []
# Populate feature_table from feature_contributions only while it is still empty,
# so any entries already supplied by predict_with_model are not duplicated
if not result['feature_table'] and result.get('feature_contributions'):
for contrib in result['feature_contributions']:
result['feature_table'].append({
'feature': contrib['name'],
'value': contrib['value'],
'impact': contrib['contribution'] * 100 # Convert to percentage
})
# Sort feature_table: non-zero values in ascending order, zero values at the bottom
result['feature_table'] = sorted(
result['feature_table'],
key=lambda x: (x['value'] == 0, x['value'])
)
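# Example of the sort key (hypothetical values): entries with values [0, 3, 1, 0]
# are reordered so the values appear as [1, 3, 0, 0] - non-zero features ascending,
# zero-valued features last.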
logger.info(f"Feature table: {result.get('feature_table', [])}")
logger.info(f"Prediction result: {result}")
return jsonify(result)
except Exception as e:
logger.error(f"Unexpected error in predict route: {e}")
logger.error(traceback.format_exc())
return jsonify({
"status": "error",
"message": "An unexpected error occurred",
"details": str(e)
}), 500
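# Example request (assuming the default port 7860):
# curl -X POST http://localhost:7860/predict -H "Content-Type: application/json" -d '{"url": "example.com"}'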
@app.route("/login", methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = request.form.get('username')
password = request.form.get('password')
# Just simulate successful login since we're not connecting to a real DB
session['user_id'] = 1
session['username'] = username
flash('Login successful', 'success')
return redirect(url_for('home'))
return render_template('weblogin.html')
@app.route('/register', methods=['GET', 'POST'])
def register():
if request.method == 'POST':
username = request.form.get('username')
email = request.form.get('email')
# Just simulate successful registration
flash('Registration successful! Please log in.', 'success')
return redirect(url_for('login'))
return render_template('weblogin.html', register=True)
@app.route('/logout')
def logout():
session.clear()
flash('You have been logged out successfully!', 'success')
return redirect(url_for('home'))
@app.route('/dashboard')
def dashboard():
"""User dashboard page"""
return render_template('dashboard.html')
@app.route("/analyze", methods=['GET', 'POST', 'OPTIONS'])
def analyze():
"""
Generate analysis report for a URL.
Forward the request to the backend instead of handling it directly.
"""
# Handle CORS preflight requests
if request.method == 'OPTIONS':
response = jsonify({'status': 'success'})
response.headers.add('Access-Control-Allow-Origin', '*')
response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
response.headers.add('Access-Control-Allow-Methods', 'GET,POST,OPTIONS')
return response
# Get the requested format (pdf or json)
report_format = request.args.get('format', '').lower()
# Extract URL from request
url = None
if request.method == 'POST':
if request.is_json:
data = request.get_json(force=True)
url = data.get('url', '')
elif request.form:
url = request.form.get('url', '')
elif request.data:
try:
data = json.loads(request.data.decode('utf-8'))
url = data.get('url', '')
except json.JSONDecodeError:
pass
else: # GET request
url = request.args.get('url', '')
if not url or not url.strip():
return jsonify({
"status": "error",
"message": "No URL provided",
"details": "Please enter a valid URL to analyze"
}), 400
# Ensure URL has a scheme
if not url.startswith(('http://', 'https://')):
url = 'http://' + url
try:
# Forward the request to the backend API
backend_url = os.environ.get('BACKEND_URL', 'http://localhost:5000').rstrip('/') + '/analyze'
# Prepare the request parameters
params = {}
if report_format:
params['format'] = report_format
# Send the request to the backend
logger.info(f"Forwarding analyze request to backend: {backend_url}")
if request.method == 'POST':
response = requests.post(
backend_url,
json={"url": url},
params=params,
headers={"Content-Type": "application/json"},
timeout=15 # 15-second cap (arbitrary value) so a hung backend cannot block this request indefinitely
)
else: # GET request
response = requests.get(
backend_url,
params={"url": url, **params},
timeout=15
)
# Check if the response was successful
if response.status_code == 200:
# Try to parse the response as JSON
try:
result = response.json()
return jsonify(result)
except ValueError:
# If the body couldn't be parsed as JSON, return the raw response
return response.text, 200, {'Content-Type': 'text/html'}
else:
# If the backend returned an error, log it and fall back to local analysis
logger.warning(f"Backend returned error {response.status_code}: {response.text}")
logger.info("Using local analysis as fallback")
# Fall back to local implementation using analyze_url
analysis_result = analyze_url(url)
return jsonify(analysis_result)
except requests.RequestException as e:
logger.error(f"Error connecting to backend API: {e}")
logger.info("Using local analysis as fallback")
# Fall back to local implementation using analyze_url
analysis_result = analyze_url(url)
return jsonify(analysis_result)
except Exception as e:
logger.error(f"Error generating analysis: {e}")
logger.error(traceback.format_exc())
return jsonify({
"status": "error",
"message": "Failed to generate analysis",
"details": str(e)
}), 500
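# Example requests (assuming the default port 7860; local analysis is used when the
# BACKEND_URL target is unreachable, and format is only honoured by the backend):
# curl "http://localhost:7860/analyze?url=example.com"
# curl "http://localhost:7860/analyze?url=example.com&format=pdf"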
@app.route("/test")
def test():
"""Test route to verify the Flask app is running properly"""
return jsonify({
"status": "success",
"message": "Integrated Flask app is running successfully!",
"model_loaded": get_model_instance() is not None,
"scaler_loaded": get_scaler_instance() is not None
})
@app.route('/diagnostic')
def diagnostic_page():
"""Serve the diagnostic page to test functionality"""
return render_template('diagnostic.html')
@app.route('/model-status', methods=['GET'])
def model_status():
"""Check the status of the model"""
model = get_model_instance()
scaler = get_scaler_instance()
status = {
"model_loaded": model is not None,
"scaler_loaded": scaler is not None,
"status": "operational" if model is not None and scaler is not None else "error",
"model_type": str(type(model)) if model else "None",
# Relies on the fallback model's summary() returning the string below; a real Keras
# model's summary() prints to stdout and returns None, so the comparison is False.
"using_fallback": hasattr(model, 'summary') and model.summary() == "Fallback model (SimpleModel)"
}
return jsonify(status)
@app.route('/debug', methods=['GET'])
def debug():
"""Debug endpoint showing environment and configuration"""
debug_info = {
"environment": {k: v for k, v in os.environ.items() if not k.startswith("_") and not "TOKEN" in k and not "SECRET" in k},
"model_path": os.environ.get('MODEL_FILE', 'models/fraud_detection_model.h5'),
"model_loaded": get_model_instance() is not None,
"scaler_loaded": get_scaler_instance() is not None,
"model_type": str(type(get_model_instance())) if get_model_instance() else "None"
}
return jsonify(debug_info)
# Function to fix dtype policy in model config
def fix_dtype_policy(config):
"""Fix issues with DTypePolicy deserialization"""
if isinstance(config, dict):
# Replace dtype objects with string representation
if 'dtype' in config and isinstance(config['dtype'], dict) and config['dtype'].get('class_name') == 'DTypePolicy':
config['dtype'] = 'float32'
# Recursively process nested configs
for key, value in config.items():
if isinstance(value, dict):
config[key] = fix_dtype_policy(value)
elif isinstance(value, list):
config[key] = [fix_dtype_policy(item) if isinstance(item, (dict, list)) else item for item in value]
elif isinstance(config, list):
config = [fix_dtype_policy(item) if isinstance(item, (dict, list)) else item for item in config]
return config
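# Illustrative transformation (hypothetical config fragment):
# {"dtype": {"class_name": "DTypePolicy", "config": {...}}} becomes {"dtype": "float32"},
# and the same replacement is applied recursively to nested dicts and lists.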
def safe_decode_model_config(raw_config):
"""Safely decode model configuration to handle any version compatibility issues."""
try:
# Parse the raw model config
config = json.loads(raw_config)
# Apply fixes to the config
config = fix_dtype_policy(config)
# Re-encode as JSON string
return json.dumps(config)
except Exception as e:
logger.error(f"Error processing model config: {e}")
# Return original if processing failed
return raw_config
def build_compatible_model(model_path):
"""Build a compatible model manually from the H5 file."""
try:
# Open the H5 file
with h5py.File(model_path, 'r') as h5file:
# Check if the model config exists
if 'model_config' in h5file.attrs:
# Get the model config as a JSON string
model_config = h5file.attrs['model_config']
# Fix compatibility issues in the config
fixed_config = safe_decode_model_config(model_config)
# Create a model from the fixed config
model = tf.keras.models.model_from_json(
fixed_config,
custom_objects={
'InputLayer': CompatibleInputLayer,
'FairnessConstraint': tf.keras.constraints.UnitNorm,
'FairnessPenalty': tf.keras.layers.Layer
}
)
# Load weights
model.load_weights(model_path)
logger.info("Built compatible model manually from H5 file")
return model
else:
logger.error("No model config found in H5 file")
return None
except Exception as e:
logger.error(f"Error building compatible model: {e}")
return None
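# Example usage (path taken from the MODEL_FILE default used in the /debug route; adjust as needed):
# model = build_compatible_model(os.environ.get('MODEL_FILE', 'models/fraud_detection_model.h5'))
# Note: CompatibleInputLayer is referenced in the custom_objects mapping above and must be
# defined or importable for this builder to work.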
@app.route('/debug-connection', methods=['GET', 'POST'])
def debug_connection():
"""Debugging endpoint for connection issues"""
try:
if request.method == 'POST':
# Echo back the request data
data = request.get_json() if request.is_json else {}
# Add additional debugging info
response_data = {
"status": "success",
"message": "Connection is working",
"timestamp": datetime.now().isoformat(),
"request_data": data,
"request_headers": dict(request.headers),
"content_type": request.content_type,
"method": request.method,
"environment": {
"python_version": sys.version,
"flask_version": flask.__version__,
"tensorflow_version": tf.__version__
}
}
return jsonify(response_data)
else:
# Simple GET response for connection testing
return jsonify({
"status": "success",
"message": "Connection is working",
"timestamp": datetime.now().isoformat()
})
except Exception as e:
logger.error(f"Error in debug-connection endpoint: {e}")
return jsonify({
"status": "error",
"message": str(e),
"traceback": traceback.format_exc()
}), 500
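# Example checks (assuming the default port 7860):
# curl http://localhost:7860/debug-connection
# curl -X POST http://localhost:7860/debug-connection -H "Content-Type: application/json" -d '{"ping": true}'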
# Run the app - modified for HuggingFace Spaces compatibility
if __name__ == "__main__":
# Preload model and scaler first: anything placed after app.run() would only execute
# once the server shuts down, so the preload must happen before starting the server.
try:
model = get_model_instance()
scaler = get_scaler_instance()
logger.info("Model and scaler preloaded successfully")
except Exception as e:
logger.error(f"Error preloading model: {str(e)}")
# For HuggingFace Spaces, we need to listen on 0.0.0.0:7860
port = int(os.environ.get('PORT', 7860))
app.run(host='0.0.0.0', port=port, debug=False)