############################### R Project #################################
######################### Fastag Fraud Detection ###########################
## 1) Descriptive analysis
## 2) Data Preprocessing
## 3) Data Visualization
## 4) Model Development
## 5) Create Model Pipeline
## 6) Model Deployment
## 1) Descriptive analysis
# Let's load dataset
df <- read.csv('FastagFraudDetection.csv')
# Let's look at the first rows (head() shows 6 by default)
head(df)
# Let's open the dataframe in the viewer
View(df)
# Let's look at dataframe info
str(df)
# Let's look at summary of dataframe
View(summary(df))
# Dimensions (rows and columns)
dim(df)
nrow(df)
ncol(df)
# Let's look at column names
names(df)
colnames(df)
# Let's look at target column
tail(df[,'Fraud_indicator'])
df[['Fraud_indicator']]
df$Fraud_indicator
# Let's look at vehicle columns
df[, c("Vehicle_Dimensions", "Vehicle_Speed",'Vehicle_Plate_Number','Vehicle_Type')]
# Select the first row
df[1, ]
# Select the first three rows
df[1:3,]
# Let's filter rows where the vehicle type is "Bus " (note the trailing space in the value being matched)
head(df[df$Vehicle_Type == "Bus ", ], n = 2)
# Let's use subset function
nrow(subset(df, Transaction_Amount > 300))
# Let's look at the count of missing values in each column
null_counts <- colSums(is.na(df))
null_counts
# OK, there is no missing data in the dataframe :)
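# (Optional check, not in the original workflow) read.csv() often stores blanks as ""
# rather than NA, so NA counts alone can miss empty values such as blank FastagIDs.
# A minimal sketch, assuming character columns may contain empty strings:
blank_counts <- sapply(df, function(col) sum(is.na(col) | trimws(as.character(col)) == ""))
blank_counts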
## 2) Data Preprocessing
# Let's divide the Timestamp column into Date and Time
# (df$Timestamp.split() is Python-style and does not exist in R; tidyr::separate() is used below)
# Load necessary libraries
#install.packages("tidyverse")
library(tidyverse)
# Let's check for rows with infinite values (only numeric columns can hold Inf)
numeric_cols <- sapply(df, is.numeric)
rows_with_inf <- apply(df[, numeric_cols, drop = FALSE], 1, function(row) any(is.infinite(row)))
View(df[rows_with_inf, ])
# Let's split Timestamp into Date and Time
df <- df %>%
  separate(Timestamp, into = c("Date", "Time"), sep = " ") # -- OK, great
# Let's capitalize the first letter of each word in Vehicle_Type
df$Vehicle_Type <- str_to_title(df$Vehicle_Type)
# Let's remove fastag id
df <- df[, !colnames(df) %in% c('FastagID')]
# Let's bin the transaction amount and amount paid columns
group <- function(x) {
  ifelse(x < 100, '<100',
         ifelse(x < 200, '100-200',
                ifelse(x < 300, '200-300', '300+')))
}
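# Quick illustrative check (sample values, not from the dataset): group() should return
# '<100', '100-200', '200-300' and '300+' for 50, 150, 250 and 350 respectively
group(c(50, 150, 250, 350))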
# Create a new column with transaction amount groups
df$Transaction_Amount_Group <- group(df$Transaction_Amount)
# SOLUTION 2:
# Let's define the amount paid groups
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')
# Let's create a new column with amount paid groups
df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = cut_points, labels = group_names, include.lowest = TRUE)
# Let's define the vehicle speed groups
cut_points <- c(0, 30, 60, 80 , 100 , Inf)
group_names <- c('<30', '30-60', '60-80', '80-100','100+')
# Let's create a new column with transaction amount groups
df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = cut_points, labels = group_names, include.lowest = TRUE)
# Let's compute the ratio of Amount_paid to Transaction_Amount
# (returns 0 when the transaction amount is missing or zero to avoid division errors)
divide <- function(x) {
  ifelse(is.na(x) | x == 0,
         0,
         round(df$Amount_paid / x, digits = 3))
}
df$Transaction_Amount_Ratio <- divide(df$Transaction_Amount)
# Let's separate the Geographical_Location column into two parts
# (values look like "34.0522118, 40.7128", so splitting on " " leaves a trailing comma on the first part)
df <- df %>%
  separate(Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
# Let's encode the fraud indicator as numeric for calculations
df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)
# Let's look at columns
colnames(df)
# Let's add a new column indicating weekend (1) vs. weekday (0)
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)
# Let's create a column with the day of the week (names depend on the session locale)
df$weekdays <- weekdays(df$Date)
# Let's flag PM (1) vs. AM (0) based on the hour
library(lubridate)   # hour() comes from lubridate
df$Time1 <- as.POSIXct(df$Time, format = "%H:%M")
df$clock <- ifelse(hour(df$Time1) >= 12 & hour(df$Time1) < 24, 1, 0)
df <- df[, !colnames(df) %in% c("Time1")]
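# (Optional check, not in the original workflow) a quick look at the engineered columns,
# assuming the steps above ran without warnings:
str(df[, c("Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group",
           "Transaction_Amount_Ratio", "weekend", "weekdays", "clock")])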
## 3) Data Visualization
# • Scatter Plot
plot(df$Transaction_Amount, df$Vehicle_Speed, main = "Transaction Amount & Vehicle Speed", xlab = "Amount", ylab = "Speed", col = "darkgreen", pch = 1, cex = 1.2, font.main = 9)
# • Histogram
hist(df$Amount_paid, main = "Amount Paid", xlab = "Amount Paid", ylab = "Frequency", col = "green", border = "blue", font.main = 3)
# • Bar Plot
counts <- table(df$Vehicle_Type)
bar_positions <- barplot(counts, main = "Frequency of Vehicle Types", xlab = "Vehicle Type", ylab = "Frequency", col = "blue")
text(x = bar_positions, y = counts + 11, labels = counts, pos = 3, cex = 0.9, col = "black", xpd = TRUE)
# • Bar Plot
lane_type_avg <- aggregate(Fraud_Number ~ Lane_Type, data = df, mean)
barplot(lane_type_avg$Fraud_Number, main = "Fraud Rate by Lane Type", xlab = "Lane Type", ylab = "Fraud Rate", col = "blue", names.arg = lane_type_avg$Lane_Type)
# • Box Plot
boxplot(df$Transaction_Amount, df$Amount_paid, names = c("Transaction Amount", "Amount Paid"), main = "Transaction Amount & Amount Paid", col = c("blue", "green"), border = "black", font.main = 3)
boxplot(df$Vehicle_Speed, names = c("Speed"), main = "Vehicle Speed", col = c("green"), border = "black", font.main = 3)
boxplot(df$Transaction_Amount_Ratio, names = c("Ratio"), main = "Transaction Amount Ratio", col = c("red"), border = "blue", font.main = 3)
# • Pie Chart
pie(table(df$Transaction_Amount_Group), labels = levels(factor(df$Transaction_Amount_Group)) , main = "Transaction Amount Group", col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))), border = "darkred", font.main = 4)
# • Pie Chart (3D)
#install.packages("plotrix")
library("plotrix")
pie3D(table(df$Transaction_Amount_Group), labels = levels(factor(df$Transaction_Amount_Group)), main = "Transaction Amount Group", col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))), border = "darkred", font.main = 4)
# • Line Plot
library("tidyverse")
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
daily_sum <- aggregate(Transaction_Amount ~ Date, data = df, sum)
plot(daily_sum$Date, daily_sum$Transaction_Amount, type = "l",
main = "Daily Sum of Transaction Amount",
xlab = "Date", ylab = "Sum of Transaction Amount",
col = "red", lwd = 2, font.main = 2)
# Let's analyze by month
df$MonthName <- format(df$Date, "%B")   # month names depend on the session locale
monthly_avg <- aggregate(Transaction_Amount ~ MonthName, data = df, mean)
# The levels below are Azerbaijani month names (January-December), matching the author's locale;
# in an English locale, the built-in month.name vector can be used instead
monthly_avg$MonthName <- factor(monthly_avg$MonthName,
                                levels = c("Yanvar", "Fevral", "Mart", "Aprel", "May", "İyun",
                                           "İyul", "Avqust", "Sentyabr", "Oktyabr", "Noyabr", "Dekabr"),
                                ordered = TRUE)
plot(monthly_avg$MonthName, monthly_avg$Transaction_Amount, type = "l",
     main = "Average Transaction Amount by Month",
     xlab = "Month", ylab = "Average Transaction Amount",
     col = "red", lwd = 2, font.main = 2)
# • Scatter plot with ggplot2
library(ggplot2)
ggplot(df, aes(x = Transaction_ID, y = Transaction_Amount)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Transaction Id & Amount", x = "Transaction Id", y = "Amount") +
  theme_bw()
# • Bar plot with ggplot2
vehicle_type_avg <- aggregate(Amount_paid ~ Vehicle_Type, data = df, mean)
ggplot(vehicle_type_avg, aes(x = Vehicle_Type, y = Amount_paid)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
labs(title = "Average Amount Paid by Vehicle Type", x = "Vehicle Type", y = "Average Amount Paid") +
theme_bw()
# Let's find transaction counts by longitude and latitude
transaction_counts <- aggregate(Transaction_ID ~ Longitude + Latitude, data = df, FUN = length)
# Plot the map
install.packages("maps")
library(maps)
world <- map_data("world")
# Strip any stray commas left from the split before converting to numeric
transaction_counts$Longitude <- as.numeric(gsub(",", "", transaction_counts$Longitude))
transaction_counts$Latitude <- as.numeric(gsub(",", "", transaction_counts$Latitude))
# Plot the map
ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group), fill = "lightgray", color = "black") +
geom_point(data = transaction_counts, aes(x = Longitude, y = Latitude, size = Transaction_ID), color = "red") +
labs(title = "Transaction Count by Location", x = "Longitude", y = "Latitude", size = "Transaction Count") +
theme_minimal()
## 4) Model Development
# Let's analyze the business need: Analyzing Fastag fraud involves identifying
# patterns and trends in fraudulent transactions to enhance system
# security and user trust. By leveraging data analytics and machine
# learning, businesses can predict and prevent future fraudulent activities.
# This proactive approach helps mitigate financial losses and ensures
# the integrity of the Fastag system. Ultimately, maintaining a secure
# and reliable Fastag system promotes user satisfaction and supports
# efficient toll collection.
# Let's remove unnecessary columns from dataframe
df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID")]
# df <- df[,!colnames(df) %in% c('weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]
# Let's dummy some columns
library(dplyr)
cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
"Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")
# Creating dummy variables
df_dummies <- df %>%
select(all_of(cols_to_dummy)) %>%
model.matrix(~ . - 1, data = .) %>%
as.data.frame()
# Combining the dummies with the original dataframe excluding the original columns
df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), df_dummies)
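# (Optional check, not in the original workflow) with ~ . - 1, model.matrix() keeps every
# level of the first factor but drops the first level of each remaining factor, so it is
# worth confirming which dummy columns were actually created:
colnames(df_dummies)
dim(df)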
# install.packages("randomForest")
library(randomForest)
library(caret)
# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
trainData <- df[trainIndex,]
testData <- df[-trainIndex,]
# Separate inputs and target for training
trainInput <- trainData[, !colnames(trainData) %in% c('Fraud_Number')]
trainTarget <- trainData$Fraud_Number
# Train the random forest model
rf_model <- randomForest(trainInput, trainTarget, ntree = 100, mtry = 3, importance = TRUE)
# Predict on the test set
testInput <- testData[, !colnames(testData) %in% c('Fraud_Number')]
testTarget <- testData$Fraud_Number
predictions <- predict(rf_model, testInput)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
# Evaluate model performance
binary_predictions <- factor(binary_predictions, levels = c(0, 1))
testTarget <- factor(testTarget, levels = c(0, 1))
# Create the confusion matrix
conf_matrix <- confusionMatrix(binary_predictions, testTarget)
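# (Optional, not in the original workflow) print the confusion matrix and pull the headline
# metrics; caret stores accuracy in $overall and sensitivity/specificity in $byClass:
print(conf_matrix)
conf_matrix$overall["Accuracy"]
conf_matrix$byClass[c("Sensitivity", "Specificity")]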
# Variable importance
importance(rf_model)
varImpPlot(rf_model)
# Let's remove unimportant columns
# (the weekday dummies below carry Azerbaijani day names because weekdays() is locale-dependent)
df <- df[,!colnames(df) %in% c('weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]
# -- To retrain on the reduced feature set, re-run the training block above
# Let's predict on a single new observation
new_data <- testInput[1,]
View(new_data)
predictions <- predict(rf_model, new_data)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0) # -------------------
predictions
# install.packages("pROC")
# install.packages("ggplot2")
# library(pROC)
# library(ggplot2)
# Let's calculate the ROC curve and AUC
roc_obj <- roc(testData$Fraud_Number, predictions)
auc_value <- auc(roc_obj)
roc_df <- data.frame(
tpr = roc_obj$sensitivities,
fpr = 1 - roc_obj$specificities,
thresholds = roc_obj$thresholds
)
ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line(color = "blue") +
geom_abline(linetype = "dashed", color = "red") +
labs(title = paste("ROC Curve (AUC =", round(auc_value, 3), ")"),
x = "False Positive Rate",
y = "True Positive Rate") +
theme_minimal() # Great ✅
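# (Optional, not in the original workflow) pROC can also suggest an operating threshold;
# a minimal sketch using Youden's J statistic via coords():
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"), best.method = "youden")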
## 5) Create Model Pipeline
# Function 1:
# Let's create the data preprocessing function
data_preprocessing_function <- function(df){
df <- separate(df, Timestamp, into = c("Date", "Time"), sep = " ")
df$Vehicle_Type <- str_to_title(df$Vehicle_Type)
df <- df[, !colnames(df) %in% c('FastagID')]
group <- function(x) {
ifelse(x < 100, '<100',
ifelse(x < 200, '100-200',
ifelse(x < 300, '200-300', '300+')))
}
df$Transaction_Amount_Group <- group(df$Transaction_Amount)
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')
# Let's create a new column with amount paid groups
df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = cut_points, labels = group_names, include.lowest = TRUE)
# Note: these speed bins (0/100/200/300) differ from the 30/60/80/100 bins used in the
# exploratory section; they match the dummy column names expected by the deployment code below
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')
df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = cut_points, labels = group_names, include.lowest = TRUE)
divide <- function(x) {
ifelse(is.na(x) | x == 0,
0,
round(df$Amount_paid / x, digits = 3)
)
}
df$Transaction_Amount_Ratio <- divide(df$Transaction_Amount)
df <- separate(df, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)
df$weekdays <- weekdays(df$Date)
df$Time <- as.POSIXct(df$Time, format = "%H:%M")
df$clock <- ifelse(hour(df$Time) >= 12 & hour(df$Time) < 24, 1, 0)
df <- df[, !colnames(df) %in% c("Time1")]
return(df)
}
# Function 2:
# Let's create the model training and deployment function (trains, evaluates, and saves the model)
model_deployment_function <- function(df, model_file){
df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID",'weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]
cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
"Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")
df_dummies <- df %>%
select(all_of(cols_to_dummy)) %>%
model.matrix(~ . - 1, data = .) %>%
as.data.frame()
df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), df_dummies)
df_main <<- df
set.seed(123)
trainIndex <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
trainData <- df[trainIndex,]
testData <- df[-trainIndex,]
trainInput <- trainData[, !colnames(trainData) %in% c('Fraud_Number')]
trainTarget <- trainData$Fraud_Number
rf_model <- randomForest(trainInput, trainTarget, ntree = 100, mtry = 3, importance = TRUE)
testInput <- testData[, !colnames(testData) %in% c('Fraud_Number')]
testTarget <- testData$Fraud_Number
predictions <- predict(rf_model, testInput)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
# Let's evaluate model
binary_predictions <- factor(binary_predictions, levels = c(0, 1))
testTarget <- factor(testTarget, levels = c(0, 1))
conf_matrix <- confusionMatrix(binary_predictions, testTarget)
print(conf_matrix)
# Save the model
saveRDS(rf_model, model_file)
return(df)
}
# Let's apply the preprocessing function to the dataframe
# (use a fresh copy of the raw data: the df in memory has already been split, dummied and trimmed above)
df_processed <- data_preprocessing_function(read.csv('FastagFraudDetection.csv'))
# Let's create model and save model file as rds
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
processed_df <- model_deployment_function(df_processed, model_file)
## 6) Model Deployment
# Load the saved random forest model
load_model_function <- function(model_file) {
rf_model <- readRDS(model_file)
return(rf_model)
}
# Function to make predictions using the loaded model
predict_with_model <- function(rf_model, new_data) {
# Preprocess new data
new_data <- separate(new_data, Timestamp, into = c("Date", "Time"), sep = " ")
new_data$Vehicle_Type <- str_to_title(new_data$Vehicle_Type)
new_data <- new_data[, !colnames(new_data) %in% c('FastagID')]
group <- function(x) {
ifelse(x < 100, '<100',
ifelse(x < 200, '100-200',
ifelse(x < 300, '200-300', '300+')))
}
new_data$Transaction_Amount_Group <- group(new_data$Transaction_Amount)
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')
new_data$Amount_Paid_Group <- cut(new_data$Amount_paid, breaks = cut_points, labels = group_names, include.lowest = TRUE)
new_data$Vehicle_Speed_Group <- cut(new_data$Vehicle_Speed, breaks = cut_points, labels = group_names, include.lowest = TRUE)
divide <- function(x) {
ifelse(is.na(x) | x == 0, 0, round(new_data$Amount_paid / x, digits = 3))
}
new_data$Transaction_Amount_Ratio <- divide(new_data$Transaction_Amount)
new_data <- separate(new_data, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
new_data$Date <- as.Date(new_data$Date, format = "%m/%d/%Y")
new_data$weekend <- ifelse(weekdays(new_data$Date) %in% c("Saturday", "Sunday"), 1, 0)
new_data$weekdays <- weekdays(new_data$Date)
new_data$Time <- as.POSIXct(new_data$Time, format = "%H:%M")
new_data$clock <- ifelse(hour(new_data$Time) >= 12 & hour(new_data$Time) < 24, 1, 0)
new_data <- new_data[, !colnames(new_data) %in% c("Time1", "Time", "Fraud_indicator", "Date", "Longitude", "Latitude", "Vehicle_Plate_Number", "Transaction_ID", "weekend", "weekdayscümə axşamı", "weekdaysçərşənbə axşamı", "weekdaysşənbə")]
cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions", "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")
# Ensure each categorical variable has at least two levels
for (col in cols_to_dummy) {
if (length(unique(new_data[[col]])) < 2) {
new_data[[col]] <- factor(new_data[[col]], levels = c(unique(new_data[[col]]), "dummy_level"))
}
}
new_data_dummies <- new_data %>%
select(all_of(cols_to_dummy)) %>%
model.matrix(~ . - 1, data = .) %>%
as.data.frame()
new_data <- bind_cols(new_data %>% select(-all_of(cols_to_dummy)), new_data_dummies)
# List of columns to be checked and added if not present
cols_to_add <- c(
"Transaction_Amount", "Amount_paid", "Vehicle_Speed", "Transaction_Amount_Ratio",
"clock", "Vehicle_TypeBus ", "Vehicle_TypeCar", "Vehicle_TypeMotorcycle",
"Vehicle_TypeSedan", "Vehicle_TypeSuv", "Vehicle_TypeTruck", "Vehicle_TypeVan",
"Lane_TypeRegular", "TollBoothIDB-102", "TollBoothIDC-103", "TollBoothIDD-104",
"TollBoothIDD-105", "TollBoothIDD-106", "Vehicle_DimensionsMedium",
"Vehicle_DimensionsSmall", "Transaction_Amount_Group100-200",
"Transaction_Amount_Group200-300", "Transaction_Amount_Group300+",
"Amount_Paid_Group100-200", "Amount_Paid_Group200-300", "Amount_Paid_Group300+",
"Vehicle_Speed_Group100-200", "Vehicle_Speed_Group200-300", "Vehicle_Speed_Group300+",
"weekdaysbazar ertəsi", "weekdayscümə", "weekdaysçərşənbə"
)
# Add missing columns with value 0
missing_cols <- setdiff(cols_to_add, colnames(new_data))
if (length(missing_cols) > 0) {
new_data[, missing_cols] <- 0
}
# Select input features
new_input <- new_data[, cols_to_add]
# Make predictions
predictions <- predict(rf_model, new_input)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
return(binary_predictions)
}
# Example usage:
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
rf_model <- load_model_function(model_file)
# Example new data
new_data <- data.frame(
Transaction_ID = 1,
Timestamp = c("1/6/2023 11:20"),
Vehicle_Type = c("Car"),
FastagID = c("12345"),
TollBoothID = c("A-101"),
Lane_Type = c("Express"),
Vehicle_Dimensions = c("Medium"),
Transaction_Amount = c(150),
Amount_paid = c(110),
Geographical_Location = c("34.0522118, 40.7128"),
Vehicle_Speed = c(60),
Vehicle_Plate_Number = c("ABC123")
)
# Make predictions
predictions <- predict_with_model(rf_model, new_data)
print(predictions)
View(predictions)
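# (Optional, not in the original workflow) map the 0/1 output back to labels,
# assuming the non-fraud label in the raw data is "Not Fraud":
ifelse(predictions == 1, "Fraud", "Not Fraud")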
colnames(df_main)