Spaces:
Sleeping
Sleeping
Commit
•
656aee0
1
Parent(s):
9dce5e4
Upload FastagFraudDetectionProject.R
Browse files- FastagFraudDetectionProject.R +546 -0
FastagFraudDetectionProject.R
ADDED
@@ -0,0 +1,546 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
############################### R Project #################################

######################### Fastag Fraud Detection ###########################

## 1) Descriptive analysis
## 2) Data Preprocessing
## 3) Data Visualization
## 4) Model Development
## 5) Create Model Pipeline
## 6) Model Deployment


## 1) Descriptive analysis

# Read the raw dataset from the working directory
df <- read.csv("FastagFraudDetection.csv")

# Peek at the first rows
head(df)

# Open the full table in the data viewer
View(df)

# Column types and a sample of values
str(df)

# Per-column summary statistics, shown in the viewer
View(summary(df))

# Dimensions (rows and columns)
dim(df)
nrow(df)
ncol(df)

# Column names (two equivalent ways)
names(df)
colnames(df)

# Inspect the target column three equivalent ways
tail(df[, "Fraud_indicator"])
df[["Fraud_indicator"]]
df$Fraud_indicator

# Vehicle-related columns only
df[, c("Vehicle_Dimensions", "Vehicle_Speed", "Vehicle_Plate_Number", "Vehicle_Type")]

# Select the first row
df[1, ]

# Select the first three rows
df[1:3, ]

# Rows where the vehicle type is "Bus " — note the trailing space,
# which is how the value appears in the raw data
head(df[df$Vehicle_Type == "Bus ", ], n = 2)

# How many transactions exceed 300?
nrow(subset(df, Transaction_Amount > 300))

# Count missing values per column
na_counts <- colSums(is.na(df))
na_counts

# Okey, no missing data in this dataframe :)
66 |
+
## 2) Data Preprocessing

# BUG FIX: removed the invalid Python-style line
#   df[['Date','Time']] <- df$Timestamp.split()
# (not R syntax); the separate() call below performs the intended split.

# Load the libraries this section relies on (%>%, separate(), str_to_title(), hour())
# install.packages("tidyverse")   # run once if not installed
library(tidyverse)
library(lubridate)   # hour(); loaded explicitly in case tidyverse < 2.0 core

# Check for rows containing infinite values in any column
rows_with_inf <- apply(df, 1, function(row) any(is.infinite(row)))
View(df[rows_with_inf, ])

# Split Timestamp ("m/d/Y H:M") into separate Date and Time columns
df <- df %>%
  separate(Timestamp, into = c("Date", "Time"), sep = " ") # -- Okey, Great

# Capitalize only the first letter of each word in Vehicle_Type
df$Vehicle_Type <- str_to_title(df$Vehicle_Type)

# Drop the Fastag id column
df <- df[, !colnames(df) %in% c('FastagID')]

# SOLUTION 1: bin the transaction amount via nested ifelse()
group <- function(x) {
  ifelse(x < 100, '<100',
         ifelse(x < 200, '100-200',
                ifelse(x < 300, '200-300', '300+')))
}

# Create a new column with transaction amount groups
df$Transaction_Amount_Group <- group(df$Transaction_Amount)


# SOLUTION 2: bin via cut()

# Define the amount-paid groups
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')

# Create a new column with amount-paid groups
df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = cut_points,
                            labels = group_names, include.lowest = TRUE)

# Define the vehicle-speed groups
cut_points <- c(0, 30, 60, 80, 100, Inf)
group_names <- c('<30', '30-60', '60-80', '80-100', '100+')

# Create a new column with vehicle-speed groups
df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = cut_points,
                              labels = group_names, include.lowest = TRUE)

# Ratio of amount paid to transaction amount; 0 where the denominator is NA or 0.
# FIX: parameterized both operands instead of reading df$Amount_paid from the
# enclosing scope, so the function no longer depends on a global.
divide <- function(numerator, denominator) {
  ifelse(is.na(denominator) | denominator == 0,
         0,
         round(numerator / denominator, digits = 3))
}

df$Transaction_Amount_Ratio <- divide(df$Amount_paid, df$Transaction_Amount)

# Split Geographical_Location into longitude and latitude.
# NOTE(review): the values look like "34.0522118, 40.7128", so splitting on " "
# leaves a trailing comma on Longitude — confirm the intended separator.
df <- df %>%
  separate(Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")

# Numeric target for modelling: 1 = Fraud, 0 = Not Fraud
df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)

# Look at columns
colnames(df)

# Calendar features: weekend flag and day-of-week name
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)

# Day-of-week column (locale-dependent names)
df$weekdays <- weekdays(df$Date)

# PM/AM flag: 1 for hours 12-23, 0 otherwise.
# BUG FIX: hour() was called on the character column df$Time; it must use the
# parsed POSIXct column Time1, which is dropped again afterwards.
df$Time1 <- as.POSIXct(df$Time, format = "%H:%M")
df$clock <- ifelse(hour(df$Time1) >= 12 & hour(df$Time1) < 24, 1, 0)
df <- df[, !colnames(df) %in% c("Time1")]
## 3) Data Visualization

# • Scatter Plot
plot(df$Transaction_Amount, df$Vehicle_Speed,
     main = "Transaction Amount & Vehicle Speed",
     xlab = "Amount", ylab = "Speed",
     col = "darkgreen", pch = 1, cex = 1.2, font.main = 9)

# • Histogram
hist(df$Amount_paid, main = "Amount Paid Group",
     xlab = "Groups", ylab = "Frequency",
     col = "green", border = "blue", font.main = 3)

# • Bar Plot
# BUG FIX: `counts` was used in text() without ever being defined; compute the
# frequency table first and reuse it for both the bars and the labels.
counts <- table(df$Vehicle_Type)
barplot(counts, main = "Frequency of Vehicle Types",
        xlab = "Vehicle Type", ylab = "Frequency", col = "blue")
text(x = seq_along(counts), y = counts + 11, labels = counts,
     pos = 3, cex = 0.9, col = "black", xpd = TRUE)

# • Bar Plot: mean fraud rate per lane type
lane_type_avg <- aggregate(Fraud_Number ~ Lane_Type, data = df, mean)
barplot(lane_type_avg$Fraud_Number, main = "Frequency of Fraud by Lane Type",
        xlab = "Lane Type", ylab = "Frequency", col = "blue",
        names.arg = lane_type_avg$Lane_Type)

# • Box Plots
boxplot(df$Transaction_Amount, df$Amount_paid,
        names = c("Transaction Amount", "Amount Paid"),
        main = "Transaction Amount & Amount Paid",
        col = c("blue", "green"), border = "black", font.main = 3)
boxplot(df$Vehicle_Speed, names = c("Speed"), main = "Vehicle Speed",
        col = c("green"), border = "black", font.main = 3)
boxplot(df$Transaction_Amount_Ratio, names = c("Ratio"),
        main = "Transaction Amount Ratio",
        col = c("red"), border = "blue", font.main = 3)

# • Pie Chart
pie(table(df$Transaction_Amount_Group),
    labels = levels(factor(df$Transaction_Amount_Group)),
    main = "Transaction Amount Group",
    col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))),
    border = "darkred", font.main = 4)

# • 3D Pie Chart
# install.packages("plotrix")   # run once if not installed
# BUG FIX: pie3D() comes from plotrix, but the library() call was commented out.
library(plotrix)
pie3D(table(df$Transaction_Amount_Group),
      labels = levels(factor(df$Transaction_Amount_Group)),
      main = "Transaction Amount Group",
      col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))),
      border = "darkred", font.main = 4)

# • Line Plot: daily sum of transaction amounts
library("tidyverse")
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
daily_sum <- aggregate(Transaction_Amount ~ Date, data = df, sum)
plot(daily_sum$Date, daily_sum$Transaction_Amount, type = "l",
     main = "Daily Sum of Transaction Amount",
     xlab = "Date", ylab = "Sum of Transaction Amount",
     col = "red", lwd = 2, font.main = 2)

# Monthly analysis of the average transaction amount
df$MonthName <- format(df$Date, "%B")
monthly_avg <- aggregate(Transaction_Amount ~ MonthName, data = df, mean)

# NOTE(review): these levels are Azerbaijani month names; format(..., "%B")
# only produces them under an Azerbaijani locale — confirm on other machines.
monthly_avg$MonthName <- factor(monthly_avg$MonthName,
                                levels = c("Yanvar", "Fevral", "Mart", "Aprel", "May", "İyun",
                                           "İyul", "Avqust", "Sentyabr", "Oktyabr", "Noyabr", "Dekabr"),
                                ordered = TRUE)

# BUG FIX: the title/labels were copy-pasted from the daily plot; this chart
# shows the MONTHLY AVERAGE, so label it accordingly.
plot(monthly_avg$MonthName, monthly_avg$Transaction_Amount, type = "l",
     main = "Monthly Average of Transaction Amount",
     xlab = "Month", ylab = "Average Transaction Amount",
     col = "red", lwd = 2, font.main = 2)

# • Scatter plot with ggplot2
# FIX: use bare column names inside aes() instead of df$... (the df$ form
# bypasses ggplot's data masking and breaks faceting/grouping).
library(ggplot2)
ggplot(df, aes(x = Transaction_ID, y = Transaction_Amount)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Transaction Id & Amount", x = "Transaction Id", y = "Amount") +
  theme_bw()

# • Bar plot with ggplot2: mean amount paid per vehicle type
vehicle_type_avg <- aggregate(Amount_paid ~ Vehicle_Type, data = df, mean)

ggplot(vehicle_type_avg, aes(x = Vehicle_Type, y = Amount_paid)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  labs(title = "Average Amount Paid by Vehicle Type",
       x = "Vehicle Type", y = "Average Amount Paid") +
  theme_bw()


# Transaction counts by longitude and latitude
transaction_counts <- aggregate(Transaction_ID ~ Longitude + Latitude, data = df, FUN = length)

# Plot the map
# FIX: do not install unconditionally on every run; install once, then load.
# install.packages("maps")
library(maps)

world <- map_data("world")
# NOTE(review): Longitude values may retain a trailing comma from the split on
# " " in section 2, in which case as.numeric() yields NA — verify upstream.
transaction_counts$Longitude <- as.numeric(as.character(transaction_counts$Longitude))
transaction_counts$Latitude <- as.numeric(as.character(transaction_counts$Latitude))

# Plot the map
ggplot() +
  geom_polygon(data = world, aes(x = long, y = lat, group = group), fill = "lightgray", color = "black") +
  geom_point(data = transaction_counts, aes(x = Longitude, y = Latitude, size = Transaction_ID), color = "red") +
  labs(title = "Transaction Count by Location", x = "Longitude", y = "Latitude", size = "Transaction Count") +
  theme_minimal()
235 |
+
## 4) Model Development

# Business need: identify patterns and trends in fraudulent Fastag transactions
# so the system can predict and prevent future fraud, mitigating financial
# losses and preserving user trust in the toll-collection platform.

# Drop columns that should not be model inputs (identifiers, leakage, raw text)
df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID")]

# One-hot encode the categorical columns
library(dplyr)
cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
                   "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

# Creating dummy variables (model.matrix with -1 drops the intercept column)
df_dummies <- df %>%
  select(all_of(cols_to_dummy)) %>%
  model.matrix(~ . - 1, data = .) %>%
  as.data.frame()

# Combine the dummies with the original dataframe, excluding the raw columns
df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), df_dummies)

# install.packages("randomForest")
library(randomForest)
library(caret)

# 80/20 train/test split, seeded for reproducibility
set.seed(123)
trainIndex <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
trainData <- df[trainIndex, ]
testData <- df[-trainIndex, ]

# Separate inputs and target for training
trainInput <- trainData[, !colnames(trainData) %in% c('Fraud_Number')]
trainTarget <- trainData$Fraud_Number

# Regression forest on the 0/1 target: predictions are fraud probabilities
rf_model <- randomForest(trainInput, trainTarget, ntree = 100, mtry = 3, importance = TRUE)

# Predict on the test set and threshold at 0.5
testInput <- testData[, !colnames(testData) %in% c('Fraud_Number')]
testTarget <- testData$Fraud_Number
predictions <- predict(rf_model, testInput)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)

# Evaluate model performance
binary_predictions <- factor(binary_predictions, levels = c(0, 1))
testTarget <- factor(testTarget, levels = c(0, 1))
# Create the confusion matrix
conf_matrix <- confusionMatrix(binary_predictions, testTarget)

# Variable importance
importance(rf_model)
varImpPlot(rf_model)

# Remove the columns found unimportant above.
# NOTE(review): the weekday dummy names are locale-dependent (Azerbaijani
# weekdays()) — confirm they match the dummies produced on this machine.
df <- df[, !colnames(df) %in% c('weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]
# -- the model could then be retrained on the reduced feature set

# Predict a single new observation.
# BUG FIX: the original overwrote `predictions` (the full test-set vector) with
# this single-row prediction, so the ROC computation below received a length-1
# predictor. Use dedicated variables for the one-off prediction instead.
new_obs <- testInput[1, ]
View(new_obs)
new_prediction <- predict(rf_model, new_obs)
new_binary_prediction <- ifelse(new_prediction >= 0.5, 1, 0)
new_prediction

# install.packages("pROC")
# install.packages("ggplot2")
# BUG FIX: roc()/auc() are used below, so pROC must actually be loaded.
library(pROC)
library(ggplot2)

# ROC curve and AUC on the held-out test set
roc_obj <- roc(testData$Fraud_Number, predictions)
auc_value <- auc(roc_obj)

roc_df <- data.frame(
  tpr = roc_obj$sensitivities,
  fpr = 1 - roc_obj$specificities,
  thresholds = roc_obj$thresholds
)

ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line(color = "blue") +
  geom_abline(linetype = "dashed", color = "red") +
  labs(title = paste("ROC Curve (AUC =", round(auc_value, 3), ")"),
       x = "False Positive Rate",
       y = "True Positive Rate") +
  theme_minimal() # Great ✅
|
332 |
+
|
333 |
+
|
334 |
+
## 5) Create Model Pipeline

# Function 1:
# Reproduce the section-2 feature engineering as one reusable step.
# Takes the raw dataframe and returns it with all engineered columns added.
data_preprocessing_function <- function(df) {
  # Timestamp -> Date / Time, tidy the vehicle type, drop the tag id
  df <- separate(df, Timestamp, into = c("Date", "Time"), sep = " ")
  df$Vehicle_Type <- str_to_title(df$Vehicle_Type)
  df <- df[, !colnames(df) %in% c('FastagID')]

  # Bin the transaction amount with nested ifelse()
  bin_amount <- function(v) {
    ifelse(v < 100, '<100',
           ifelse(v < 200, '100-200',
                  ifelse(v < 300, '200-300', '300+')))
  }
  df$Transaction_Amount_Group <- bin_amount(df$Transaction_Amount)

  # Bin the amount paid with cut()
  amount_breaks <- c(0, 100, 200, 300, Inf)
  amount_labels <- c('<100', '100-200', '200-300', '300+')
  df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = amount_breaks,
                              labels = amount_labels, include.lowest = TRUE)

  # NOTE(review): unlike section 2 (which used 0/30/60/80/100 speed breaks),
  # the speed groups here reuse the AMOUNT break points and labels. The trained
  # model's dummy-column names depend on this, so behavior is preserved as-is —
  # confirm whether this was intentional.
  speed_breaks <- c(0, 100, 200, 300, Inf)
  speed_labels <- c('<100', '100-200', '200-300', '300+')
  df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = speed_breaks,
                                labels = speed_labels, include.lowest = TRUE)

  # Amount_paid / denominator ratio; 0 where the denominator is NA or 0
  # (reads df$Amount_paid from the enclosing environment, as before)
  safe_ratio <- function(denom) {
    ifelse(is.na(denom) | denom == 0,
           0,
           round(df$Amount_paid / denom, digits = 3))
  }
  df$Transaction_Amount_Ratio <- safe_ratio(df$Transaction_Amount)

  # Location split, numeric target, calendar features
  df <- separate(df, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
  df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)
  df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
  df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)
  df$weekdays <- weekdays(df$Date)

  # Parse the time in place and flag PM hours (12-23) in `clock`
  df$Time <- as.POSIXct(df$Time, format = "%H:%M")
  df$clock <- ifelse(hour(df$Time) >= 12 & hour(df$Time) < 24, 1, 0)
  # No-op guard kept from the original: there is no Time1 column here
  df <- df[, !colnames(df) %in% c("Time1")]

  return(df)
}
|
375 |
+
|
376 |
+
# Function 2:
# Train, evaluate, and persist the random forest model.
# Side effects: assigns the model-ready frame to the global `df_main` (used at
# the end of the script), prints the confusion matrix, and writes `model_file`.
# Returns the model-ready dataframe.
model_deployment_function <- function(df, model_file) {
  # Drop non-feature columns (ids, leakage, raw text, unimportant dummies)
  df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID",'weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]

  # One-hot encode the categorical features
  cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
                     "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

  dummy_frame <- df %>%
    select(all_of(cols_to_dummy)) %>%
    model.matrix(~ . - 1, data = .) %>%
    as.data.frame()

  df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), dummy_frame)

  # Expose the model-ready frame globally (consumed at the end of the script)
  df_main <<- df

  # Seeded 80/20 train/test split
  set.seed(123)
  split_idx <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
  train_set <- df[split_idx, ]
  test_set <- df[-split_idx, ]

  train_x <- train_set[, !colnames(train_set) %in% c('Fraud_Number')]
  train_y <- train_set$Fraud_Number

  # Regression forest on the 0/1 target: predictions are fraud probabilities
  rf_model <- randomForest(train_x, train_y, ntree = 100, mtry = 3, importance = TRUE)

  test_x <- test_set[, !colnames(test_set) %in% c('Fraud_Number')]
  test_y <- test_set$Fraud_Number

  # Probabilities -> 0/1 labels at the 0.5 threshold
  probs <- predict(rf_model, test_x)
  preds <- ifelse(probs >= 0.5, 1, 0)

  # Evaluate on the held-out 20%
  preds <- factor(preds, levels = c(0, 1))
  test_y <- factor(test_y, levels = c(0, 1))
  conf_matrix <- confusionMatrix(preds, test_y)
  print(conf_matrix)

  # Persist the fitted model
  saveRDS(rf_model, model_file)

  return(df)
}
|
418 |
+
|
419 |
+
|
420 |
+
|
421 |
+
# Run the preprocessing pipeline over the raw dataframe
df_processed <- data_preprocessing_function(df)

# Train + evaluate the model and save it as an .rds file.
# NOTE(review): absolute, machine-specific path — a relative path (or
# file.path()) would make the script portable.
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
processed_df <- model_deployment_function(df_processed, model_file)
|
427 |
+
|
428 |
+
|
429 |
+
|
430 |
+
## 6) Model Deployment

# Load a random forest model that was previously written with saveRDS().
# `model_file`: path to the .rds file. Returns the deserialized model object.
load_model_function <- function(model_file) {
  readRDS(model_file)
}
|
437 |
+
|
438 |
+
# Make predictions with a loaded model on raw (unprocessed) rows.
# Re-applies the training-time feature engineering to `new_data`, aligns the
# dummy columns with the trained feature set, and returns 0/1 labels.
predict_with_model <- function(rf_model, new_data) {
  # --- Recreate the training-time preprocessing ---
  new_data <- separate(new_data, Timestamp, into = c("Date", "Time"), sep = " ")
  new_data$Vehicle_Type <- str_to_title(new_data$Vehicle_Type)
  new_data <- new_data[, !colnames(new_data) %in% c('FastagID')]

  # Bin the transaction amount with nested ifelse()
  bin_amount <- function(v) {
    ifelse(v < 100, '<100',
           ifelse(v < 200, '100-200',
                  ifelse(v < 300, '200-300', '300+')))
  }
  new_data$Transaction_Amount_Group <- bin_amount(new_data$Transaction_Amount)

  amount_breaks <- c(0, 100, 200, 300, Inf)
  amount_labels <- c('<100', '100-200', '200-300', '300+')

  new_data$Amount_Paid_Group <- cut(new_data$Amount_paid, breaks = amount_breaks, labels = amount_labels, include.lowest = TRUE)
  # NOTE(review): speed reuses the amount breaks/labels, matching training
  new_data$Vehicle_Speed_Group <- cut(new_data$Vehicle_Speed, breaks = amount_breaks, labels = amount_labels, include.lowest = TRUE)

  # Amount_paid / denominator; 0 where the denominator is NA or 0
  safe_ratio <- function(denom) {
    ifelse(is.na(denom) | denom == 0, 0, round(new_data$Amount_paid / denom, digits = 3))
  }
  new_data$Transaction_Amount_Ratio <- safe_ratio(new_data$Transaction_Amount)

  # Location split and calendar features
  new_data <- separate(new_data, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
  new_data$Date <- as.Date(new_data$Date, format = "%m/%d/%Y")
  new_data$weekend <- ifelse(weekdays(new_data$Date) %in% c("Saturday", "Sunday"), 1, 0)
  new_data$weekdays <- weekdays(new_data$Date)
  new_data$Time <- as.POSIXct(new_data$Time, format = "%H:%M")
  new_data$clock <- ifelse(hour(new_data$Time) >= 12 & hour(new_data$Time) < 24, 1, 0)

  # Drop everything the model was not trained on
  new_data <- new_data[, !colnames(new_data) %in% c("Time1", "Time", "Fraud_indicator", "Date", "Longitude", "Latitude", "Vehicle_Plate_Number", "Transaction_ID", "weekend", "weekdayscümə axşamı", "weekdaysçərşənbə axşamı", "weekdaysşənbə")]

  cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions", "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

  # model.matrix() needs at least two levels per factor; pad with a dummy level
  for (nm in cols_to_dummy) {
    if (length(unique(new_data[[nm]])) < 2) {
      new_data[[nm]] <- factor(new_data[[nm]], levels = c(unique(new_data[[nm]]), "dummy_level"))
    }
  }

  # One-hot encode and splice back into the frame
  dummy_frame <- new_data %>%
    select(all_of(cols_to_dummy)) %>%
    model.matrix(~ . - 1, data = .) %>%
    as.data.frame()

  new_data <- bind_cols(new_data %>% select(-all_of(cols_to_dummy)), dummy_frame)

  # Exact feature set (and order) the model was trained on.
  # NOTE(review): "Vehicle_TypeBus " intentionally keeps its trailing space and
  # the weekday names are Azerbaijani (locale-dependent weekdays() output) —
  # confirm these match the training machine's dummy columns.
  cols_to_add <- c(
    "Transaction_Amount", "Amount_paid", "Vehicle_Speed", "Transaction_Amount_Ratio",
    "clock", "Vehicle_TypeBus ", "Vehicle_TypeCar", "Vehicle_TypeMotorcycle",
    "Vehicle_TypeSedan", "Vehicle_TypeSuv", "Vehicle_TypeTruck", "Vehicle_TypeVan",
    "Lane_TypeRegular", "TollBoothIDB-102", "TollBoothIDC-103", "TollBoothIDD-104",
    "TollBoothIDD-105", "TollBoothIDD-106", "Vehicle_DimensionsMedium",
    "Vehicle_DimensionsSmall", "Transaction_Amount_Group100-200",
    "Transaction_Amount_Group200-300", "Transaction_Amount_Group300+",
    "Amount_Paid_Group100-200", "Amount_Paid_Group200-300", "Amount_Paid_Group300+",
    "Vehicle_Speed_Group100-200", "Vehicle_Speed_Group200-300", "Vehicle_Speed_Group300+",
    "weekdaysbazar ertəsi", "weekdayscümə", "weekdaysçərşənbə"
  )

  # Any training column absent from this batch is filled with 0
  missing_cols <- setdiff(cols_to_add, colnames(new_data))
  if (length(missing_cols) > 0) {
    new_data[, missing_cols] <- 0
  }

  # Select the input features in training order
  new_input <- new_data[, cols_to_add]

  # Probability -> 0/1 label at the 0.5 threshold
  probs <- predict(rf_model, new_input)
  binary_predictions <- ifelse(probs >= 0.5, 1, 0)

  return(binary_predictions)
}
|
517 |
+
|
518 |
+
|
519 |
+
|
520 |
+
# Example usage:
# Reload the persisted model from disk
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
rf_model <- load_model_function(model_file)

# A single raw transaction shaped like the original CSV columns
new_data <- data.frame(
  Transaction_ID = 1,
  Timestamp = "1/6/2023 11:20",
  Vehicle_Type = "Car",
  FastagID = "12345",
  TollBoothID = "A-101",
  Lane_Type = "Express",
  Vehicle_Dimensions = "Medium",
  Transaction_Amount = 150,
  Amount_paid = 110,
  Geographical_Location = "34.0522118, 40.7128",
  Vehicle_Speed = 60,
  Vehicle_Plate_Number = "ABC123"
)

# Classify the new transaction (1 = Fraud, 0 = Not Fraud)
predictions <- predict_with_model(rf_model, new_data)
print(predictions)

View(predictions)

# Columns of the model-ready frame exposed by model_deployment_function()
colnames(df_main)
|