riyadahmadov committed on
Commit
656aee0
1 Parent(s): 9dce5e4

Upload FastagFraudDetectionProject.R

Browse files
Files changed (1) hide show
  1. FastagFraudDetectionProject.R +546 -0
FastagFraudDetectionProject.R ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
############################### R Project #################################

######################### Fastag Fraud Detection ###########################

## 1) Descriptive analysis
## 2) Data Preprocessing
## 3) Data Visualization
## 4) Model Development
## 5) Create Model Pipeline
## 6) Model Deployment


## 1) Descriptive analysis

# Load the dataset (expects the CSV in the working directory)
df <- read.csv('FastagFraudDetection.csv')

# First rows of the data
head(df)

# Open the frame in the data viewer
View(df)

# Structure: column types with a preview of values
str(df)

# Per-column summary statistics
View(summary(df))

# Dimensions (rows and columns)
dim(df)
nrow(df)
ncol(df)

# Column names
names(df)
colnames(df)

# The target column, accessed three equivalent ways
tail(df[, 'Fraud_indicator'])
df[['Fraud_indicator']]
df$Fraud_indicator

# Vehicle-related columns
df[, c("Vehicle_Dimensions", "Vehicle_Speed", 'Vehicle_Plate_Number', 'Vehicle_Type')]

# First row only
df[1, ]

# First three rows
df[1:3, ]

# Rows where the vehicle type is bus
# NOTE(review): "Bus " carries a trailing space on purpose — the raw data
# stores the value that way (see the dummy column names used later).
head(df[df$Vehicle_Type == "Bus ", ], n = 2)

# Count of transactions above 300, via subset()
nrow(subset(df, Transaction_Amount > 300))

# Missing values per column
na_per_column <- colSums(is.na(df))
na_per_column

# No missing data in the frame
## 2) Data Preprocessing

# Load the libraries used below (%>%, separate() and str_to_title() all
# come from the tidyverse).  BUGFIX: this was left commented out, so the
# pipe and separate() calls below failed on a fresh session.
# install.packages("tidyverse")
library(tidyverse)

# NOTE(review): the original line
#   df[['Date','Time']] <- df$Timestamp.split()
# was Python syntax and is not valid R; the separate() call below performs
# the intended Timestamp split, so the broken line has been removed.

# Check for rows with infinite values.
# NOTE(review): apply() coerces the data frame to a character matrix, so
# is.infinite() is always FALSE here; kept for parity with the original
# intent, but a numeric-columns-only check would be more meaningful.
rows_with_inf <- apply(df, 1, function(row) any(is.infinite(row)))
View(df[rows_with_inf, ])

# Split Timestamp into Date and Time
df <- df %>%
  separate(Timestamp, into = c("Date", "Time"), sep = " ")

# Capitalize the first letter of each word in Vehicle_Type
df$Vehicle_Type <- str_to_title(df$Vehicle_Type)

# Drop the FastagID column
df <- df[, !colnames(df) %in% c('FastagID')]
# Bucket a numeric amount into labelled ranges.
# Returns one of '<100', '100-200', '200-300', '300+' per element;
# NA in gives NA out (ifelse propagates missing values).
group <- function(x) {
  ifelse(x >= 300, '300+',
         ifelse(x >= 200, '200-300',
                ifelse(x >= 100, '100-200', '<100')))
}
# Create a new column with transaction amount groups
df$Transaction_Amount_Group <- group(df$Transaction_Amount)


# SOLUTION 2: the same bucketing expressed with cut()

# Amount-paid groups
cut_points <- c(0, 100, 200, 300, Inf)
group_names <- c('<100', '100-200', '200-300', '300+')
df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = cut_points, labels = group_names, include.lowest = TRUE)

# Vehicle-speed groups
cut_points <- c(0, 30, 60, 80, 100, Inf)
group_names <- c('<30', '30-60', '60-80', '80-100', '100+')
df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = cut_points, labels = group_names, include.lowest = TRUE)

# Ratio of amount paid to transaction amount, guarded against NA and
# division by zero (both map to 0)
divide <- function(x) {
  ifelse(is.na(x) | x == 0,
         0,
         round(df$Amount_paid / x, digits = 3)
  )
}

df$Transaction_Amount_Ratio <- divide(df$Transaction_Amount)

# Split Geographical_Location into longitude and latitude
df <- df %>%
  separate(Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")

# Numeric fraud flag (1 = Fraud, 0 = Not Fraud) for calculations
df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)

# Current column set
colnames(df)

# Weekend flag (Saturday/Sunday = 1)
# NOTE(review): weekdays() output is locale-dependent; this comparison
# assumes an English locale — confirm before relying on it.
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)

# Day-of-week column
df$weekdays <- weekdays(df$Date)

# AM/PM flag (clock = 1 for 12:00-23:59).
# BUGFIX: the original called hour(df$Time) on the raw character column,
# which cannot be parsed; the POSIXct column Time1 was created for exactly
# this purpose, so it is used instead.
df$Time1 <- as.POSIXct(df$Time, format = "%H:%M")
df$clock <- ifelse(hour(df$Time1) >= 12 & hour(df$Time1) < 24, 1, 0)
df <- df[, !colnames(df) %in% c("Time1")]
## 3) Data Visualization

# • Scatter Plot
plot(df$Transaction_Amount, df$Vehicle_Speed, main = "Transaction Amount & Vehicle Speed", xlab = "Amount", ylab = "Speed", col = "darkgreen", pch = 1, cex = 1.2, font.main = 9)

# • Histogram
hist(df$Amount_paid, main = "Amount Paid Group", xlab = "Groups", ylab = "Frequency", col = "green", border = "blue", font.main = 3)

# • Bar Plot with value labels
# BUGFIX: `counts` was never defined before the text() call; compute it
# from the same frequency table the barplot draws.
counts <- table(df$Vehicle_Type)
barplot(counts, main = "Frequency of Vehicle Types", xlab = "Vehicle Type", ylab = "Frequency", col = "blue")
text(x = seq_along(counts), y = counts + 11, labels = counts, pos = 3, cex = 0.9, col = "black", xpd = TRUE)

# • Bar Plot: mean fraud rate per lane type
lane_type_avg <- aggregate(Fraud_Number ~ Lane_Type, data = df, mean)
barplot(lane_type_avg$Fraud_Number, main = "Frequency of Fraud by Lane Type", xlab = "Lane Type", ylab = "Frequency", col = "blue", names.arg = lane_type_avg$Lane_Type)

# • Box Plots
boxplot(df$Transaction_Amount, df$Amount_paid, names = c("Transaction Amount", "Amount Paid"), main = "Transaction Amount & Amount Paid", col = c("blue", "green"), border = "black", font.main = 3)
boxplot(df$Vehicle_Speed, names = c("Speed"), main = "Vehicle Speed", col = c("green"), border = "black", font.main = 3)
boxplot(df$Transaction_Amount_Ratio, names = c("Ratio"), main = "Transaction Amount Ratio", col = c("red"), border = "blue", font.main = 3)

# • Pie Chart
pie(table(df$Transaction_Amount_Group), labels = levels(factor(df$Transaction_Amount_Group)), main = "Transaction Amount Group", col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))), border = "darkred", font.main = 4)

# • 3D Pie Chart
# BUGFIX: pie3D() comes from plotrix, which was left commented out.
# install.packages("plotrix")
library("plotrix")
pie3D(table(df$Transaction_Amount_Group), labels = levels(factor(df$Transaction_Amount_Group)), main = "Transaction Amount Group", col = rainbow(length(levels(factor(df$Transaction_Amount_Group)))), border = "darkred", font.main = 4)

# • Line Plot: daily transaction totals
library("tidyverse")
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
daily_sum <- aggregate(Transaction_Amount ~ Date, data = df, sum)
plot(daily_sum$Date, daily_sum$Transaction_Amount, type = "l",
     main = "Daily Sum of Transaction Amount",
     xlab = "Date", ylab = "Sum of Transaction Amount",
     col = "red", lwd = 2, font.main = 2)

# Monthly averages
# NOTE(review): format(..., "%B") is locale-dependent; the Azerbaijani
# month names below only match if the session locale is Azerbaijani —
# confirm, otherwise every factor level comes back NA.
df$MonthName <- format(df$Date, "%B")
monthly_avg <- aggregate(Transaction_Amount ~ MonthName, data = df, mean)

monthly_avg$MonthName <- factor(monthly_avg$MonthName,
                                levels = c("Yanvar", "Fevral", "Mart", "Aprel", "May", "İyun",
                                           "İyul", "Avqust", "Sentyabr", "Oktyabr", "Noyabr", "Dekabr"),
                                ordered = TRUE)

# BUGFIX: titles/labels were copy-pasted from the daily plot; this one
# shows the monthly average.
plot(monthly_avg$MonthName, monthly_avg$Transaction_Amount, type = "l",
     main = "Monthly Average of Transaction Amount",
     xlab = "Month", ylab = "Average Transaction Amount",
     col = "red", lwd = 2, font.main = 2)

# • Scatter with ggplot2
library(ggplot2)
# BUGFIX(style): reference columns bare inside aes(), not via df$ — using
# df$ inside aes() bypasses the data argument and breaks faceting.
ggplot(df, aes(x = Transaction_ID, y = Transaction_Amount)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Transaction Id & Amount", x = "Transaction Id", y = "Amount") +
  theme_bw()

# • Bar plot with ggplot2: average amount paid per vehicle type
vehicle_type_avg <- aggregate(Amount_paid ~ Vehicle_Type, data = df, mean)

ggplot(vehicle_type_avg, aes(x = Vehicle_Type, y = Amount_paid)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  labs(title = "Average Amount Paid by Vehicle Type", x = "Vehicle Type", y = "Average Amount Paid") +
  theme_bw()


# Transaction counts by longitude and latitude
transaction_counts <- aggregate(Transaction_ID ~ Longitude + Latitude, data = df, FUN = length)

# Map plotting
# NOTE(review): install.packages() should not run on every script
# execution; install once interactively instead.
# install.packages("maps")
library(maps)

world <- map_data("world")
transaction_counts$Longitude <- as.numeric(as.character(transaction_counts$Longitude))
transaction_counts$Latitude <- as.numeric(as.character(transaction_counts$Latitude))

# Plot the map
ggplot() +
  geom_polygon(data = world, aes(x = long, y = lat, group = group), fill = "lightgray", color = "black") +
  geom_point(data = transaction_counts, aes(x = Longitude, y = Latitude, size = Transaction_ID), color = "red") +
  labs(title = "Transaction Count by Location", x = "Longitude", y = "Latitude", size = "Transaction Count") +
  theme_minimal()
## 4) Model Development

# Needs analysis: analyzing Fastag fraud involves identifying patterns and
# trends in fraudulent transactions to enhance system security and user
# trust.  By leveraging data analytics and machine learning, businesses
# can predict and prevent future fraudulent activities, mitigating
# financial losses and preserving the integrity of the Fastag system and
# efficient toll collection.

# Drop identifier/leakage columns before modelling
df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID")]
# df <- df[,!colnames(df) %in% c('weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]

# One-hot encode the categorical columns
library(dplyr)
cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
                   "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

# Creating dummy variables
df_dummies <- df %>%
  select(all_of(cols_to_dummy)) %>%
  model.matrix(~ . - 1, data = .) %>%
  as.data.frame()

# Replace the original categorical columns with their dummies
df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), df_dummies)

# install.packages("randomForest")
library(randomForest)
library(caret)

# Reproducible 80/20 train/test split
set.seed(123)
trainIndex <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
trainData <- df[trainIndex, ]
testData <- df[-trainIndex, ]

# Separate inputs and target for training
trainInput <- trainData[, !colnames(trainData) %in% c('Fraud_Number')]
trainTarget <- trainData$Fraud_Number

# Random forest regression on the 0/1 target (predictions are scores)
rf_model <- randomForest(trainInput, trainTarget, ntree = 100, mtry = 3, importance = TRUE)

# Predict on the test set and threshold the scores at 0.5
testInput <- testData[, !colnames(testData) %in% c('Fraud_Number')]
testTarget <- testData$Fraud_Number
predictions <- predict(rf_model, testInput)
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)

# Evaluate model performance with a confusion matrix
binary_predictions <- factor(binary_predictions, levels = c(0, 1))
testTarget <- factor(testTarget, levels = c(0, 1))
conf_matrix <- confusionMatrix(binary_predictions, testTarget)

# Variable importance
importance(rf_model)
varImpPlot(rf_model)

# Drop low-importance columns
df <- df[, !colnames(df) %in% c('weekend', "weekdayscümə axşamı", 'weekdaysçərşənbə axşamı', 'weekdaysşənbə')]
# -- the model could be retrained here on the reduced frame


# Predict a single new observation.
# BUGFIX: use distinct names here — the original overwrote `predictions`
# with this single-row score, which broke the ROC computation below.
new_data <- testInput[1, ]
View(new_data)
single_prediction <- predict(rf_model, new_data)
single_binary <- ifelse(single_prediction >= 0.5, 1, 0)
single_prediction

# BUGFIX: pROC was left commented out although roc()/auc() are used.
# install.packages("pROC")
library(pROC)

# ROC curve and AUC over the full test-set score vector
test_scores <- predict(rf_model, testInput)
roc_obj <- roc(testData$Fraud_Number, test_scores)
auc_value <- auc(roc_obj)

roc_df <- data.frame(
  tpr = roc_obj$sensitivities,
  fpr = 1 - roc_obj$specificities,
  thresholds = roc_obj$thresholds
)

ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line(color = "blue") +
  geom_abline(linetype = "dashed", color = "red") +
  labs(title = paste("ROC Curve (AUC =", round(auc_value, 3), ")"),
       x = "False Positive Rate",
       y = "True Positive Rate") +
  theme_minimal()
## 5) Create Model Pipeline

# Function 1:
# Reproduce the ad-hoc preprocessing from section 2 as a reusable step.
# Takes the raw frame and returns it with all engineered columns added.
data_preprocessing_function <- function(df){
  # Split "date time" timestamps and tidy the vehicle-type casing
  df <- separate(df, Timestamp, into = c("Date", "Time"), sep = " ")
  df$Vehicle_Type <- str_to_title(df$Vehicle_Type)
  df <- df[, !colnames(df) %in% c('FastagID')]

  # Bucket the transaction amount into labelled ranges
  group <- function(x) {
    ifelse(x < 100, '<100',
           ifelse(x < 200, '100-200',
                  ifelse(x < 300, '200-300', '300+')))
  }
  df$Transaction_Amount_Group <- group(df$Transaction_Amount)

  # Amount-paid buckets via cut()
  cut_points <- c(0, 100, 200, 300, Inf)
  group_names <- c('<100', '100-200', '200-300', '300+')
  df$Amount_Paid_Group <- cut(df$Amount_paid, breaks = cut_points, labels = group_names, include.lowest = TRUE)

  # NOTE(review): unlike the exploratory script (which bucketed speed with
  # breaks 0/30/60/80/100), this pipeline buckets speed with the *amount*
  # breaks.  The trained model's dummy columns depend on these labels, so
  # the behavior is kept as-is — confirm before changing.
  cut_points <- c(0, 100, 200, 300, Inf)
  group_names <- c('<100', '100-200', '200-300', '300+')
  df$Vehicle_Speed_Group <- cut(df$Vehicle_Speed, breaks = cut_points, labels = group_names, include.lowest = TRUE)

  # Paid/charged ratio; NA or zero denominators map to 0
  divide <- function(x) {
    ifelse(is.na(x) | x == 0,
           0,
           round(df$Amount_paid / x, digits = 3)
    )
  }
  df$Transaction_Amount_Ratio <- divide(df$Transaction_Amount)

  # Coordinates, numeric fraud flag and calendar features
  df <- separate(df, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
  df$Fraud_Number <- ifelse(df$Fraud_indicator == 'Fraud', 1, 0)
  df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
  df$weekend <- ifelse(weekdays(df$Date) %in% c("Saturday", "Sunday"), 1, 0)
  df$weekdays <- weekdays(df$Date)

  # Parse the time and flag PM hours (clock = 1 for 12:00-23:59)
  df$Time <- as.POSIXct(df$Time, format = "%H:%M")
  df$clock <- ifelse(hour(df$Time) >= 12 & hour(df$Time) < 24, 1, 0)
  df <- df[, !colnames(df) %in% c("Time1")]  # no-op: Time1 is never created here

  return(df)
}
# Function 2:
# Train, evaluate and persist the random forest model.  Side effects:
# prints the confusion matrix, writes the fitted model to `model_file`,
# and publishes the fully encoded frame as the global `df_main` (inspected
# at the bottom of this script).  Returns the encoded frame.
model_deployment_function <- function(df, model_file){
  # Drop leakage-prone and low-importance columns
  df <- df[, !colnames(df) %in% c("Time","Fraud_indicator","Date","Longitude","Latitude","Vehicle_Plate_Number","Transaction_ID",'weekend',"weekdayscümə axşamı",'weekdaysçərşənbə axşamı','weekdaysşənbə')]

  # One-hot encode the categoricals
  cols_to_dummy <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions",
                     "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

  df_dummies <- df %>%
    select(all_of(cols_to_dummy)) %>%
    model.matrix(~ . - 1, data = .) %>%
    as.data.frame()

  df <- bind_cols(df %>% select(-all_of(cols_to_dummy)), df_dummies)
  df_main <<- df  # export for post-hoc inspection (deliberate global side effect)

  # Reproducible 80/20 split
  set.seed(123)
  split_idx <- createDataPartition(df$Fraud_Number, p = 0.8, list = FALSE)
  train_set <- df[split_idx, ]
  test_set <- df[-split_idx, ]

  train_x <- train_set[, !colnames(train_set) %in% c('Fraud_Number')]
  train_y <- train_set$Fraud_Number

  # Random forest regression on the 0/1 target
  rf_model <- randomForest(train_x, train_y, ntree = 100, mtry = 3, importance = TRUE)

  test_x <- test_set[, !colnames(test_set) %in% c('Fraud_Number')]
  test_y <- test_set$Fraud_Number

  # Score the test set and threshold at 0.5
  scores <- predict(rf_model, test_x)
  predicted_class <- ifelse(scores >= 0.5, 1, 0)

  # Evaluate: confusion matrix over the 0/1 factors
  predicted_class <- factor(predicted_class, levels = c(0, 1))
  test_y <- factor(test_y, levels = c(0, 1))
  conf_matrix <- confusionMatrix(predicted_class, test_y)
  print(conf_matrix)

  # Persist the fitted model
  saveRDS(rf_model, model_file)

  return(df)
}
# Apply the preprocessing pipeline to the raw dataframe
df_processed <- data_preprocessing_function(df)

# Train the model and save it to disk as an .rds file
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
processed_df <- model_deployment_function(df_processed, model_file)
## 6) Model Deployment

# Load a previously saved random forest model from an .rds file.
load_model_function <- function(model_file) {
  readRDS(model_file)
}
# Score fresh raw rows with the saved model.
# Re-runs the training-time feature engineering on `new_data`, aligns the
# dummy columns with the model's training design matrix, and returns 0/1
# predictions (score >= 0.5 -> 1).
predict_with_model <- function(rf_model, new_data) {
  # --- Feature engineering (mirrors data_preprocessing_function) ---
  new_data <- separate(new_data, Timestamp, into = c("Date", "Time"), sep = " ")
  new_data$Vehicle_Type <- str_to_title(new_data$Vehicle_Type)
  new_data <- new_data[, !colnames(new_data) %in% c('FastagID')]

  # Transaction-amount buckets
  bucket <- function(x) {
    ifelse(x < 100, '<100',
           ifelse(x < 200, '100-200',
                  ifelse(x < 300, '200-300', '300+')))
  }
  new_data$Transaction_Amount_Group <- bucket(new_data$Transaction_Amount)

  # NOTE(review): both groups below intentionally share the same breaks —
  # the model's dummy columns were built this way at training time.
  break_points <- c(0, 100, 200, 300, Inf)
  bucket_labels <- c('<100', '100-200', '200-300', '300+')

  new_data$Amount_Paid_Group <- cut(new_data$Amount_paid, breaks = break_points, labels = bucket_labels, include.lowest = TRUE)
  new_data$Vehicle_Speed_Group <- cut(new_data$Vehicle_Speed, breaks = break_points, labels = bucket_labels, include.lowest = TRUE)

  # Paid/charged ratio; NA or zero denominators become 0
  amount <- new_data$Transaction_Amount
  new_data$Transaction_Amount_Ratio <- ifelse(is.na(amount) | amount == 0, 0, round(new_data$Amount_paid / amount, digits = 3))

  # Coordinates and calendar/time features
  new_data <- separate(new_data, Geographical_Location, into = c("Longitude", "Latitude"), sep = " ")
  new_data$Date <- as.Date(new_data$Date, format = "%m/%d/%Y")
  new_data$weekend <- ifelse(weekdays(new_data$Date) %in% c("Saturday", "Sunday"), 1, 0)
  new_data$weekdays <- weekdays(new_data$Date)
  new_data$Time <- as.POSIXct(new_data$Time, format = "%H:%M")
  new_data$clock <- ifelse(hour(new_data$Time) >= 12 & hour(new_data$Time) < 24, 1, 0)

  # Drop everything the model never sees
  new_data <- new_data[, !colnames(new_data) %in% c("Time1", "Time", "Fraud_indicator", "Date", "Longitude", "Latitude", "Vehicle_Plate_Number", "Transaction_ID", "weekend", "weekdayscümə axşamı", "weekdaysçərşənbə axşamı", "weekdaysşənbə")]

  # --- One-hot encoding, aligned with the training design matrix ---
  dummy_cols <- c("Vehicle_Type", "Lane_Type", "TollBoothID", "Vehicle_Dimensions", "Transaction_Amount_Group", "Amount_Paid_Group", "Vehicle_Speed_Group", "weekdays")

  # model.matrix() fails on single-level factors, so pad any categorical
  # column with fewer than two observed levels with a placeholder level
  for (col in dummy_cols) {
    if (length(unique(new_data[[col]])) < 2) {
      new_data[[col]] <- factor(new_data[[col]], levels = c(unique(new_data[[col]]), "dummy_level"))
    }
  }

  dummy_frame <- new_data %>%
    select(all_of(dummy_cols)) %>%
    model.matrix(~ . - 1, data = .) %>%
    as.data.frame()

  new_data <- bind_cols(new_data %>% select(-all_of(dummy_cols)), dummy_frame)

  # Exact feature set (and order) the model was trained on; the trailing
  # space in "Vehicle_TypeBus " comes from the raw data
  expected_cols <- c(
    "Transaction_Amount", "Amount_paid", "Vehicle_Speed", "Transaction_Amount_Ratio",
    "clock", "Vehicle_TypeBus ", "Vehicle_TypeCar", "Vehicle_TypeMotorcycle",
    "Vehicle_TypeSedan", "Vehicle_TypeSuv", "Vehicle_TypeTruck", "Vehicle_TypeVan",
    "Lane_TypeRegular", "TollBoothIDB-102", "TollBoothIDC-103", "TollBoothIDD-104",
    "TollBoothIDD-105", "TollBoothIDD-106", "Vehicle_DimensionsMedium",
    "Vehicle_DimensionsSmall", "Transaction_Amount_Group100-200",
    "Transaction_Amount_Group200-300", "Transaction_Amount_Group300+",
    "Amount_Paid_Group100-200", "Amount_Paid_Group200-300", "Amount_Paid_Group300+",
    "Vehicle_Speed_Group100-200", "Vehicle_Speed_Group200-300", "Vehicle_Speed_Group300+",
    "weekdaysbazar ertəsi", "weekdayscümə", "weekdaysçərşənbə"
  )

  # Any dummy level absent from this batch becomes an all-zero column
  absent <- setdiff(expected_cols, colnames(new_data))
  if (length(absent) > 0) {
    new_data[, absent] <- 0
  }

  # Select input features in the expected order
  model_input <- new_data[, expected_cols]

  # Score and threshold at 0.5
  scores <- predict(rf_model, model_input)
  ifelse(scores >= 0.5, 1, 0)
}
# Example usage:
model_file <- "C:/Users/HP/OneDrive/İş masası/R Programming/rf_model.rds"
rf_model <- load_model_function(model_file)

# A single raw example row, shaped like the source CSV
new_data <- data.frame(
  Transaction_ID = 1,
  Timestamp = "1/6/2023 11:20",
  Vehicle_Type = "Car",
  FastagID = "12345",
  TollBoothID = "A-101",
  Lane_Type = "Express",
  Vehicle_Dimensions = "Medium",
  Transaction_Amount = 150,
  Amount_paid = 110,
  Geographical_Location = "34.0522118, 40.7128",
  Vehicle_Speed = 60,
  Vehicle_Plate_Number = "ABC123"
)

# Score the example row and show the 0/1 result
predictions <- predict_with_model(rf_model, new_data)
print(predictions)

View(predictions)

# Columns of the encoded frame published by model_deployment_function()
colnames(df_main)