add OTS project
Browse files- .DS_Store +0 -0
- BERT/.DS_Store +0 -0
- BERT/tests/stressTest_500.py +58 -0
- BERT/tests/test_sms.py +57 -0
- BERT/training/.DS_Store +0 -0
- BERT/training/bert_sms_spam_phishing_model/config.json +37 -0
- BERT/training/bert_sms_spam_phishing_model/model.safetensors +3 -0
- BERT/training/phishing_urls.xlsx +0 -0
- BERT/training/train.py +126 -0
- FastText/.DS_Store +0 -0
- FastText/tests/stressTest_500.py +38 -0
- FastText/training/ots_sms_model_v1.1.bin +3 -0
- FastText/training/test_sms.py +53 -0
- FastText/training/train.py +23 -0
- README.md +83 -3
- api-interface/README.md +83 -0
- api-interface/app.py +139 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
BERT/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
BERT/tests/stressTest_500.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import BertTokenizer, BertForSequenceClassification
|
3 |
+
import time
|
4 |
+
import random
|
5 |
+
|
6 |
+
def load_model(model_path):
    """Load the fine-tuned three-class BERT classifier from `model_path`.

    The model is requested with `local_files_only=True` and `num_labels=3`.
    """
    return BertForSequenceClassification.from_pretrained(
        model_path,
        num_labels=3,
        local_files_only=True,
    )
|
9 |
+
|
10 |
+
def preprocess_text(text, tokenizer, max_len=128):
    """Tokenize one message into fixed-length PyTorch tensors.

    The tokenizer is asked to pad/truncate to `max_len`, to return an
    attention mask, and to leave out token type ids.
    """
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'return_token_type_ids': False,
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_tensors': 'pt',
        'truncation': True,
    }
    return tokenizer.encode_plus(text, **tokenizer_options)
|
22 |
+
|
23 |
+
def predict(text, model, tokenizer):
    """Classify a single message and return the predicted class index.

    The model is switched to evaluation mode and run without gradient
    tracking; the result is the argmax over the output logits as an int.
    """
    model.eval()
    with torch.no_grad():
        encoded = preprocess_text(text, tokenizer)
        logits = model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        ).logits
        return torch.argmax(logits, dim=1).item()
|
32 |
+
|
33 |
+
def generate_random_text(base_text, index):
    """Build a sample message from `base_text`, an index and a random number.

    The random part is an integer drawn from 1..10000 (inclusive).
    """
    random_suffix = random.randint(1, 10000)
    pieces = (f"{base_text}", f"Message {index}", f"Random {random_suffix}")
    return " - ".join(pieces)
|
35 |
+
|
36 |
+
def main(model_path=None, num_messages=500):
    """Run a sequential inference stress test against the local BERT model.

    Args:
        model_path: Directory holding the fine-tuned model. Defaults to
            ``../training/bert_sms_spam_phishing_model`` relative to this
            file, so the script does not depend on one developer's home
            directory layout.
        num_messages: Number of synthetic messages to classify.
    """
    if model_path is None:
        # Local import: pathlib is not among this file's top-level imports.
        from pathlib import Path
        model_path = str(
            Path(__file__).resolve().parent.parent
            / 'training' / 'bert_sms_spam_phishing_model'
        )

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = load_model(model_path)

    # Generate unique sample texts
    base_text = "Sample SMS text"
    sample_texts = [generate_random_text(base_text, i) for i in range(num_messages)]

    # Stress test with progress logging. perf_counter is monotonic, so the
    # measured duration is not affected by wall-clock adjustments.
    start_time = time.perf_counter()

    for processed, text in enumerate(sample_texts, start=1):
        predict(text, model, tokenizer)
        if processed % 50 == 0:
            print(f"Processed {processed} messages...")

    total_time = time.perf_counter() - start_time
    # Report the real count rather than a hard-coded 500.
    print(f"Processed {len(sample_texts)} messages in {total_time:.2f} seconds")
|
56 |
+
|
57 |
+
if __name__ == '__main__':
|
58 |
+
main()
|
BERT/tests/test_sms.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import BertTokenizer, BertForSequenceClassification
|
3 |
+
import time
|
4 |
+
|
5 |
+
def load_model(model_path):
    """Load the fine-tuned three-class BERT classifier from `model_path`.

    The model is requested with `local_files_only=True` and `num_labels=3`.
    """
    return BertForSequenceClassification.from_pretrained(
        model_path,
        num_labels=3,
        local_files_only=True,
    )
|
8 |
+
|
9 |
+
def preprocess_text(text, tokenizer, max_len=128):
    """Tokenize one message into fixed-length PyTorch tensors.

    The tokenizer is asked to pad/truncate to `max_len`, to return an
    attention mask, and to leave out token type ids.
    """
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'return_token_type_ids': False,
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_tensors': 'pt',
        'truncation': True,
    }
    return tokenizer.encode_plus(text, **tokenizer_options)
|
21 |
+
|
22 |
+
def predict(text, model, tokenizer):
    """Classify one message and measure how long the classification took.

    Returns:
        A ``(prediction, processing_time)`` tuple: the argmax class index as
        an int, and the elapsed seconds covering tokenization and the
        forward pass.
    """
    # perf_counter is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted, which distorts short measurements.
    start_time = time.perf_counter()
    model.eval()
    with torch.no_grad():
        inputs = preprocess_text(text, tokenizer)
        outputs = model(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        prediction = torch.argmax(outputs.logits, dim=1).item()
    processing_time = time.perf_counter() - start_time
    return prediction, processing_time
|
34 |
+
|
35 |
+
def main(model_path=None):
    """Classify one sample SMS with the local BERT model and report timing.

    Args:
        model_path: Directory holding the fine-tuned model. Defaults to
            ``../training/bert_sms_spam_phishing_model`` relative to this
            file, so the script does not depend on one developer's home
            directory layout.
    """
    if model_path is None:
        # Local import: pathlib is not among this file's top-level imports.
        from pathlib import Path
        model_path = str(
            Path(__file__).resolve().parent.parent
            / 'training' / 'bert_sms_spam_phishing_model'
        )

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Load the trained model
    model = load_model(model_path)

    # Sample text to classify
    sample_text = "Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's, Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's, Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 3838329832983092823098320983209823902389028239038329083290. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's"

    # Get prediction and processing time
    prediction, processing_time = predict(sample_text, model, tokenizer)

    # Convert numerical prediction back to label
    label_map = {0: 'ham', 1: 'spam', 2: 'phishing'}
    print(f"The provided text is predicted as: {label_map[prediction]}")

    # Determine the emoji based on processing time (0.2 s threshold)
    emoji = "😊" if processing_time <= 0.2 else "😔"
    print(f"Processing time: {processing_time:.2f} seconds {emoji}")
|
55 |
+
|
56 |
+
if __name__ == '__main__':
|
57 |
+
main()
|
BERT/training/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
BERT/training/bert_sms_spam_phishing_model/config.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "LABEL_0",
|
14 |
+
"1": "LABEL_1",
|
15 |
+
"2": "LABEL_2"
|
16 |
+
},
|
17 |
+
"initializer_range": 0.02,
|
18 |
+
"intermediate_size": 3072,
|
19 |
+
"label2id": {
|
20 |
+
"LABEL_0": 0,
|
21 |
+
"LABEL_1": 1,
|
22 |
+
"LABEL_2": 2
|
23 |
+
},
|
24 |
+
"layer_norm_eps": 1e-12,
|
25 |
+
"max_position_embeddings": 512,
|
26 |
+
"model_type": "bert",
|
27 |
+
"num_attention_heads": 12,
|
28 |
+
"num_hidden_layers": 12,
|
29 |
+
"pad_token_id": 0,
|
30 |
+
"position_embedding_type": "absolute",
|
31 |
+
"problem_type": "single_label_classification",
|
32 |
+
"torch_dtype": "float32",
|
33 |
+
"transformers_version": "4.36.2",
|
34 |
+
"type_vocab_size": 2,
|
35 |
+
"use_cache": true,
|
36 |
+
"vocab_size": 30522
|
37 |
+
}
|
BERT/training/bert_sms_spam_phishing_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a5868743b6b8725be1192dfaba15ff7677d5a70525bbf83443ce5001f893390
|
3 |
+
size 437961724
|
BERT/training/phishing_urls.xlsx
ADDED
Binary file (30.6 kB). View file
|
|
BERT/training/train.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.utils.data import DataLoader, Dataset
|
3 |
+
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
from sklearn.metrics import accuracy_score
|
6 |
+
import pandas as pd
|
7 |
+
import chardet
|
8 |
+
|
9 |
+
# Dataset Class
|
10 |
+
class TextDataset(Dataset):
    """Dataset that tokenizes (text, label) pairs on access."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the sample at position `item` as a dict.

        Keys: the raw ``text``, the flattened ``input_ids`` and
        ``attention_mask`` tensors, and ``labels`` as a long tensor.
        """
        raw_text = str(self.texts[item])
        target = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {'text': raw_text}
        # The tokenizer output is flattened to 1-D before it is returned.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample
|
41 |
+
|
42 |
+
# Data Loader Function
|
43 |
+
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a DataFrame's ``text`` and ``label`` columns in a DataLoader.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        tokenizer: Tokenizer handed to ``TextDataset``.
        max_len: Token length every sample is padded/truncated to.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the samples every epoch. Off by default, which
            matches the previous behaviour; training loaders usually want
            this enabled.
        num_workers: Number of loader worker processes (previously fixed
            at 4).

    Returns:
        A ``DataLoader`` over a ``TextDataset`` built from `df`.
    """
    ds = TextDataset(
        texts=df.text.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
|
52 |
+
|
53 |
+
def main():
    """Fine-tune ``bert-base-uncased`` on the SMS dataset and save the model.

    Reads ``sms_spam_phishing_dataset.csv`` (columns ``text`` and ``label``)
    from the working directory, trains for a fixed number of epochs, prints
    the accuracy on a 10% hold-out split and saves the model to
    ``bert_sms_spam_phishing_model``.

    Raises:
        ValueError: If the dataset contains a label other than ``ham``,
            ``spam`` or ``phishing`` (including missing labels).
    """
    dataset_path = 'sms_spam_phishing_dataset.csv'

    # Detect encoding
    with open(dataset_path, 'rb') as f:
        result = chardet.detect(f.read())
    file_encoding = result['encoding']

    print("Detected encoding:", file_encoding)

    # Load Dataset
    df = pd.read_csv(dataset_path, encoding=file_encoding)

    # Convert labels to numerical. Series.map yields NaN for any value that
    # is not a key of the mapping, and NaN cannot be turned into a valid
    # class index later, so unknown labels are rejected here with a clear
    # message instead of failing deep inside the training loop.
    label_ids = {'ham': 0, 'spam': 1, 'phishing': 2}
    raw_labels = df['label']
    mapped_labels = raw_labels.map(label_ids)
    unknown_labels = raw_labels[mapped_labels.isna()].unique().tolist()
    if unknown_labels:
        raise ValueError(
            f"Unsupported label values in {dataset_path}: {unknown_labels}; "
            f"expected one of {sorted(label_ids)}"
        )
    df['label'] = mapped_labels.astype('int64')

    # Parameters
    BATCH_SIZE = 16
    MAX_LEN = 128
    EPOCHS = 3

    # Split Data
    train_df, test_df = train_test_split(df, test_size=0.1)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create Data Loaders
    train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

    # Load BERT Model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Training Loop
    for epoch in range(EPOCHS):
        model.train()
        for batch in train_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model return the loss alongside logits.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch + 1}/{EPOCHS} completed.")

    # Evaluate
    model.eval()
    predictions, true_labels = [], []
    for batch in test_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

    # Save the Model
    model.save_pretrained('bert_sms_spam_phishing_model')
|
124 |
+
|
125 |
+
if __name__ == '__main__':
|
126 |
+
main()
|
FastText/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
FastText/tests/stressTest_500.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import concurrent.futures
|
3 |
+
import time
|
4 |
+
|
5 |
+
# API endpoint and headers
|
6 |
+
# The prediction API in this project listens on port 8001 and its /predict/
# endpoint requires a "model" field ("bert" or "fasttext") next to "text";
# without it every request is rejected before reaching a model.
url = 'http://localhost:8001/predict/'
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}
data = '{"text":"Hello World","model":"fasttext"}'
|
12 |
+
|
13 |
+
# Function to send a single API request
|
14 |
+
def send_request(timeout=30):
    """POST the sample payload once and report the outcome.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker
            thread, and with it the whole stress test, indefinitely.

    Returns:
        The HTTP status code if a response arrived, otherwise the error
        message of the ``requests`` exception as a string.
    """
    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.exceptions.RequestException as e:
        return str(e)
    return response.status_code
|
20 |
+
|
21 |
+
# Function to send 500 requests and measure the time taken
|
22 |
+
def send_500_requests_and_measure_time(num_requests=500):
    """Fire `num_requests` concurrent requests and report time and outcomes.

    Args:
        num_requests: How many requests to submit (500 by default).

    Prints the elapsed time followed by a tally of the values returned by
    ``send_request`` (status codes or error messages), so failed requests
    are visible instead of being silently counted as completed.
    """
    # Local import: collections is not among this file's top-level imports.
    from collections import Counter

    # perf_counter is monotonic, so the duration is immune to clock changes.
    start_time = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(send_request) for _ in range(num_requests)]
        # as_completed yields each future once it has finished.
        outcomes = Counter(
            future.result()
            for future in concurrent.futures.as_completed(futures)
        )

    duration = time.perf_counter() - start_time
    print(f"Completed sending {num_requests} requests in {duration:.2f} seconds")
    for outcome, count in outcomes.most_common():
        print(f"  {outcome}: {count}")
|
35 |
+
|
36 |
+
# Send 500 requests and measure time
|
37 |
+
send_500_requests_and_measure_time()
|
38 |
+
|
FastText/training/ots_sms_model_v1.1.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed771b7699999fdc20c9cac7aa3f8dfd29b409ea826925f3a715e9c1a37b6abd
|
3 |
+
size 810920792
|
FastText/training/test_sms.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fasttext
|
2 |
+
|
3 |
+
# Load the trained model
|
4 |
+
model = fasttext.load_model('ots_sms_model_v1.1.bin')

# Example SMS message
# Exactly one `message` assignment below should be active at a time: a later
# assignment overwrites an earlier one, so only the last active example is
# ever classified.

# Phishing Examples
# message = "URGENT: Your bank account has been compromised. To secure your funds, click here immediately: http://bit.ly/2FAKEurl"
# message = "Warning: Your email has been flagged for unusual activity. Confirm your credentials now at http://bit.ly/com32323 to avoid suspension."
# message = "Paket im Terminal 2: http://different.com/7DH5000022" ## phishing

# Verification Code Examples
# message = "Google Verification Code: 452183. Do not share this code with anyone."
# message = "Your WhatsApp code: 729-113. Don't share this code with others."

# Spam Examples
# message = "Congratulations! You've won a $500 Amazon gift card. Claim now at www.win-gifts.com. Hurry, offer ends soon!"
# message = "Exclusive Offer: Get 90% off on your next purchase at SuperShoes. Visit www.someshoes.com/offer10. Unsubscribe at stopDIA.com"

# Regular Communication (Ham) Examples
# message = "Hey, are we still on for dinner tonight at 7?"
# message = "Can you send me the presentation slides? I'd like to review them before the meeting."

# More Phishing Examples
# message = "Final Notice: Your subscription with NetMedia is about to expire. Renew now to avoid service interruption: www.netmedia-renewal.com"
# message = "Your package could not be delivered due to unpaid customs fee. Pay now at www.other_domain.com/438484sjdjdjdsjjdsjdsjdjsjdsjdjsdjs"

# More Spam Examples
# message = "Hot Summers Sale! Buy one get one free on all beachwear at TrendyStyles. Shop now at www.trendystyles.com/sale"
# message = "Get a free cell data booster with every new phone plan at MobileNet. Call us at 800-555-0199 or visit www.mobilenet.com"

# More Verification Code Examples
# message = "Your Tinder verification code is 394857. Please enter this code to continue."
# message = "Dropbox: Your security code is 842159. Enter this code to complete the setup."

# More Regular Communication (Ham) Examples
# message = "Reminder: Your dentist appointment is scheduled for tomorrow at 10 AM."
# message = "Great meeting today! Let's catch up next week to discuss further steps."

# Test data that the model has never seen before, of type phishing

# message = "ALERT: Unusual sign-in detected on your Netflix account. Verify immediately at http://netflix-verify-login.com to prevent suspension."
# message = "Your PayPal account has been temporarily restricted. Please update your information at http://paypal-secure-update.com to restore access."
# message = "Your Apple ID is due for verification. Failure to verify may lead to account suspension. Visit http://appleids-verify-n0w.com promptly."
message = "Security Alert: We've detected unusual activity on your Apple account. Please verify your information immediately at http://apple-resetpassword.com to avoid suspension."


# Predict the label
label, probability = model.predict(message)

print(f"Label: {label[0]}, Probability: {probability[0]}")
|
FastText/training/train.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fasttext
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
|
5 |
+
# Try different encodings if UTF-8 does not work
|
6 |
+
try:
    data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='ISO-8859-1')  # Try latin1 encoding


def _to_fasttext_line(row):
    """Format one dataset row as a single fastText training line."""
    # The training file holds one example per line, so any line break inside
    # a message is collapsed to a space (whitespace runs become one space).
    message = ' '.join(str(row["Message"]).split())
    return f'__label__{row["Label"]} {message}'


# Preprocess data: format as fastText expects (each line: "__label__<label> <text>")
data['ft_format'] = data.apply(_to_fasttext_line, axis=1)

# Save preprocessed data. The lines are written verbatim instead of going
# through DataFrame.to_csv: CSV quoting wraps every message that contains a
# comma or a double quote in quotes, so such a line would start with a quote
# character rather than the "__label__" prefix.
with open('ft_data.txt', 'w', encoding='utf-8') as ft_file:
    ft_file.writelines(f'{line}\n' for line in data['ft_format'])

# Train a supervised model
model = fasttext.train_supervised(input='ft_data.txt', epoch=25, lr=1.0, wordNgrams=2)

# Save the model
model.save_model('ots_sms_model_v1.1.bin')
|
23 |
+
|
README.md
CHANGED
@@ -1,3 +1,83 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## What is Open Source Text Shield (OTS)?
|
2 |
+
|
3 |
+
OTS (Open Source Text Shield) is an AI-driven solution designed to enhance the security of telecom networks by detecting and filtering spam and phishing messages in real time. This application leverages both BERT and FastText models for efficient text classification.
|
4 |
+
|
5 |
+
## Getting Started
|
6 |
+
|
7 |
+
### Prerequisites
|
8 |
+
|
9 |
+
- Python 3.8 or later
|
10 |
+
- FastAPI
|
11 |
+
- pydantic
|
12 |
+
- torch
|
13 |
+
- transformers
|
14 |
+
- fasttext
|
15 |
+
|
16 |
+
You can install the necessary libraries using pip:
|
17 |
+
|
18 |
+
```bash
|
19 |
+
pip install fastapi uvicorn pydantic torch transformers fasttext
|
20 |
+
```
|
21 |
+
|
22 |
+
### Installation
|
23 |
+
|
24 |
+
Clone the repository to your local machine:
|
25 |
+
|
26 |
+
```bash
|
27 |
+
git clone https://github.com/TelecomsXChangeAPi/OpenTextShield/
|
28 |
+
|
29 |
+
```
|
30 |
+
|
31 |
+
Navigate to the cloned directory:
|
32 |
+
|
33 |
+
```bash
|
34 |
+
cd OpenTextShield
|
35 |
+
```
|
36 |
+
|
37 |
+
### Running the Application
|
38 |
+
|
39 |
+
Start the server from the directory that contains `app.py` by running:
|
40 |
+
|
41 |
+
```bash
|
42 |
+
uvicorn app:app --host 0.0.0.0 --port 8001
|
43 |
+
```
|
44 |
+
|
45 |
+
The application will be available at `http://localhost:8001`.
|
46 |
+
|
47 |
+
### Usage
|
48 |
+
|
49 |
+
#### Predicting SMS
|
50 |
+
|
51 |
+
To predict if an SMS is spam, phishing, or ham (regular message), send a POST request to `/predict/` with a JSON body containing the SMS text and the model to use (`bert` or `fasttext`).
|
52 |
+
|
53 |
+
Example using curl:
|
54 |
+
|
55 |
+
```bash
|
56 |
+
curl -X POST "http://localhost:8001/predict/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"text\":\"Your SMS content here\",\"model\":\"bert\"}"
|
57 |
+
```
|
58 |
+
|
59 |
+
#### Feedback Loop
|
60 |
+
|
61 |
+
To provide feedback on predictions, send a POST request to `/feedback-loop/` with relevant feedback data.
|
62 |
+
|
63 |
+
Example using curl:
|
64 |
+
|
65 |
+
```bash
|
66 |
+
curl -X POST "http://localhost:8001/feedback-loop/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"content\":\"SMS content\",\"feedback\":\"Your feedback here\",\"thumbs_up\":true,\"thumbs_down\":false,\"user_id\":\"user123\",\"model\":\"bert\"}"
|
67 |
+
```
|
68 |
+
|
69 |
+
#### Download Feedback
|
70 |
+
|
71 |
+
To download the feedback data for a specific model, send a GET request to `/download-feedback/{model_name}`.
|
72 |
+
|
73 |
+
Example using curl:
|
74 |
+
|
75 |
+
```bash
|
76 |
+
curl -X GET "http://localhost:8001/download-feedback/bert"
|
77 |
+
```
|
78 |
+
|
79 |
+
|
80 |
+
## Acknowledgements
|
81 |
+
|
82 |
+
Special thanks to the team at TelecomsXChange (TCXC) for their invaluable contributions to this project.
|
83 |
+
|
api-interface/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## What is Open Source Text Shield (OTS)?
|
2 |
+
|
3 |
+
OTS (Open Source Text Shield) is an AI-driven solution designed to enhance the security of telecom networks by detecting and filtering spam and phishing messages in real time. This application leverages both BERT and FastText models for efficient text classification.
|
4 |
+
|
5 |
+
## Getting Started
|
6 |
+
|
7 |
+
### Prerequisites
|
8 |
+
|
9 |
+
- Python 3.8 or later
|
10 |
+
- FastAPI
|
11 |
+
- pydantic
|
12 |
+
- torch
|
13 |
+
- transformers
|
14 |
+
- fasttext
|
15 |
+
|
16 |
+
You can install the necessary libraries using pip:
|
17 |
+
|
18 |
+
```bash
|
19 |
+
pip install fastapi uvicorn pydantic torch transformers fasttext
|
20 |
+
```
|
21 |
+
|
22 |
+
### Installation
|
23 |
+
|
24 |
+
Clone the repository to your local machine:
|
25 |
+
|
26 |
+
```bash
|
27 |
+
git clone https://github.com/TelecomsXChangeAPi/OpenTextShield/
|
28 |
+
|
29 |
+
```
|
30 |
+
|
31 |
+
Navigate to the cloned directory:
|
32 |
+
|
33 |
+
```bash
|
34 |
+
cd OpenTextShield
|
35 |
+
```
|
36 |
+
|
37 |
+
### Running the Application
|
38 |
+
|
39 |
+
Start the server from the directory that contains `app.py` by running:
|
40 |
+
|
41 |
+
```bash
|
42 |
+
uvicorn app:app --host 0.0.0.0 --port 8001
|
43 |
+
```
|
44 |
+
|
45 |
+
The application will be available at `http://localhost:8001`.
|
46 |
+
|
47 |
+
### Usage
|
48 |
+
|
49 |
+
#### Predicting SMS
|
50 |
+
|
51 |
+
To predict if an SMS is spam, phishing, or ham (regular message), send a POST request to `/predict/` with a JSON body containing the SMS text and the model to use (`bert` or `fasttext`).
|
52 |
+
|
53 |
+
Example using curl:
|
54 |
+
|
55 |
+
```bash
|
56 |
+
curl -X POST "http://localhost:8001/predict/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"text\":\"Your SMS content here\",\"model\":\"bert\"}"
|
57 |
+
```
|
58 |
+
|
59 |
+
#### Feedback Loop
|
60 |
+
|
61 |
+
To provide feedback on predictions, send a POST request to `/feedback-loop/` with relevant feedback data.
|
62 |
+
|
63 |
+
Example using curl:
|
64 |
+
|
65 |
+
```bash
|
66 |
+
curl -X POST "http://localhost:8001/feedback-loop/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"content\":\"SMS content\",\"feedback\":\"Your feedback here\",\"thumbs_up\":true,\"thumbs_down\":false,\"user_id\":\"user123\",\"model\":\"bert\"}"
|
67 |
+
```
|
68 |
+
|
69 |
+
#### Download Feedback
|
70 |
+
|
71 |
+
To download the feedback data for a specific model, send a GET request to `/download-feedback/{model_name}`.
|
72 |
+
|
73 |
+
Example using curl:
|
74 |
+
|
75 |
+
```bash
|
76 |
+
curl -X GET "http://localhost:8001/download-feedback/bert"
|
77 |
+
```
|
78 |
+
|
79 |
+
|
80 |
+
## Acknowledgements
|
81 |
+
|
82 |
+
Special thanks to the team at TelecomsXChange (TCXC) for their invaluable contributions to this project.
|
83 |
+
|
api-interface/app.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from typing import Optional
|
3 |
+
from fastapi import FastAPI, HTTPException, Request, Depends
|
4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
from pydantic import BaseModel
|
6 |
+
from datetime import datetime
|
7 |
+
from fastapi.responses import FileResponse
|
8 |
+
import torch
|
9 |
+
from transformers import BertTokenizer, BertForSequenceClassification
|
10 |
+
import fasttext
|
11 |
+
import csv
|
12 |
+
|
13 |
+
app = FastAPI()
|
14 |
+
|
15 |
+
# Allowed SMPP, SMSC or any External IP addresses
|
16 |
+
# Entries are compared against request.client.host, which is normally a
# numeric address rather than a hostname, so "localhost" alone does not cover
# loopback callers that connect over IPv6; "::1" is listed for that case.
ALLOWED_IPS = {"127.0.0.1", "::1", "localhost", "10.0.0.1"}
|
17 |
+
|
18 |
+
# Add CORSMiddleware to allow cross-origin requests
|
19 |
+
app.add_middleware(
|
20 |
+
CORSMiddleware,
|
21 |
+
allow_origins=["*"],
|
22 |
+
allow_credentials=True,
|
23 |
+
allow_methods=["*"],
|
24 |
+
allow_headers=["*"],
|
25 |
+
)
|
26 |
+
|
27 |
+
|
28 |
+
# Load BERT model
|
29 |
+
# Model locations are resolved relative to this file instead of the process's
# working directory, so the server starts regardless of where it is launched.
from pathlib import Path

_PROJECT_DIR = Path(__file__).resolve().parent.parent

bert_model_path = str(_PROJECT_DIR / "BERT" / "training" / "bert_sms_spam_phishing_model")
bert_model = BertForSequenceClassification.from_pretrained(bert_model_path)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model.eval()  # inference only

# Load FastText model (the path is passed as a plain string)
fasttext_model_path = str(_PROJECT_DIR / "FastText" / "training" / "ots_sms_model_v1.1.bin")
fasttext_model = fasttext.load_model(fasttext_model_path)
|
37 |
+
|
38 |
+
class SMS(BaseModel):
    """Request body of the ``/predict/`` endpoint."""

    # Message text to classify.
    text: str
    # Which classifier to use: "bert" or "fasttext".
    model: str
|
41 |
+
|
42 |
+
class Feedback(BaseModel):
    """Request body of the ``/feedback-loop/`` endpoint."""

    # The message the feedback is about.
    content: str
    # Free-text feedback.
    feedback: str
    # Rating flags; each is stored as "Yes"/"No" in the feedback CSV.
    thumbs_up: bool
    thumbs_down: bool
    # Optional identifier of the user giving feedback.
    user_id: Optional[str] = None
    # Model the feedback refers to: "bert" or "fasttext".
    model: str
|
49 |
+
|
50 |
+
def preprocess_text(text, tokenizer, max_len=128):
    """Tokenize one message into fixed-length PyTorch tensors.

    The tokenizer is asked to pad/truncate to `max_len` and to include an
    attention mask; its result is returned unchanged.
    """
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_tensors': 'pt',
        'truncation': True,
    }
    return tokenizer.encode_plus(text, **tokenizer_options)
|
56 |
+
def write_feedback(feedback_data, model_name):
    """Append one feedback row to ``feedback_<model_name>.csv``.

    A header row is written first when the file is still empty.
    """
    header = ["Timestamp", "UserID", "Content", "Feedback", "Thumbs Up", "Thumbs Down"]
    target = f"feedback_{model_name}.csv"
    with open(target, mode="a", newline="", encoding="utf-8") as handle:
        # A position of 0 right after opening for append means nothing has
        # been written to the file yet.
        needs_header = handle.tell() == 0
        pending_rows = [header, feedback_data] if needs_header else [feedback_data]
        csv.writer(handle).writerows(pending_rows)
|
63 |
+
|
64 |
+
def verify_ip_address(request: Request):
    """FastAPI dependency that rejects callers that are not allow-listed.

    Args:
        request: The incoming request; only its client address is inspected.

    Returns:
        The client's host string when it is listed in ``ALLOWED_IPS``.

    Raises:
        HTTPException: 403 when the client address is unknown or not listed.
    """
    client = request.client
    # request.client can be None when the server supplies no peer address;
    # treat that as "not allowed" instead of failing with AttributeError.
    if client is None or client.host not in ALLOWED_IPS:
        raise HTTPException(status_code=403, detail="Access denied")
    return client.host
|
69 |
+
|
70 |
+
|
71 |
+
# Route to predict SMS using specified model - supported "bert" , "fasttext"
|
72 |
+
|
73 |
+
@app.post("/predict/", dependencies=[Depends(verify_ip_address)])
async def predict_sms(sms: SMS):
    """Classify an SMS with the requested model ("bert" or "fasttext").

    Returns:
        A dict with the predicted ``label``, its ``probability``, the
        ``processing_time`` in seconds and the model metadata fields.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only, or
            when the model name is not supported.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()

    # Whitespace-only input carries no content to classify either.
    if not sms.text or not sms.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")

    if sms.model == "bert":
        inputs = preprocess_text(sms.text, bert_tokenizer)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
        label_map = {0: 'ham', 1: 'spam', 2: 'phishing'}
        label = label_map[prediction]
        probability = torch.nn.functional.softmax(outputs.logits, dim=1).max().item()
        model_info = {"Model_Name": "OTS_bert", "Model_Version": "1.1.4"}
    elif sms.model == "fasttext":
        # fastText's predict() handles one line at a time and raises on text
        # that contains a newline, so multi-line messages are flattened.
        single_line_text = sms.text.replace('\n', ' ')
        labels, probabilities = fasttext_model.predict(single_line_text, k=1)  # k=1: single label
        label = labels[0].replace('__label__', '')
        # Convert to a built-in float so the value serializes as plain JSON.
        probability = float(probabilities[0])
        model_info = {"Model_Name": "OTS_fasttext", "Model_Version": "1.1.4"}
    else:
        raise HTTPException(status_code=400, detail="Invalid model type")

    processing_time = time.perf_counter() - start_time
    return {
        "label": label,
        "probability": probability,
        "processing_time": processing_time,
        **model_info,
        # Author and training date are the same for both models, so they are
        # set once here rather than repeated per branch.
        "Model_Author": "TelecomsXChange (TCXC)",
        "Last_Training": "2023-12-21"  # Update accordingly
    }
|
111 |
+
|
112 |
+
# Feedback loop and download feedback
|
113 |
+
|
114 |
+
@app.post("/feedback-loop/", dependencies=[Depends(verify_ip_address)])
async def feedback_loop(feedback: Feedback):
    """Record one feedback entry in the CSV log of the model it refers to.

    Raises:
        HTTPException: 400 when the model is neither "bert" nor "fasttext".
    """
    def as_yes_no(flag):
        # Booleans are stored as the strings 'Yes' / 'No'.
        return 'Yes' if flag else 'No'

    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        feedback.user_id,
        feedback.content,
        feedback.feedback,
        as_yes_no(feedback.thumbs_up),
        as_yes_no(feedback.thumbs_down),
    ]

    # Guard clause: reject unknown models before touching any file.
    if feedback.model not in ["bert", "fasttext"]:
        raise HTTPException(status_code=400, detail="Invalid model type")

    write_feedback(row, feedback.model)
    return {"message": "Feedback received"}
|
127 |
+
|
128 |
+
|
129 |
+
@app.get("/download-feedback/{model_name}", dependencies=[Depends(verify_ip_address)])
async def download_feedback(model_name: str):
    """Return the feedback CSV collected for `model_name`.

    Raises:
        HTTPException: 400 when the model name is not "bert" or "fasttext";
            404 when no feedback file exists yet for that model.
    """
    # Local import: os is not among this module's top-level imports.
    import os

    # Only the two known names are accepted, so the path parameter can never
    # select an arbitrary file.
    if model_name not in ["bert", "fasttext"]:
        raise HTTPException(status_code=400, detail="Invalid model name")

    file_path = f"feedback_{model_name}.csv"
    # The file is only created by the first feedback submission; answer with
    # a clear 404 instead of letting the missing file surface as a server
    # error while the response is being sent.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="No feedback recorded for this model")

    return FileResponse(file_path, media_type='text/csv', filename=file_path)
|
136 |
+
|
137 |
+
if __name__ == "__main__":
|
138 |
+
import uvicorn
|
139 |
+
uvicorn.run(app, host="0.0.0.0", port=8001)
|