ajamous committed
Commit 0e2fe46
1 Parent(s): 0f460da

add OTS project

.DS_Store ADDED
Binary file (6.15 kB)
 
BERT/.DS_Store ADDED
Binary file (6.15 kB)
 
BERT/tests/stressTest_500.py ADDED
@@ -0,0 +1,58 @@
+ import torch
+ from transformers import BertTokenizer, BertForSequenceClassification
+ import time
+ import random
+
+ def load_model(model_path):
+     model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3, local_files_only=True)
+     return model
+
+ def preprocess_text(text, tokenizer, max_len=128):
+     encoding = tokenizer.encode_plus(
+         text,
+         add_special_tokens=True,
+         max_length=max_len,
+         return_token_type_ids=False,
+         padding='max_length',
+         return_attention_mask=True,
+         return_tensors='pt',
+         truncation=True
+     )
+     return encoding
+
+ def predict(text, model, tokenizer):
+     model.eval()
+     with torch.no_grad():
+         inputs = preprocess_text(text, tokenizer)
+         input_ids = inputs['input_ids']
+         attention_mask = inputs['attention_mask']
+         outputs = model(input_ids, attention_mask=attention_mask)
+         prediction = torch.argmax(outputs.logits, dim=1).item()
+     return prediction
+
+ def generate_random_text(base_text, index):
+     return f"{base_text} - Message {index} - Random {random.randint(1, 10000)}"
+
+ def main():
+     model_path = '/Users/ameedjamous/programming/OpenTextShield/src/BERT/training/bert_sms_spam_phishing_model'
+     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+     model = load_model(model_path)
+
+     # Generate unique sample texts
+     base_text = "Sample SMS text"
+     sample_texts = [generate_random_text(base_text, i) for i in range(500)]
+
+     # Stress test with progress logging
+     start_time = time.time()
+
+     for i, text in enumerate(sample_texts):
+         predict(text, model, tokenizer)
+         if (i + 1) % 50 == 0:
+             print(f"Processed {i + 1} messages...")
+
+     end_time = time.time()
+     total_time = end_time - start_time
+     print(f"Processed 500 messages in {total_time:.2f} seconds")
+
+ if __name__ == '__main__':
+     main()
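
The stress test above tokenizes and classifies the 500 messages one at a time. For comparison, a minimal sketch of batched inference is shown here; the batch size of 32 and the helper name `predict_batch` are illustrative assumptions, not part of this commit.

```python
# Minimal sketch of batched inference for the same kind of stress test.
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def predict_batch(texts, model, tokenizer, max_len=128, batch_size=32):
    model.eval()
    predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            # Tokenize a whole chunk at once instead of one message at a time.
            enc = tokenizer(chunk, padding=True, truncation=True,
                            max_length=max_len, return_tensors='pt')
            logits = model(**enc).logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())
    return predictions
```
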
BERT/tests/test_sms.py ADDED
@@ -0,0 +1,57 @@
+ import torch
+ from transformers import BertTokenizer, BertForSequenceClassification
+ import time
+
+ def load_model(model_path):
+     model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3, local_files_only=True)
+     return model
+
+ def preprocess_text(text, tokenizer, max_len=128):
+     encoding = tokenizer.encode_plus(
+         text,
+         add_special_tokens=True,
+         max_length=max_len,
+         return_token_type_ids=False,
+         padding='max_length',
+         return_attention_mask=True,
+         return_tensors='pt',
+         truncation=True
+     )
+     return encoding
+
+ def predict(text, model, tokenizer):
+     start_time = time.time()
+     model.eval()
+     with torch.no_grad():
+         inputs = preprocess_text(text, tokenizer)
+         input_ids = inputs['input_ids']
+         attention_mask = inputs['attention_mask']
+         outputs = model(input_ids, attention_mask=attention_mask)
+         prediction = torch.argmax(outputs.logits, dim=1).item()
+     end_time = time.time()
+     processing_time = end_time - start_time
+     return prediction, processing_time
+
+ def main():
+     model_path = '/Users/ameedjamous/programming/OpenTextShield/src/BERT/training/bert_sms_spam_phishing_model'
+     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     # Load the trained model
+     model = load_model(model_path)
+
+     # Sample text to classify
+     sample_text = "Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's, Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's, Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 3838329832983092823098320983209823902389028239038329083290. Text FA to 87121 to receive entry question(std txt rate). T&C's apply 08452810075over18's"
+
+     # Get prediction and processing time
+     prediction, processing_time = predict(sample_text, model, tokenizer)
+
+     # Convert numerical prediction back to label
+     label_map = {0: 'ham', 1: 'spam', 2: 'phishing'}
+     print(f"The provided text is predicted as: {label_map[prediction]}")
+
+     # Determine the emoji based on processing time
+     emoji = "😊" if processing_time <= 0.2 else "😔"
+     print(f"Processing time: {processing_time:.2f} seconds {emoji}")
+
+ if __name__ == '__main__':
+     main()
BERT/training/.DS_Store ADDED
Binary file (6.15 kB)
 
BERT/training/bert_sms_spam_phishing_model/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "bert-base-uncased",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.36.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
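
This config keeps the generic `LABEL_0`/`LABEL_1`/`LABEL_2` names, while the test scripts hard-code `{0: 'ham', 1: 'spam', 2: 'phishing'}`. A minimal sketch of writing the real label names into the config before saving, so downstream code could read them from `config.json` instead of duplicating the map; the step itself is not part of this commit, and the directory name below simply mirrors the one used here.

```python
from transformers import BertForSequenceClassification

# Assumed to run after training; the directory name mirrors this commit.
model = BertForSequenceClassification.from_pretrained('bert_sms_spam_phishing_model', num_labels=3)
label_map = {0: 'ham', 1: 'spam', 2: 'phishing'}
model.config.id2label = label_map
model.config.label2id = {name: idx for idx, name in label_map.items()}
model.save_pretrained('bert_sms_spam_phishing_model')

# A prediction script could then use:
# label = model.config.id2label[predicted_class_id]
```
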
BERT/training/bert_sms_spam_phishing_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a5868743b6b8725be1192dfaba15ff7677d5a70525bbf83443ce5001f893390
+ size 437961724
BERT/training/phishing_urls.xlsx ADDED
Binary file (30.6 kB)
 
BERT/training/train.py ADDED
@@ -0,0 +1,126 @@
+ import torch
+ from torch.utils.data import DataLoader, Dataset
+ from transformers import BertTokenizer, BertForSequenceClassification, AdamW
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ import pandas as pd
+ import chardet
+
+ # Dataset Class
+ class TextDataset(Dataset):
+     def __init__(self, texts, labels, tokenizer, max_len):
+         self.texts = texts
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.texts)
+
+     def __getitem__(self, item):
+         text = str(self.texts[item])
+         label = self.labels[item]
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             return_token_type_ids=False,
+             padding='max_length',
+             return_attention_mask=True,
+             return_tensors='pt',
+             truncation=True
+         )
+
+         return {
+             'text': text,
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Data Loader Function
+ def create_data_loader(df, tokenizer, max_len, batch_size):
+     ds = TextDataset(
+         texts=df.text.to_numpy(),
+         labels=df.label.to_numpy(),
+         tokenizer=tokenizer,
+         max_len=max_len
+     )
+
+     return DataLoader(ds, batch_size=batch_size, num_workers=4)
+
+ def main():
+     # Detect encoding
+     with open('sms_spam_phishing_dataset.csv', 'rb') as f:
+         result = chardet.detect(f.read())
+     file_encoding = result['encoding']
+
+     print("Detected encoding:", file_encoding)
+
+     # Load Dataset
+     df = pd.read_csv('sms_spam_phishing_dataset.csv', encoding=file_encoding)
+     df['label'] = df['label'].map({'ham': 0, 'spam': 1, 'phishing': 2})  # Convert labels to numerical
+
+     # Parameters
+     BATCH_SIZE = 16
+     MAX_LEN = 128
+     EPOCHS = 3
+
+     # Split Data
+     train_df, test_df = train_test_split(df, test_size=0.1)
+     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     # Create Data Loaders
+     train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
+     test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
+
+     # Load BERT Model
+     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
+
+     # Optimizer
+     optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
+
+     # Device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+
+     # Training Loop
+     for epoch in range(EPOCHS):
+         model.train()
+         for batch in train_data_loader:
+             input_ids = batch['input_ids'].to(device)
+             attention_mask = batch['attention_mask'].to(device)
+             labels = batch['labels'].to(device)
+
+             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+             loss = outputs.loss
+             loss.backward()
+             optimizer.step()
+             optimizer.zero_grad()
+
+         print(f"Epoch {epoch + 1}/{EPOCHS} completed.")
+
+     # Evaluate
+     model.eval()
+     predictions, true_labels = [], []
+     for batch in test_data_loader:
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         labels = batch['labels'].to(device)
+
+         with torch.no_grad():
+             outputs = model(input_ids, attention_mask=attention_mask)
+
+         logits = outputs.logits
+         predictions.extend(torch.argmax(logits, dim=1).tolist())
+         true_labels.extend(labels.tolist())
+
+     accuracy = accuracy_score(true_labels, predictions)
+     print(f"Test Accuracy: {accuracy * 100:.2f}%")
+
+     # Save the Model
+     model.save_pretrained('bert_sms_spam_phishing_model')
+
+ if __name__ == '__main__':
+     main()
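
A side note on the optimizer import: `AdamW` from `transformers` (used above with `correct_bias=False`) is deprecated in favour of `torch.optim.AdamW` and is dropped from newer releases. A minimal sketch of the equivalent setup if that import fails; note that `torch.optim.AdamW` always applies bias correction, so there is no direct `correct_bias=False` switch.

```python
# Sketch only: swaps the deprecated transformers.AdamW for torch.optim.AdamW.
import torch
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
```
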
FastText/.DS_Store ADDED
Binary file (6.15 kB)
 
FastText/tests/stressTest_500.py ADDED
@@ -0,0 +1,38 @@
+ import requests
+ import concurrent.futures
+ import time
+
+ # API endpoint and headers
+ url = 'http://localhost:8000/predict/'
+ headers = {
+     'accept': 'application/json',
+     'Content-Type': 'application/json',
+ }
+ data = '{"text":"Hello World"}'
+
+ # Function to send a single API request
+ def send_request():
+     try:
+         response = requests.post(url, headers=headers, data=data)
+         return response.status_code
+     except requests.exceptions.RequestException as e:
+         return str(e)
+
+ # Function to send 500 requests and measure the time taken
+ def send_500_requests_and_measure_time():
+     start_time = time.time()  # Start time
+
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         # Creating a list of 500 future objects
+         futures = [executor.submit(send_request) for _ in range(500)]
+         # Waiting for all the futures to complete
+         concurrent.futures.wait(futures)
+
+     end_time = time.time()  # End time
+
+     duration = end_time - start_time
+     print(f"Completed sending 500 requests in {duration:.2f} seconds")
+
+ # Send 500 requests and measure time
+ send_500_requests_and_measure_time()
+
FastText/training/ots_sms_model_v1.1.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed771b7699999fdc20c9cac7aa3f8dfd29b409ea826925f3a715e9c1a37b6abd
+ size 810920792
FastText/training/test_sms.py ADDED
@@ -0,0 +1,53 @@
+ import fasttext
+
+ # Load the trained model
+ model = fasttext.load_model('ots_sms_model_v1.1.bin')
+
+ # Example SMS message
+
+ # Phishing Examples
+ # message = "URGENT: Your bank account has been compromised. To secure your funds, click here immediately: http://bit.ly/2FAKEurl"
+ # message = "Warning: Your email has been flagged for unusual activity. Confirm your credentials now at http://bit.ly/com32323 to avoid suspension."
+ message = "Paket im Terminal 2: http://different.com/7DH5000022"  ## phishing (German: "Package in Terminal 2")
+
+ # Verification Code Examples
+ # message = "Google Verification Code: 452183. Do not share this code with anyone."
+ # message = "Your WhatsApp code: 729-113. Don't share this code with others."
+
+ # Spam Examples
+ # message = "Congratulations! You've won a $500 Amazon gift card. Claim now at www.win-gifts.com. Hurry, offer ends soon!"
+ # message = "Exclusive Offer: Get 90% off on your next purchase at SuperShoes. Visit www.someshoes.com/offer10. Unsubscribe at stopDIA.com"
+
+ # Regular Communication (Ham) Examples
+ # message = "Hey, are we still on for dinner tonight at 7?"
+ # message = "Can you send me the presentation slides? I'd like to review them before the meeting."
+
+ # More Phishing Examples
+ # message = "Final Notice: Your subscription with NetMedia is about to expire. Renew now to avoid service interruption: www.netmedia-renewal.com"
+ # message = "Your package could not be delivered due to unpaid customs fee. Pay now at www.other_domain.com/438484sjdjdjdsjjdsjdsjdjsjdsjdjsdjs"
+
+ # More Spam Examples
+ # message = "Hot Summers Sale! Buy one get one free on all beachwear at TrendyStyles. Shop now at www.trendystyles.com/sale"
+ # message = "Get a free cell data booster with every new phone plan at MobileNet. Call us at 800-555-0199 or visit www.mobilenet.com"
+
+ # More Verification Code Examples
+ # message = "Your Tinder verification code is 394857. Please enter this code to continue."
+ # message = "Dropbox: Your security code is 842159. Enter this code to complete the setup."
+
+ # More Regular Communication (Ham) Examples
+ # message = "Reminder: Your dentist appointment is scheduled for tomorrow at 10 AM."
+ # message = "Great meeting today! Let's catch up next week to discuss further steps."
+
+ # Test data (phishing) that the model has never seen before
+
+ # message = "ALERT: Unusual sign-in detected on your Netflix account. Verify immediately at http://netflix-verify-login.com to prevent suspension."
+ # message = "Your PayPal account has been temporarily restricted. Please update your information at http://paypal-secure-update.com to restore access."
+ # message = "Your Apple ID is due for verification. Failure to verify may lead to account suspension. Visit http://appleids-verify-n0w.com promptly."
+ message = "Security Alert: We've detected unusual activity on your Apple account. Please verify your information immediately at http://apple-resetpassword.com to avoid suspension."
+
+
+
+ # Predict the label
+ label, probability = model.predict(message)
+
+ print(f"Label: {label[0]}, Probability: {probability[0]}")
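
`model.predict(message)` returns only the top label. When checking borderline messages, a sketch like the following (assuming the same model file as above) asks fastText for every label's score by passing `k=-1`; it is not part of this commit.

```python
# Minimal sketch: print the full score distribution instead of only the top label.
import fasttext

model = fasttext.load_model('ots_sms_model_v1.1.bin')
message = "Security Alert: We've detected unusual activity on your Apple account."
labels, probabilities = model.predict(message, k=-1)  # k=-1 returns all labels
for label, prob in zip(labels, probabilities):
    print(f"{label.replace('__label__', '')}: {prob:.4f}")
```
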
FastText/training/train.py ADDED
@@ -0,0 +1,23 @@
+ import fasttext
+ import pandas as pd
+
+
+ # Try different encodings if UTF-8 does not work
+ try:
+     data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='utf-8')
+ except UnicodeDecodeError:
+     data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='ISO-8859-1')  # Try latin1 encoding
+
+
+ # Preprocess data: format as fastText expects (each line: "__label__<label> <text>")
+ data['ft_format'] = data.apply(lambda row: f'__label__{row["Label"]} {row["Message"]}', axis=1)
+
+ # Save preprocessed data
+ data['ft_format'].to_csv('ft_data.txt', index=False, header=False)
+
+ # Train a supervised model
+ model = fasttext.train_supervised(input='ft_data.txt', epoch=25, lr=1.0, wordNgrams=2)
+
+ # Save the model
+ model.save_model('ots_sms_model_v1.1.bin')
+
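
The script trains on the full `ft_data.txt` without holding anything out, so it reports no accuracy figure. Below is a minimal sketch of a held-out evaluation using fastText's built-in `test()`; the 90/10 split and the `ft_train.txt` / `ft_valid.txt` file names are assumptions, not part of this commit.

```python
import random
import fasttext

# Split the formatted data produced by train.py into train/validation files.
with open('ft_data.txt', encoding='utf-8') as f:
    lines = [line.rstrip('\n') + '\n' for line in f if line.strip()]
random.shuffle(lines)
split = int(0.9 * len(lines))
with open('ft_train.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines[:split])
with open('ft_valid.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines[split:])

model = fasttext.train_supervised(input='ft_train.txt', epoch=25, lr=1.0, wordNgrams=2)
n, precision, recall = model.test('ft_valid.txt')  # sample count, precision@1, recall@1
print(f"Samples: {n}  P@1: {precision:.3f}  R@1: {recall:.3f}")
```
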
README.md CHANGED
@@ -1,3 +1,83 @@
- ---
- license: mit
- ---
+ ## What is Open Source Text Shield (OTS)?
+
+ OTS (Open Source Text Shield) is an AI-driven solution designed to enhance the security of telecom networks by detecting and filtering spam and phishing messages in real time. This application leverages both BERT and FastText models for efficient text classification.
+
+ ## Getting Started
+
+ ### Prerequisites
+
+ - Python 3.8 or later
+ - FastAPI
+ - pydantic
+ - torch
+ - transformers
+ - fasttext
+
+ You can install the necessary libraries using pip:
+
+ ```bash
+ pip install fastapi pydantic torch transformers fasttext
+ ```
+
+ ### Installation
+
+ Clone the repository to your local machine:
+
+ ```bash
+ git clone https://github.com/TelecomsXChangeAPi/OpenTextShield/
+
+ ```
+
+ Navigate to the cloned directory:
+
+ ```bash
+ cd OpenTextShield
+ ```
+
+ ### Running the Application
+
+ Start the server by running:
+
+ ```bash
+ uvicorn main:app --host 0.0.0.0 --port 8001
+ ```
+
+ The application will be available at `http://localhost:8001`.
+
+ ### Usage
+
+ #### Predicting SMS
+
+ To predict whether an SMS is spam, phishing, or ham (a regular message), send a POST request to `/predict/` with a JSON body containing the SMS text and the model to use (`bert` or `fasttext`).
+
+ Example using curl:
+
+ ```bash
+ curl -X POST "http://localhost:8001/predict/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"text\":\"Your SMS content here\",\"model\":\"bert\"}"
+ ```
+
+ #### Feedback Loop
+
+ To provide feedback on predictions, send a POST request to `/feedback-loop/` with the relevant feedback data.
+
+ Example using curl:
+
+ ```bash
+ curl -X POST "http://localhost:8001/feedback-loop/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"content\":\"SMS content\",\"feedback\":\"Your feedback here\",\"thumbs_up\":true,\"thumbs_down\":false,\"user_id\":\"user123\",\"model\":\"bert\"}"
+ ```
+
+ #### Download Feedback
+
+ To download the feedback data for a specific model, send a GET request to `/download-feedback/{model_name}`.
+
+ Example using curl:
+
+ ```bash
+ curl -X GET "http://localhost:8001/download-feedback/bert"
+ ```
+
+
+ ## Acknowledgements
+
+ Special thanks to the team at TelecomsXChange (TCXC) for their invaluable contributions to this project.
+
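
The curl examples in this README map directly onto a plain Python client. A minimal sketch of the same `/predict/` call, assuming the server from "Running the Application" is listening on `localhost:8001` and the `requests` package is installed:

```python
import requests

response = requests.post(
    "http://localhost:8001/predict/",
    json={"text": "Your SMS content here", "model": "bert"},
    headers={"accept": "application/json"},
    timeout=30,
)
response.raise_for_status()
print(response.json())  # label, probability, processing time and model metadata
```
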
api-interface/README.md ADDED
@@ -0,0 +1,83 @@
+ ## What is Open Source Text Shield (OTS)?
+
+ OTS (Open Source Text Shield) is an AI-driven solution designed to enhance the security of telecom networks by detecting and filtering spam and phishing messages in real time. This application leverages both BERT and FastText models for efficient text classification.
+
+ ## Getting Started
+
+ ### Prerequisites
+
+ - Python 3.8 or later
+ - FastAPI
+ - pydantic
+ - torch
+ - transformers
+ - fasttext
+
+ You can install the necessary libraries using pip:
+
+ ```bash
+ pip install fastapi pydantic torch transformers fasttext
+ ```
+
+ ### Installation
+
+ Clone the repository to your local machine:
+
+ ```bash
+ git clone https://github.com/TelecomsXChangeAPi/OpenTextShield/
+
+ ```
+
+ Navigate to the cloned directory:
+
+ ```bash
+ cd OpenTextShield
+ ```
+
+ ### Running the Application
+
+ Start the server by running:
+
+ ```bash
+ uvicorn main:app --host 0.0.0.0 --port 8001
+ ```
+
+ The application will be available at `http://localhost:8001`.
+
+ ### Usage
+
+ #### Predicting SMS
+
+ To predict whether an SMS is spam, phishing, or ham (a regular message), send a POST request to `/predict/` with a JSON body containing the SMS text and the model to use (`bert` or `fasttext`).
+
+ Example using curl:
+
+ ```bash
+ curl -X POST "http://localhost:8001/predict/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"text\":\"Your SMS content here\",\"model\":\"bert\"}"
+ ```
+
+ #### Feedback Loop
+
+ To provide feedback on predictions, send a POST request to `/feedback-loop/` with the relevant feedback data.
+
+ Example using curl:
+
+ ```bash
+ curl -X POST "http://localhost:8001/feedback-loop/" -H "accept: application/json" -H "Content-Type: application/json" -d "{\"content\":\"SMS content\",\"feedback\":\"Your feedback here\",\"thumbs_up\":true,\"thumbs_down\":false,\"user_id\":\"user123\",\"model\":\"bert\"}"
+ ```
+
+ #### Download Feedback
+
+ To download the feedback data for a specific model, send a GET request to `/download-feedback/{model_name}`.
+
+ Example using curl:
+
+ ```bash
+ curl -X GET "http://localhost:8001/download-feedback/bert"
+ ```
+
+
+ ## Acknowledgements
+
+ Special thanks to the team at TelecomsXChange (TCXC) for their invaluable contributions to this project.
+
api-interface/app.py ADDED
@@ -0,0 +1,139 @@
+ import time
+ from typing import Optional
+ from fastapi import FastAPI, HTTPException, Request, Depends
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from datetime import datetime
+ from fastapi.responses import FileResponse
+ import torch
+ from transformers import BertTokenizer, BertForSequenceClassification
+ import fasttext
+ import csv
+
+ app = FastAPI()
+
+ # Allowed SMPP, SMSC or any other external IP addresses
+ ALLOWED_IPS = {"127.0.0.1", "localhost", "10.0.0.1"}
+
+ # Add CORSMiddleware to allow cross-origin requests
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ # Load BERT model
+ bert_model_path = "../BERT/training/bert_sms_spam_phishing_model"
+ bert_model = BertForSequenceClassification.from_pretrained(bert_model_path)
+ bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ bert_model.eval()
+
+ # Load FastText model
+ fasttext_model_path = "../FastText/training/ots_sms_model_v1.1.bin"
+ fasttext_model = fasttext.load_model(fasttext_model_path)
+
+ class SMS(BaseModel):
+     text: str
+     model: str  # "bert" or "fasttext"
+
+ class Feedback(BaseModel):
+     content: str
+     feedback: str
+     thumbs_up: bool
+     thumbs_down: bool
+     user_id: Optional[str] = None
+     model: str  # "bert" or "fasttext"
+
+ def preprocess_text(text, tokenizer, max_len=128):
+     return tokenizer.encode_plus(
+         text, add_special_tokens=True, max_length=max_len,
+         padding='max_length', return_attention_mask=True,
+         return_tensors='pt', truncation=True
+     )
+ def write_feedback(feedback_data, model_name):
+     file_name = f"feedback_{model_name}.csv"
+     with open(file_name, mode="a", newline="", encoding="utf-8") as file:
+         writer = csv.writer(file)
+         if file.tell() == 0:
+             writer.writerow(["Timestamp", "UserID", "Content", "Feedback", "Thumbs Up", "Thumbs Down"])
+         writer.writerow(feedback_data)
+
+ def verify_ip_address(request: Request):
+     client_host = request.client.host
+     if client_host not in ALLOWED_IPS:
+         raise HTTPException(status_code=403, detail="Access denied")
+     return client_host
+
+
+ # Route to predict an SMS using the specified model - supported: "bert", "fasttext"
+
+ @app.post("/predict/", dependencies=[Depends(verify_ip_address)])
+ async def predict_sms(sms: SMS):
+     start_time = time.time()
+
+     if not sms.text:
+         raise HTTPException(status_code=400, detail="Text is empty")
+
+     if sms.model == "bert":
+         inputs = preprocess_text(sms.text, bert_tokenizer)
+         with torch.no_grad():
+             outputs = bert_model(**inputs)
+         prediction = torch.argmax(outputs.logits, dim=1).item()
+         label_map = {0: 'ham', 1: 'spam', 2: 'phishing'}
+         label = label_map[prediction]
+         probability = torch.nn.functional.softmax(outputs.logits, dim=1).max().item()
+         model_info = {"Model_Name": "OTS_bert", "Model_Version": "1.1.4"}
+     elif sms.model == "fasttext":
+         label, probability = fasttext_model.predict(sms.text, k=1)  # Ensure k=1 for single-label prediction
+         label = label[0].replace('__label__', '')
+         probability = probability[0]  # Extract the probability value
+         model_info = {
+             "Model_Name": "OTS_fasttext",
+             "Model_Version": "1.1.4",
+             "Model_Author": "TelecomsXChange (TCXC)",
+             "Last_Training": "2023-12-21"
+         }
+     else:
+         raise HTTPException(status_code=400, detail="Invalid model type")
+
+     end_time = time.time()
+     return {
+         "label": label,
+         "probability": probability,
+         "processing_time": end_time - start_time,
+         **model_info,
+         "Model_Author": "TelecomsXChange (TCXC)",
+         "Last_Training": "2023-12-21"  # Update accordingly
+     }
+
+ # Feedback loop and download feedback
+
+ @app.post("/feedback-loop/", dependencies=[Depends(verify_ip_address)])
+ async def feedback_loop(feedback: Feedback):
+     thumbs_up = 'Yes' if feedback.thumbs_up else 'No'
+     thumbs_down = 'Yes' if feedback.thumbs_down else 'No'
+     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     feedback_data = [timestamp, feedback.user_id, feedback.content, feedback.feedback, thumbs_up, thumbs_down]
+
+     if feedback.model in ["bert", "fasttext"]:
+         write_feedback(feedback_data, feedback.model)
+     else:
+         raise HTTPException(status_code=400, detail="Invalid model type")
+
+     return {"message": "Feedback received"}
+
+
+ @app.get("/download-feedback/{model_name}", dependencies=[Depends(verify_ip_address)])
+ async def download_feedback(model_name: str):
+     if model_name in ["bert", "fasttext"]:
+         file_path = f"feedback_{model_name}.csv"
+     else:
+         raise HTTPException(status_code=400, detail="Invalid model name")
+     return FileResponse(file_path, media_type='text/csv', filename=file_path)
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8001)