# -*- coding: utf-8 -*-
"""Spamd_SpamDetector_Turkish_BERT_22.09.2022.ipynb

Original file is located at
    https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
"""

# CUDA and PyTorch versions must match: https://pytorch.org/get-started/locally/

import os
import random

import numpy as np
import pandas as pd
import streamlit as st
import torch
from tabulate import tabulate
from transformers import AutoTokenizer

st.title("Spamd: Turkish Spam Detector")

# Load the dataset; the CSV uses ';' as the separator. Group 2 ("normal") is
# remapped to 0 so the labels are binary: 1 = spam, 0 = normal.
df = pd.read_csv('TurkishSMSCollection.csv', encoding='utf-8',
                 on_bad_lines='skip', usecols=['Group', 'Message'], sep=';')
df['Group'] = df['Group'].replace(2, 0)
print(df)

text = df.Message.values
labels = df.Group.values
print(len(text), len(labels))

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Surface CUDA errors at the call site instead of asynchronously.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
    '''
    Returns a dictionary with the following fields:
      - input_ids: list of token ids
      - token_type_ids: list of token type ids
      - attention_mask: list of indices (0, 1) specifying which tokens should
        be considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=32,
        padding='max_length',   # replaces the deprecated pad_to_max_length
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample.'''
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table,
                   headers=['Tokens', 'Token IDs', 'Attention Mask'],
                   tablefmt='fancy_grid'))

print_rand_sentence_encoding()
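# Quick sanity-check sketch (not in the original notebook): encode one
# hypothetical SMS and inspect the tensors preprocessing() returns.
_demo = preprocessing("Tebrikler, büyük ödülü kazandınız!", tokenizer)  # hypothetical text
print(_demo['input_ids'].shape)       # torch.Size([1, 32]): padded/truncated to max_length
print(_demo['attention_mask'].shape)  # torch.Size([1, 32]): 1 = real token, 0 = padding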
from sklearn.model_selection import train_test_split
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 32

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoaders
train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set),
    batch_size=batch_size)

validation_dataloader = DataLoader(
    val_set,
    sampler=SequentialSampler(val_set),
    batch_size=batch_size)

def b_tp(preds, labels):
    '''Returns True Positives (TP): count of predictions of class 1 that are correct.'''
    return sum([p == l and p == 1 for p, l in zip(preds, labels)])

def b_fp(preds, labels):
    '''Returns False Positives (FP): count of predictions of class 1 whose actual class is 0.'''
    return sum([p != l and p == 1 for p, l in zip(preds, labels)])

def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of predictions of class 0 that are correct.'''
    return sum([p == l and p == 0 for p, l in zip(preds, labels)])

def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of predictions of class 0 whose actual class is 1.'''
    return sum([p != l and p == 0 for p, l in zip(preds, labels)])

def b_metrics(preds, labels):
    '''
    Returns the following metrics:
      - accuracy    = (TP + TN) / N
      - precision   = TP / (TP + FP)
      - recall      = TP / (TP + FN)
      - specificity = TN / (TN + FP)
    '''
    preds = np.argmax(preds, axis=1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False)

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=5e-5,
                              eps=1e-08)

from tqdm import trange

# Run on GPU when available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
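# Toy sanity check (a sketch, not part of the training pipeline): with these
# hypothetical logits, argmax predicts [1, 0, 1, 0] against labels [1, 0, 0, 1],
# giving TP = TN = FP = FN = 1, so every metric equals 0.5.
_toy_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.2, 0.8], [0.9, 0.1]])
_toy_labels = np.array([1, 0, 0, 1])
print(b_metrics(_toy_logits, _toy_labels))  # (0.5, 0.5, 0.5, 0.5)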
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 5

for _ in trange(epochs, desc='Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) != 0; ignore nan
        if b_precision != 'nan':
            val_precision.append(b_precision)
        # Update recall only when (tp + fn) != 0; ignore nan
        if b_recall != 'nan':
            val_recall.append(b_recall)
        # Update specificity only when (tn + fp) != 0; ignore nan
        if b_specificity != 'nan':
            val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy) / len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision) / len(val_precision))
          if len(val_precision) > 0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall) / len(val_recall))
          if len(val_recall) > 0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity) / len(val_specificity))
          if len(val_specificity) > 0 else '\t - Validation Specificity: NaN')
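# Optional sketch (not in the original notebook): persist the fine-tuned model
# so a later run can reload it instead of retraining on every launch.
# The directory name is a placeholder; flip SAVE_MODEL to True to enable it.
SAVE_MODEL = False
if SAVE_MODEL:
    output_dir = './spamd-berturk-finetuned'  # hypothetical path
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Reload later with:
    # model = BertForSequenceClassification.from_pretrained(output_dir)
    # tokenizer = AutoTokenizer.from_pretrained(output_dir)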
# Used for printing the name of a variable. Removing it will not interrupt the project.
def namestr(obj, namespace):
    return [name for name in namespace if namespace[name] is obj]

def predict(new_sentence):
    # We need token IDs and an attention mask for inference on the new sentence
    test_ids = []
    test_attention_mask = []

    # Apply the tokenizer
    encoding = preprocessing(new_sentence, tokenizer)

    # Extract IDs and attention mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim=0)
    test_attention_mask = torch.cat(test_attention_mask, dim=0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
        output = model(test_ids.to(device),
                       token_type_ids=None,
                       attention_mask=test_attention_mask.to(device))

    prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).item() == 1 else 'Normal'

    # Remove namestr(new_sentence, globals()) in case of an error
    print('Input', namestr(new_sentence, globals()), ': \n', new_sentence)
    print('Predicted Class: ', prediction, '\n----------------------------------\n')
    return prediction

# Textbox for the text the user is entering
st.subheader("Enter the text you'd like to analyze for spam.")
user_text = st.text_input('Enter text')  # the input is stored in this variable

if user_text:
    st.write('Predicted Class:', predict(user_text))

'''
@software{stefan_schweter_2020_3770924,
  author    = {Stefan Schweter},
  title     = {BERTurk - BERT models for Turkish},
  month     = apr,
  year      = 2020,
  publisher = {Zenodo},
  version   = {1.0.0},
  doi       = {10.5281/zenodo.3770924},
  url       = {https://doi.org/10.5281/zenodo.3770924}
}
'''
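# Console usage sketch (not in the original notebook): two hypothetical
# messages to probe the classifier; flip RUN_DEMO to True to execute.
RUN_DEMO = False
if RUN_DEMO:
    predict("Tebrikler! 5000 TL kazandiniz, hemen tiklayin.")  # likely 'Spam'
    predict("Yarin saat 5'te bulusalim mi?")                   # likely 'Normal'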