|
import streamlit as st |
|
from transformers import pipeline |
|
from textblob import TextBlob |
|
|
|
|
|
|
|
|
|
# Page chrome: configure the layout, then render the heading and the prompt.
st.set_page_config(
    initial_sidebar_state="expanded",
    layout="wide",
)
st.title("Spamd: Turkish Spam Detector")
st.markdown("Enter the text you'd like to analyze for spam.")

# Single-line text box; its current value is bound to `text` and classified
# further down in the script.
text = st.text_input(label="Enter the text you'd like to analyze for spam.")
|
|
# Spamd_SpamDetector_Turkish_BERT_22.09.2022.ipynb
#
# Original file is located at
#     https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
#
# NOTE: this provenance header is a `#` comment on purpose. As a bare
# triple-quoted string in the middle of the script it is not a docstring, and
# Streamlit's "magic" feature writes standalone literals to the page (only
# strings at the top of a file or function are skipped), so the notebook
# header would be rendered inside the app.
|
|
|
|
|
|
|
import torch |
|
import numpy as np |
|
|
|
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification

# Tokenizer of the uncased Turkish BERT checkpoint (presumably the base model
# the spam classifier was fine-tuned from -- confirm against the model card).
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

# predict() reads `output.logits`, which only exists on models that carry a
# classification head. `AutoModel` loads the bare encoder, whose output has no
# `logits` attribute, so the sequence-classification auto class is required.
model = AutoModelForSequenceClassification.from_pretrained("NimaKL/spamd_model")
|
|
|
# Module-level lists left empty here; nothing in the visible part of the
# script fills or reads them (predict() builds its own local tensors).
token_id, attention_masks = [], []
|
|
|
def preprocessing(input_text, tokenizer, max_length=32):
    '''
    Tokenize `input_text` into fixed-length PyTorch tensors.

    Parameters:
      - input_text: the text to encode.
      - tokenizer: a callable Hugging Face tokenizer (it is invoked directly,
        i.e. `tokenizer(input_text, ...)`).
      - max_length: number of tokens in the encoded sequence, special tokens
        included. Shorter inputs are padded up to it and longer inputs are
        truncated down to it. Defaults to 32.

    Returns whatever the tokenizer returns; for Hugging Face tokenizers this is
    a <class transformers.tokenization_utils_base.BatchEncoding> with the
    following fields:
      - input_ids: list of token ids
      - token_type_ids: list of token type ids
      - attention_mask: list of indices (0,1) specifying which tokens should be
        considered by the model (return_attention_mask = True).
    '''
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and `truncation=True` is requested explicitly instead of relying on the
    # library's warn-and-fallback behaviour when only `max_length` is given.
    return tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
|
|
|
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
|
|
|
def namestr(obj, namespace):
    '''
    Return every key of `namespace` whose value is the very same object as
    `obj` (identity comparison, not equality), in the namespace's iteration
    order. The result is an empty list when nothing matches.
    '''
    matching_names = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matching_names.append(candidate)
    return matching_names
|
|
|
def predict(new_sentence):
    '''
    Classify `new_sentence` as 'Spam' or 'Normal' and show the result in the
    Streamlit page.

    Parameters:
      - new_sentence: the text to classify; it is tokenized with the
        module-level `tokenizer` through preprocessing().

    Returns the predicted label, either 'Spam' or 'Normal'. The same label is
    also written to the page together with the input text.
    '''
    encoding = preprocessing(new_sentence, tokenizer)

    # The inputs are moved to `device`, so the weights must live there too;
    # otherwise a CUDA host fails with a device-mismatch error. Module.to() is
    # in-place and does nothing when the model is already on that device.
    model.to(device)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(input_ids, token_type_ids=None, attention_mask=attention_mask)

    # Arg-max over the flattened logits of the single encoded sentence;
    # index 1 is treated as the spam class.
    predicted_index = torch.argmax(output.logits).item()
    prediction = 'Spam' if predicted_index == 1 else 'Normal'

    # namestr() lists the module-level names bound to this exact string object,
    # so the page shows which global variable supplied the input.
    st.write('Input', namestr(new_sentence, globals()), ': \n', new_sentence)
    st.write('Predicted Class: ', prediction, '\n----------------------------------\n')

    return prediction
|
|
|
# Streamlit re-runs the script on every interaction, including the first page
# load when the text box is still empty. Only classify once there is real
# input, so no prediction is shown for an empty or whitespace-only box.
if text and text.strip():
    predict(text)
|
|
|
|
|
|
|
# Citation for the Turkish BERT model (BERTurk). Kept as `#` comments: a bare
# triple-quoted string at this position is not a docstring, and Streamlit's
# "magic" feature would write it to the page.
#
# @software{stefan_schweter_2020_3770924,
#   author    = {Stefan Schweter},
#   title     = {BERTurk - BERT models for Turkish},
#   month     = apr,
#   year      = 2020,
#   publisher = {Zenodo},
#   version   = {1.0.0},
#   doi       = {10.5281/zenodo.3770924},
#   url       = {https://doi.org/10.5281/zenodo.3770924}
# }