Spaces:

NimaKL
/

spamd

Build error

App Files Files Community

spamd / app.py

NimaKL

Update app.py

0429fcd over 2 years ago

raw

history blame

3.11 kB

	import streamlit as st
	from transformers import pipeline
	from textblob import TextBlob




	# Page configuration has to be the first Streamlit call in the script.
	st.set_page_config(
	    layout="wide",
	    initial_sidebar_state="expanded",
	)
	st.title(body="Spamd: Turkish Spam Detector")
	st.markdown(body="Enter the text you'd like to analyze for spam.")
	# The value typed here is what the script later passes to predict().
	text = st.text_input(label="Enter the text you'd like to analyze for spam.")

	"""Spamd_SpamDetector_Turkish_BERT_22.09.2022.ipynb

	Original file is located at
	https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
	"""



	import torch
	import numpy as np

	# Tokenizer of the Turkish BERT checkpoint used for preprocessing.
	from transformers import AutoTokenizer
	tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

	# predict() reads `output.logits`. The bare `AutoModel` returns encoder
	# hidden states without a classification head (no `.logits` attribute), so
	# the checkpoint must be loaded through the sequence-classification class.
	from transformers import AutoModel  # noqa: F401  (existing import, left in place)
	from transformers import AutoModelForSequenceClassification
	model = AutoModelForSequenceClassification.from_pretrained("NimaKL/spamd_model")

	# Not referenced by the code visible in this file; kept so that any other
	# user of these module-level names keeps working.
	token_id = []
	attention_masks = []

	def preprocessing(input_text, tokenizer, max_length=32):
	    """Tokenize ``input_text`` into fixed-length tensors.

	    Args:
	        input_text: The sentence to encode.
	        tokenizer: A callable tokenizer (for example one returned by
	            ``AutoTokenizer.from_pretrained``).
	        max_length: Length every encoded sequence is padded or truncated to.
	            Defaults to 32, the value that was previously hard-coded.

	    Returns:
	        Whatever ``tokenizer`` returns. For a transformers tokenizer this is a
	        ``BatchEncoding`` with the following fields:
	        - input_ids: token ids
	        - token_type_ids: token type ids
	        - attention_mask: 0/1 flags specifying which tokens should be
	          considered by the model (``return_attention_mask=True``)
	    """
	    return tokenizer(
	        input_text,
	        add_special_tokens=True,
	        max_length=max_length,
	        # `pad_to_max_length=True` is deprecated; this is its replacement.
	        padding='max_length',
	        # Ask for truncation explicitly instead of relying on the implicit
	        # fallback that is applied (with a warning) when only max_length is set.
	        truncation=True,
	        return_attention_mask=True,
	        return_tensors='pt',
	    )

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	# Input tensors are sent to `device` before the forward pass, so the model's
	# weights have to live on the same device or the call fails on a CUDA host.
	# Written as an assignment (Module.to returns the module itself).
	model = model.to(device)
	# Used for printing the name of the variables. Removing it will not interrupt the project.
	def namestr(obj, namespace):
	    """Return every key of ``namespace`` whose value is the very object ``obj``.

	    Matching is by identity (``is``), not equality, so distinct objects that
	    merely compare equal are not reported.
	    """
	    return [name for name in namespace if namespace[name] is obj]

	def predict(new_sentence):
	    """Classify ``new_sentence`` and write the outcome to the Streamlit page.

	    The sentence is encoded with the module-level ``tokenizer``, passed through
	    the module-level ``model`` without gradient tracking, and the class with
	    the highest logit is reported: index 1 is shown as 'Spam', anything else
	    as 'Normal'.

	    Args:
	        new_sentence: The text to classify.

	    Returns:
	        None. The input and the predicted class are written with ``st.write``.
	    """
	    # We need Token IDs and Attention Mask for inference on the new sentence
	    encoding = preprocessing(new_sentence, tokenizer)

	    # Send the inputs to wherever the model's weights actually are, so the
	    # forward pass cannot hit a CPU/GPU mismatch.
	    target_device = model.device
	    test_ids = encoding['input_ids'].to(target_device)
	    test_attention_mask = encoding['attention_mask'].to(target_device)

	    # Forward pass, calculate logit predictions
	    with torch.no_grad():
	        output = model(test_ids, token_type_ids=None, attention_mask=test_attention_mask)

	    # One sentence per call, so the argmax over the class axis holds a single index.
	    predicted_class = output.logits.argmax(dim=-1).item()
	    prediction = 'Spam' if predicted_class == 1 else 'Normal'

	    st.write('Input', namestr(new_sentence, globals()), ': \n', new_sentence)
	    # Remove the namestr(new_sentence, globals()) in case of an error
	    st.write('Predicted Class: ', prediction, '\n----------------------------------\n')

	# Streamlit runs the whole script on page load and on every interaction;
	# skip inference until the user has actually entered some text.
	if text.strip():
	    predict(text)



	# Citation for the BERTurk base model. Kept as comments: a bare string
	# literal at this point in the script would be rendered on the page by
	# Streamlit's "magic".
	#
	# @software{stefan_schweter_2020_3770924,
	# author = {Stefan Schweter},
	# title = {BERTurk - BERT models for Turkish},
	# month = apr,
	# year = 2020,
	# publisher = {Zenodo},
	# version = {1.0.0},
	# doi = {10.5281/zenodo.3770924},
	# url = {https://doi.org/10.5281/zenodo.3770924}
	# }