import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.metrics import roc_auc_score
import re
from stqdm import stqdm
from typing import *
import string
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, AdamW
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import streamlit as st
st.markdown("### Welcome to toxicity! A showcase for the TweetBert Model!")
#config constants
SEED = 42
EPOCHS = 2
SEQ_SIZE = 150
BATCH_SIZE = 32
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"
#import all data
data=pd.read_csv('./data/train.csv',engine='python',encoding='utf-8')
test=pd.read_csv('./data/test.csv',engine='python',encoding='utf-8')
test_labels=pd.read_csv('./data/test_labels.csv',engine='python',encoding='utf-8')
sub=pd.read_csv('./data/sample_submission.csv',engine='python',encoding='utf-8')
#setup data
data.drop(columns='id',inplace=True)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
#text processing
def cleanString(comment: str) -> str:
    #contractions
    comment = re.sub('n\'t', ' not', comment)
    comment = re.sub('\'m', ' am', comment)
    comment = re.sub('\'ve', ' have', comment)
    comment = re.sub('\'s', ' is', comment)
    #newline
    comment = comment.replace('\n', ' \n ')
    #collapse 4+ repeated *, !, ? or ' down to three (re.sub, not str.replace, so the pattern is treated as a regex)
    comment = re.sub(r'([*!?\'])\1\1{2,}', r'\1\1\1', comment)
    #drop digits, then everything except letters and %, then drop % itself
    comment = re.sub(r'[0-9]', '', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    comment = re.sub('%', '', comment)
    #collapse repeated spaces and stray newlines, then trim
    comment = re.sub(r' +', ' ', comment)
    comment = re.sub(r'\n', ' ', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment
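# Illustrative example (not part of the original script), assuming the re.sub fixes above:
# cleanString("I don't think it's 100% true!!!!") -> "I do not think it is true"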
data.comment_text=data.comment_text.map(cleanString)
#tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
token_lens = []
#for txt in stqdm(data.comment_text,desc="tokenizing"):
# tokens = tokenizer.encode(txt, max_length=512)
# token_lens.append(len(tokens))
#^the code above is commented out to keep the HF Spaces runtime short; it measures token lengths of comment_text for the BERT model
#test train split
df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
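# roughly 85% train, 7.5% validation, 7.5% test of the labeled data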
#set pytorch dataset
class CommentDataset(Dataset):
    def __init__(self, comments, targets, tokenizer, max_len):
        assert len(comments) == len(targets)
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}
def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = CommentDataset(comments=df.comment_text.to_numpy(),
                        targets=df[labels].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)
    return DataLoader(ds, batch_size=batch_size)
#helper function to set seed
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
set_seed(SEED)
#gpu usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(labels)
config.problem_type = "multi_label_classification"
config.classifier_dropout = 0.2
config.return_dict = True
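# With problem_type="multi_label_classification", the Hugging Face sequence-classification head
# computes BCEWithLogitsLoss when labels are passed, which is why the train/eval loops cast targets to float.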
#start from pretrained DistilBERT weights (rather than a randomly initialized model) for fine-tuning
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model = model.to(device)
train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
# training function
def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """
    hf = huggingface.
    """
    model.train()
    for batch in stqdm(data_loader, desc="training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
#evaluate: returns stacked logits and the mean loss (the training loop keeps the model with the best validation loss)
def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    model.eval()
    losses = []
    score = None
    for idx, batch in enumerate(stqdm(data_loader, desc="evaluating")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        with torch.set_grad_enabled(False):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            if idx == 0:
                score = outputs.logits.cpu()
            else:
                score = torch.cat((score, outputs.logits.cpu()))
            losses.append(outputs.loss.item())
    return score, np.mean(losses)
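# Note for the (commented) training loop below: roc_auc_score is fed raw logits, which is fine because
# AUC is rank-based and sigmoid is monotonic, so logits and probabilities give the same score.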
optimizer = AdamW(model.parameters(), lr=2e-5)
best_val_loss = 9999.
print('====START TRAINING====')
# actual training loop (commented out; the fine-tuned model is loaded from the Hub below)
#for epoch in stqdm(range(EPOCHS)):
# print('-' * 10)
# train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
# _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
# val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
# y_pred_np = val_pred.numpy()
# val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
# if val_loss < best_val_loss:
# best_val_loss = val_loss
#torch.save(model.state_dict(), 'distill_bert.pt')
# print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
# once the model is saved and pushed to the Hub there is no need to re-run training :)
#PUSH MODEL TO HF
#from huggingface_hub import notebook_login
#notebook_login()
#model.push_to_hub("tweetbert")
#tokenizer.push_to_hub("tweetbert")
#LOAD MODEL
model = AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")
model = model.to(device)
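# Minimal single-comment inference sketch (illustrative only, commented out so it does not affect the app;
# the input string is a made-up example):
# enc = tokenizer("you are a wonderful person", return_tensors="pt", truncation=True, max_length=SEQ_SIZE)
# with torch.no_grad():
#     logits = model(input_ids=enc["input_ids"].to(device), attention_mask=enc["attention_mask"].to(device)).logits
# probs = dict(zip(labels, torch.sigmoid(logits).squeeze(0).cpu().tolist()))  # per-label probabilities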
#TEST MODEL
#test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
#print('====TEST RESULT====')
#print(f'Log loss: {test_loss:.5}')
#y_pred_np = test_pred.numpy()
#test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
#print(f'ROC AUC: {test_auc:.5}')
#test_src_id = test.iloc[:, 0]
#test.drop(columns='id', inplace=True)
#test_labels.drop(columns='id', inplace=True)
#test_src = pd.concat((test, test_labels), axis=1)
#MAKE PREDICTIONS
#test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
#prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
#prediction = torch.sigmoid(prediction).numpy()
#SAVE RESULTS INTO SUBMISSION DATAFRAME
#sub[labels] = prediction
#sub.insert(1,"tweet",data.comment_text,True)
#sub.to_csv("sub.csv", encoding='utf-8', index=False)
#^the code above is commented out; predictions were saved to a csv to reduce wait/compute time on HF
sub=pd.read_csv('./data/sub.csv',engine='python',encoding='utf-8')
sub = sub.drop(columns="id")
st.dataframe(sub)
st.write("Here is a table of the tweets and the likelihood of each label :) (loaded from a precomputed csv out of respect for your time)")