# AdsClassifier/src/models/models_utils.py
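"""
Model definitions and training utilities for the AdsClassifier project:
metric helpers (ROC AUC, best F1), a GRU classifier over pretrained word
embeddings, a DistilBERT-based classifier, and the AdClassifier wrapper
that combines four trained models for joint prediction.
"""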
import os
import pickle
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, precision_recall_curve
import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertModel
def get_roc_aucs(y, probas):
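    """
    Return ROC AUC scores for the given class probabilities.

    For a binary target this is a single-element list with the ROC AUC of the
    positive class; for a multiclass target it contains a one-vs-rest AUC for
    every class followed by the macro-averaged one-vs-one AUC.
    """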
y_onehot = pd.get_dummies(y)
roc_auc_scores = []
if y_onehot.shape[1] > 2:
for i in range(y_onehot.shape[1]):
            roc_auc_scores.append(roc_auc_score(y_onehot.iloc[:, i], probas[:, i]))
roc_auc_scores.append(roc_auc_score(y, probas, multi_class='ovo', average='macro'))
else:
roc_auc_scores.append(roc_auc_score(y, probas[:, 1]))
return roc_auc_scores
def get_max_f1_score(y, probas):
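    """
    Sweep the precision-recall curve of the positive class and return the
    maximum F1 score together with the threshold that achieves it (both
    wrapped in single-element lists).
    """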
if probas.shape[1] != 2:
        raise ValueError('Expected probabilities for exactly 2 classes')
f1_score = []
threshold = []
p, r, t = precision_recall_curve(y, probas[:, 1])
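    # the small constant guards against division by zero when precision == recall == 0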
f1_scores = 2 * p * r / (p + r + 0.001)
threshold.append(t[np.argmax(f1_scores)])
f1_score.append(np.max(f1_scores))
return f1_score, threshold
class RNN(nn.Module):
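    """
    GRU text classifier over a frozen, pretrained embedding matrix.

    The final hidden states of all layers (and both directions, if
    bidirectional) are concatenated and passed to a linear output layer.
    """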
def __init__(self, vectors, n_of_words, n_of_classes, num_layers, bidirectional):
dim = vectors.shape[1]
d = 2 if bidirectional else 1
super().__init__()
self.emb = nn.Embedding(n_of_words, dim)
        self.emb.load_state_dict({'weight': torch.tensor(vectors, dtype=torch.float32)})
self.emb.weight.requires_grad = False
self.gru = nn.GRU(input_size=dim, hidden_size=dim, batch_first=True,
num_layers=num_layers, bidirectional=bidirectional)
self.linear = nn.Linear(dim * num_layers * d, n_of_classes)
def forward(self, batch):
emb = self.emb(batch)
_, last_state = self.gru(emb)
        # (num_layers * num_directions, batch, dim) -> (batch, num_layers * num_directions * dim)
        last_state = torch.permute(last_state, (1, 0, 2)).reshape(batch.shape[0], -1)
        out = self.linear(last_state)
        return out
class DistilBERTClass(torch.nn.Module):
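    """DistilBERT encoder with a linear classification head on the first ([CLS]) token."""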
def __init__(self, n_classes):
super().__init__()
self.l1 = DistilBertModel.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational')
self.linear = torch.nn.Linear(768, n_classes)
def forward(self, input_ids, attention_mask, token_type_ids):
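        # token_type_ids is accepted for interface compatibility; DistilBERT does not use it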
output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
pooler = hidden_state[:, 0]
output = self.linear(pooler)
return output
class BaseClassifier:
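    """
    Shared vocabulary handling, batching, training and evaluation logic.

    Subclasses must provide initialize_nnet(), which sets self.nnet and
    self.optimizer.
    """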
def __init__(self, batch_size=16, epochs=100):
self.batch_size = batch_size
self.epochs = epochs
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
def preprocess_with_random_initialization(self, train_tokens):
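        # build the vocabulary from the training tokens and initialize 300-dimensional
        # embeddings with small random values; index 0 is padding, index 1 is unknown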
self.pad_idx = 0
self.unk_idx = 1
set_of_words = set()
for tokens_string in train_tokens:
set_of_words.update(tokens_string)
self.idx_to_word = ['PADDING', 'UNK'] + list(set_of_words)
self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
self.amount_of_words = len(self.idx_to_word)
self.vectors = np.zeros((len(self.idx_to_word), 300))
self.vectors[0, :] = np.zeros(300)
self.vectors[1:len(self.idx_to_word), :] = (np.random.rand(len(self.idx_to_word) - 1, 300) - 0.5) / 300
def preprocess(self, vectors_file_path):
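        # build the vocabulary and embedding matrix from a pretrained word2vec file
        # (300-dimensional vectors assumed); index 0 is padding, index 1 is unknown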
self.emb = KeyedVectors.load_word2vec_format(vectors_file_path)
self.pad_idx = 0
self.unk_idx = 1
self.idx_to_word = ['PADDING', 'UNK'] + list(self.emb.index_to_key)
self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
self.amount_of_words = len(self.idx_to_word)
self.vectors = np.zeros((len(self.idx_to_word), 300))
self.vectors[0, :] = np.zeros(300)
self.vectors[1, :] = (np.random.rand(300) - 0.5) / 300
for i in range(2, len(self.idx_to_word)):
self.vectors[i, :] = self.emb.get_vector(self.idx_to_word[i])
def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
reinitialize=True, stop_epochs=None, show_logs=False):
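        """
        Train the network with cross-entropy loss.

        When reinitialize is True, a fresh network is built for the classes in
        y_train. If test data is given, test loss, ROC AUC and best F1 are
        tracked and plotted every epoch, and training stops early once the test
        loss has not improved for stop_epochs consecutive epochs.
        """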
if reinitialize:
self.n_of_classes = y_train.nunique()
self.initialize_nnet()
        self.print_test = test_tokens is not None and y_test is not None
self.stop_epochs = stop_epochs
train_scores = []
self.train_scores_mean = []
self.test_scores = []
self.test_aucs = []
self.test_f1 = []
criterion = nn.CrossEntropyLoss()
for epoch in tqdm.tqdm(range(self.epochs)):
self.epoch = epoch
self.nnet.train()
train_batches = self.batch_generator(train_tokens, y_train)
test_batches = self.batch_generator(test_tokens, y_test)
for i, batch in tqdm.tqdm(
enumerate(train_batches),
total=len(train_tokens) // self.batch_size
):
pred = self.nnet(batch['tokens'])
loss = criterion(pred, batch['labels'])
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
if show_logs and i % 400 == 0:
train_score = criterion(self.nnet(batch['tokens']), batch['labels'])
print(train_score.item())
train_scores.append(train_score.item())
if show_logs:
self.train_scores_mean.append(sum(train_scores) / len(train_scores))
train_scores = []
if self.print_test:
test_pred_prob = torch.tensor([], device='cpu')
with torch.no_grad():
self.nnet.eval()
for batch in test_batches:
test_batch_pred_prob = self.nnet(batch['tokens'])
test_batch_pred_prob_cpu = test_batch_pred_prob.to('cpu')
test_pred_prob = torch.cat((test_pred_prob, test_batch_pred_prob_cpu), 0)
test_score = criterion(test_pred_prob, torch.tensor(y_test.values, device='cpu'))
self.test_scores.append(test_score.item())
                test_pred_probas = F.softmax(test_pred_prob, dim=1).detach().cpu().numpy()
self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])
self.print_metrics()
if self.early_stopping_check():
break
def count_tokens(self, tokens):
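        # collect corpus statistics: per-token frequencies and the total number of tokens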
self.words_counter = Counter()
self.amount_of_tokens = 0
for s in tokens:
self.words_counter.update(s)
self.amount_of_tokens += len(s)
def index_tokens(self, tokens_string):
return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]
def fill_with_pads(self, tokens):
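        # pad every token sequence to the length of the longest one given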
tokens = deepcopy(tokens)
max_len = 0
for tokens_string in tokens:
max_len = max(max_len, len(tokens_string))
for tokens_string in tokens:
for i in range(len(tokens_string), max_len):
tokens_string.append(self.pad_idx)
return tokens
def as_matrix(self, tokens):
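        # map token strings to vocabulary indices and pad to a rectangular matrix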
tokens = deepcopy(tokens)
for j, s in enumerate(tokens):
tokens[j] = self.index_tokens(s)
tokens = self.fill_with_pads(tokens)
return tokens
def batch_generator(self, tokens, labels=None):
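        # yield batches of padded index tensors (with label tensors when labels are given)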
for i in range(0, len(tokens), self.batch_size):
batch_tokens = tokens[i: i + self.batch_size]
            if labels is not None:
batch_labels = torch.tensor(labels.values[i: i + self.batch_size],
dtype=torch.long,
device=self.device)
else:
batch_labels = None
batch_tokens_idx = torch.tensor(self.as_matrix(batch_tokens),
dtype=torch.int,
device=self.device)
if len(batch_tokens_idx.size()) == 1:
batch_tokens_idx = torch.unsqueeze(batch_tokens_idx, 0)
batch = {
'tokens': batch_tokens_idx,
'labels': batch_labels
}
yield batch
    def print_metrics(self):
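        # plot ROC AUC, F1 and loss curves; the train-loss curve assumes fit() was
        # called with show_logs=True, which populates train_scores_mean each epoch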
if self.print_test:
print(f'epoch {self.epoch}/{self.epochs}')
print('auc', self.test_aucs[-1])
print('score', self.test_scores[-1])
print('f1 score', self.test_f1[-1])
legend_labels = []
if self.n_of_classes > 2:
for i in range(self.n_of_classes):
legend_labels.append(f'Class {i}')
legend_labels.append('General')
plt.figure(figsize=(5, 15))
plt.clf()
plt.subplot(3, 1, 1)
plt.plot(np.arange(1, self.epoch + 2), self.test_aucs)
plt.grid()
plt.title('Test ROC AUC')
plt.xlabel('Num. of epochs')
plt.ylabel('ROC AUC')
plt.legend(legend_labels)
plt.subplot(3, 1, 2)
plt.plot(np.arange(1, self.epoch + 2), self.test_f1)
plt.grid()
plt.title('Test F1-score')
plt.xlabel('Num. of epochs')
plt.ylabel('F1-score')
plt.legend(legend_labels)
plt.subplot(3, 1, 3)
plt.plot(np.arange(1, self.epoch + 2), self.train_scores_mean, label='Train loss')
plt.plot(np.arange(1, self.epoch + 2), self.test_scores, label='Test loss')
plt.title('Loss')
plt.xlabel('Num. of epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.draw()
else:
plt.figure(figsize=(5, 15))
plt.plot(np.arange(1, self.epoch + 2), self.train_scores_mean, label='Train loss')
plt.title('Loss')
plt.xlabel('Num. of epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()
def early_stopping_check(self):
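        # stop once the test loss has not dropped below its value stop_epochs epochs ago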
if self.stop_epochs is None or self.stop_epochs >= len(self.test_scores):
return False
else:
            first_score = np.array(self.test_scores)[-self.stop_epochs - 1]
last_scores = np.array(self.test_scores)[-self.stop_epochs:]
return np.all(last_scores >= first_score)
    def predict_proba(self, tokens, labels=None):
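        # run the network in eval mode and return softmax class probabilities as a numpy array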
batches = self.batch_generator(tokens, labels)
pred_probas = torch.tensor([], device=self.device)
with torch.no_grad():
self.nnet.eval()
for batch in batches:
batch_prob = self.nnet(batch['tokens'])
pred_probas = torch.cat((pred_probas, batch_prob))
        return F.softmax(pred_probas, dim=1).detach().cpu().numpy()
class RNNClassifier(BaseClassifier):
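    """GRU-based classifier (see RNN) trained on top of word2vec-style embeddings."""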
    def __init__(self, batch_size=16, epochs=100,
                 num_layers=1, bidirectional=False):
        super().__init__(batch_size=batch_size, epochs=epochs)
        self.num_layers = num_layers
        self.bidirectional = bidirectional
def initialize_nnet(self):
self.nnet = RNN(self.vectors, self.amount_of_words,
n_of_classes=self.n_of_classes,
num_layers=self.num_layers,
bidirectional=self.bidirectional).to(self.device)
self.optimizer = torch.optim.Adam(self.nnet.parameters())
def save_model(self, filepath):
with open(filepath, 'wb') as file:
torch.save(self.nnet.state_dict(), file)
def load_model(self, filepath, amount_of_words):
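        # the zero embedding matrix is only a placeholder: the real weights are
        # restored from the saved state dict; binary classification is assumed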
self.amount_of_words = amount_of_words
self.vectors = np.zeros((amount_of_words, 300))
self.n_of_classes = 2
self.nnet = RNN(self.vectors, self.amount_of_words,
n_of_classes=self.n_of_classes,
num_layers=self.num_layers,
bidirectional=self.bidirectional).to(self.device)
self.nnet.load_state_dict(torch.load(filepath, map_location=self.device))
class DBERTClassifier(BaseClassifier):
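    """
    Classifier built on DeepPavlov's distilled Russian conversational DistilBERT.

    Overrides batching and the training loop so that attention masks (and
    token type ids, unused by DistilBERT) are passed to the network.
    """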
    def __init__(self, batch_size=16, epochs=100):
        super().__init__(batch_size=batch_size, epochs=epochs)
def initialize_nnet(self):
self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
        # alternative checkpoint: 'DeepPavlov/rubert-base-cased'
self.tokenizer = DistilBertTokenizer.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational',
do_lower_case=True)
def batch_generator(self, tokens, labels=None):
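        # detokenize each example back to a string and encode it with the DistilBERT
        # tokenizer (truncated/padded to 512 tokens); labels are attached when given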
for i in range(0, len(tokens), self.batch_size):
batch_tokens = tokens[i: i + self.batch_size]
batch_tokens = [' '.join(s) for s in batch_tokens]
            if labels is not None:
batch_labels = torch.tensor(labels.values[i: i + self.batch_size],
dtype=torch.long,
device=self.device)
else:
batch_labels = None
            if len(batch_tokens) == 1:
                inputs = self.tokenizer.encode_plus(
                    batch_tokens[0],
                    None,
                    add_special_tokens=True,
                    max_length=512,
                    truncation=True,
                    padding='max_length',
                    return_token_type_ids=True
                )
else:
inputs = self.tokenizer.batch_encode_plus(
batch_tokens,
add_special_tokens=True,
max_length=512,
truncation=True,
                    padding='max_length',
return_token_type_ids=True
)
batch_token_ids = torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long)
batch_mask = torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long)
batch_token_type_ids = torch.tensor(inputs["token_type_ids"], device=self.device, dtype=torch.long)
if len(batch_tokens) == 1:
batch_token_ids = batch_token_ids.unsqueeze(0)
batch_mask = batch_mask.unsqueeze(0)
batch_token_type_ids = batch_token_type_ids.unsqueeze(0)
batch = {
'tokens': batch_token_ids,
'mask': batch_mask,
'token_type_ids': batch_token_type_ids,
'labels': batch_labels
}
yield batch
def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
reinitialize=True, stop_epochs=None, show_logs=False):
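        """
        Same training loop as BaseClassifier.fit, except that the forward pass
        also receives the attention mask and token type ids from the tokenizer.
        """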
if reinitialize:
self.n_of_classes = y_train.nunique()
self.initialize_nnet()
self.stop_epochs = stop_epochs
        self.print_test = test_tokens is not None and y_test is not None
train_scores = []
self.train_scores_mean = []
self.test_scores = []
self.test_aucs = []
self.test_f1 = []
criterion = nn.CrossEntropyLoss()
for epoch in tqdm.tqdm(range(self.epochs)):
self.epoch = epoch
self.nnet.train()
train_batches = self.batch_generator(train_tokens, y_train)
test_batches = self.batch_generator(test_tokens, y_test)
for i, batch in tqdm.tqdm(
enumerate(train_batches),
total=len(train_tokens) // self.batch_size
):
pred = self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids'])
loss = criterion(pred, batch['labels'])
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
if show_logs and i % 400 == 0:
train_score = criterion(self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids']),
batch['labels'])
print(train_score.item())
train_scores.append(train_score.item())
if show_logs:
self.train_scores_mean.append(sum(train_scores) / len(train_scores))
train_scores = []
if self.print_test:
test_pred_prob = torch.tensor([], device='cpu')
with torch.no_grad():
self.nnet.eval()
for batch in test_batches:
test_batch_pred_prob = self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids'])
test_batch_pred_prob_cpu = test_batch_pred_prob.to('cpu')
test_pred_prob = torch.cat((test_pred_prob, test_batch_pred_prob_cpu), 0)
test_score = criterion(test_pred_prob, torch.tensor(y_test.values, device='cpu'))
self.test_scores.append(test_score.item())
                test_pred_probas = F.softmax(test_pred_prob, dim=1).detach().cpu().numpy()
self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])
self.print_metrics()
if self.early_stopping_check():
break
    def predict_proba(self, tokens, labels=None):
batches = self.batch_generator(tokens, labels)
pred_probas = torch.tensor([], device=self.device)
with torch.no_grad():
self.nnet.eval()
for batch in batches:
batch_prob = self.nnet(batch['tokens'], batch['mask'],
batch['token_type_ids'])
pred_probas = torch.cat((pred_probas, batch_prob))
        return F.softmax(pred_probas, dim=1).detach().cpu().numpy()
    def predict(self, tokens, labels=None):
return np.argmax(self.predict_proba(tokens, labels), axis=1)
def save_model(self, filepath):
with open(filepath, 'wb') as file:
torch.save(self.nnet.state_dict(), file)
def load_model(self, filepath):
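        # rebuild the binary-classification network and tokenizer, then restore the saved weights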
self.n_of_classes = 2
self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
self.tokenizer = DistilBertTokenizer.from_pretrained(
'DeepPavlov/distilrubert-small-cased-conversational',
do_lower_case=True
)
self.nnet.load_state_dict(torch.load(filepath, map_location=self.device))
class AdClassifier:
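    """
    Wrapper that loads four trained classifiers from weights_folder and applies
    them jointly: DistilBERT models for the 'nationality', 'families' and 'sex'
    labels and a bidirectional GRU model for the 'limit' label.
    """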
def __init__(self, weights_folder, dictionary_path):
self.batch_size = 16
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.pad_idx = 0
self.unk_idx = 1
with open(dictionary_path, 'rb') as file:
self.word_to_idx = pickle.load(file)
self.tokenizer = DistilBertTokenizer.from_pretrained(
'DeepPavlov/distilrubert-small-cased-conversational',
do_lower_case=True
)
nationality_nn_path = os.path.join(weights_folder, 'model_nationality.pt')
families_nn_path = os.path.join(weights_folder, 'model_families.pt')
sex_nn_path = os.path.join(weights_folder, 'model_sex.pt')
limit_nn_path = os.path.join(weights_folder, 'model_limit.pt')
self.nationality_clf = DBERTClassifier()
self.nationality_clf.load_model(nationality_nn_path)
self.families_clf = DBERTClassifier()
self.families_clf.load_model(families_nn_path)
self.sex_clf = DBERTClassifier()
self.sex_clf.load_model(sex_nn_path)
self.limit_clf = RNNClassifier(bidirectional=True)
self.limit_clf.load_model(limit_nn_path, amount_of_words=len(self.word_to_idx))
def index_tokens(self, tokens_string):
return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]
def fill_with_pads(self, tokens):
tokens = deepcopy(tokens)
max_len = 0
for tokens_string in tokens:
max_len = max(max_len, len(tokens_string))
for tokens_string in tokens:
for i in range(len(tokens_string), max_len):
tokens_string.append(self.pad_idx)
return tokens
def as_matrix(self, tokens):
tokens = deepcopy(tokens)
for j, s in enumerate(tokens):
tokens[j] = self.index_tokens(s)
tokens = self.fill_with_pads(tokens)
return tokens
def batch_generator(self, tokens):
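        # each batch is encoded twice: with the DistilBERT tokenizer for the BERT models
        # and as padded vocabulary indices for the GRU model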
for i in range(0, len(tokens), self.batch_size):
batch_tokens = tokens[i: i + self.batch_size]
batch_tokens = [' '.join(s) for s in batch_tokens]
inputs = self.tokenizer.batch_encode_plus(
batch_tokens,
add_special_tokens=True,
max_length=512,
truncation=True,
                padding='max_length',
return_token_type_ids=True
)
batch_token_ids = torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long)
batch_mask = torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long)
batch_token_type_ids = torch.tensor(inputs['token_type_ids'], device=self.device, dtype=torch.long)
batch_tokens_rnn = tokens[i: i + self.batch_size]
batch_tokens_rnn_ids = torch.tensor(self.as_matrix(batch_tokens_rnn),
dtype=torch.int,
device=self.device)
batch = {
'tokens': batch_token_ids,
'mask': batch_mask,
'token_type_ids': batch_token_type_ids,
'tokens_rnn': batch_tokens_rnn_ids
}
yield batch
def predict_probas(self, tokens):
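        # run every batch through all four networks and return a dict of softmax
        # probability arrays keyed by task name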
batches = self.batch_generator(tokens)
pred_probas = {'nationality': torch.tensor([], device=self.device),
'families': torch.tensor([], device=self.device),
'sex': torch.tensor([], device=self.device),
'limit': torch.tensor([], device=self.device)}
batch_probas = dict()
with torch.no_grad():
self.nationality_clf.nnet.eval()
self.families_clf.nnet.eval()
self.sex_clf.nnet.eval()
self.limit_clf.nnet.eval()
for batch in batches:
batch_probas['nationality'] = self.nationality_clf.nnet(batch['tokens'], batch['mask'],
batch['token_type_ids'])
batch_probas['families'] = self.families_clf.nnet(batch['tokens'], batch['mask'],
batch['token_type_ids'])
batch_probas['sex'] = self.sex_clf.nnet(batch['tokens'], batch['mask'],
batch['token_type_ids'])
batch_probas['limit'] = self.limit_clf.nnet(batch['tokens_rnn'])
for batch_prob_label in batch_probas:
pred_probas[batch_prob_label] = torch.cat((pred_probas[batch_prob_label],
batch_probas[batch_prob_label]))
for pred_prob_label in pred_probas:
            pred_probas[pred_prob_label] = F.softmax(pred_probas[pred_prob_label],
                                                     dim=1).detach().cpu().numpy()
return pred_probas
def predict_labels(self, tokens):
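        # apply a per-task decision threshold to the positive-class probability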
predicted_probas = self.predict_probas(tokens)
predicted_labels = dict()
thresholds = {
'nationality': 0.75,
'families': 0.7,
'sex': 0.25,
'limit': 0.42
}
for label in predicted_probas:
predicted_labels[label] = predicted_probas[label][:, 1] >= thresholds[label]
return predicted_labels
def save_model(self, filepath):
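        # serialize the whole wrapper (networks, tokenizers and vocabulary) with torch.save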
with open(filepath, 'wb') as file:
torch.save(self, file)