'''
Initial Code taken from SemSup Repository.
'''
import torch
from torch import nn
import sys
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaForSequenceClassification
from transformers import XLNetForSequenceClassification
# Import configs
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.models.bert.configuration_bert import BertConfig
import numpy as np
# Loss functions
from torch.nn import BCEWithLogitsLoss
from typing import Optional, Union, Tuple, Dict, List
import itertools
MODEL_FOR_SEMANTIC_EMBEDDING = {
"roberta": "RobertaForSemanticEmbedding",
"bert": "BertForSemanticEmbedding",
}
MODEL_TO_CONFIG = {
"roberta": RobertaConfig,
"bert": BertConfig,
}
def getLabelModel(data_args, model_args):
tokenizer = AutoTokenizer.from_pretrained(
model_args.label_model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModel.from_pretrained(
model_args.label_model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return model, tokenizer
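# Example call to getLabelModel (a minimal sketch): `model_args` below is a hypothetical
# namespace, not an object defined in this file; only the attribute names that
# getLabelModel actually reads are assumed, and `data_args` is unused here.
#
#   from types import SimpleNamespace
#   model_args = SimpleNamespace(
#       label_model_name_or_path='bert-base-uncased',
#       cache_dir=None,
#       use_fast_tokenizer=True,
#       model_revision='main',
#       use_auth_token=False,
#   )
#   label_model, label_tokenizer = getLabelModel(data_args=None, model_args=model_args)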
class AutoModelForMultiLabelClassification:
"""
Class for choosing the right model class automatically.
Loosely based on AutoModel classes in HuggingFace.
"""
@staticmethod
def from_pretrained(*args, **kwargs):
# Check what type of model it is
for key in MODEL_TO_CONFIG.keys():
if type(kwargs['config']) == MODEL_TO_CONFIG[key]:
class_name = getattr(sys.modules[__name__], MODEL_FOR_SEMANTIC_EMBEDDING[key])
return class_name.from_pretrained(*args, **kwargs)
# If none of the models were chosen
raise("This model type is not supported. Please choose one of {}".format(MODEL_FOR_SEMANTIC_EMBEDDING.keys()))
class BertForSemanticEmbedding(nn.Module):
def __init__(self, config):
# super().__init__(config)
super().__init__()
self.config = config
self.coil = config.coil
if self.coil:
assert config.arch_type == 2
self.token_dim = config.token_dim
        # Defaults handle the ongoing hyper-parameter search experiments, whose configs may omit these fields.
        self.arch_type = getattr(config, 'arch_type', 2)
        self.colbert = getattr(config, 'colbert', False)
if config.encoder_model_type == 'bert':
# self.encoder = BertModel(config)
if self.arch_type == 1:
self.encoder = AutoModelForSequenceClassification.from_pretrained(
'bert-base-uncased', output_hidden_states = True)
else:
self.encoder = AutoModel.from_pretrained(
config.model_name_or_path
)
# self.encoder = AutoModelForSequenceClassification.from_pretrained(
# 'bert-base-uncased', output_hidden_states = True).bert
elif config.encoder_model_type == 'roberta':
self.encoder = RobertaForSequenceClassification.from_pretrained(
'roberta-base', num_labels = config.num_labels, output_hidden_states = True)
elif config.encoder_model_type == 'xlnet':
self.encoder = XLNetForSequenceClassification.from_pretrained(
'xlnet-base-cased', num_labels = config.num_labels, output_hidden_states = True)
print('Config is', config)
if config.negative_sampling == 'none':
if config.arch_type == 1:
self.fc1 = nn.Linear(5 * config.hidden_size, 512 if config.semsup else config.num_labels)
elif self.arch_type == 3:
self.fc1 = nn.Linear(config.hidden_size, 256 if config.semsup else config.num_labels)
if self.coil:
self.tok_proj = nn.Linear(self.encoder.config.hidden_size, self.token_dim)
self.dropout = nn.Dropout(0.1)
self.candidates_topk = 10
if config.negative_sampling != 'none':
self.group_y = np.array([np.array([l for l in group]) for group in config.group_y])
#np.load('datasets/EUR-Lex/label_group_lightxml_0.npy', allow_pickle=True)
self.negative_sampling = config.negative_sampling
self.min_positive_samples = 20
self.semsup = config.semsup
self.label_projection = None
if self.semsup:# and config.hidden_size != config.label_hidden_size:
if self.arch_type == 1:
self.label_projection = nn.Linear(512, config.label_hidden_size, bias= False)
elif self.arch_type == 2:
self.label_projection = nn.Linear(self.encoder.config.hidden_size, config.label_hidden_size, bias= False)
elif self.arch_type == 3:
self.label_projection = nn.Linear(256, config.label_hidden_size, bias= False)
# self.post_init()
def compute_tok_score_cart(self, doc_reps, doc_input_ids, qry_reps, qry_input_ids, qry_attention_mask):
if not self.colbert:
qry_input_ids = qry_input_ids.unsqueeze(2).unsqueeze(3) # Q * LQ * 1 * 1
doc_input_ids = doc_input_ids.unsqueeze(0).unsqueeze(1) # 1 * 1 * D * LD
exact_match = doc_input_ids == qry_input_ids # Q * LQ * D * LD
exact_match = exact_match.float()
scores_no_masking = torch.matmul(
qry_reps.view(-1, self.token_dim), # (Q * LQ) * d
doc_reps.view(-1, self.token_dim).transpose(0, 1) # d * (D * LD)
)
scores_no_masking = scores_no_masking.view(
*qry_reps.shape[:2], *doc_reps.shape[:2]) # Q * LQ * D * LD
if self.colbert:
scores, _ = scores_no_masking.max(dim=3)
else:
scores, _ = (scores_no_masking * exact_match).max(dim=3) # Q * LQ * D
tok_scores = (scores * qry_attention_mask.reshape(-1, qry_attention_mask.shape[-1]).unsqueeze(2))[:, 1:].sum(1)
return tok_scores
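    # A minimal, self-contained sketch of the exact-match COIL scoring implemented above,
    # using random tensors in place of real encoder outputs. The shapes follow the comments
    # in compute_tok_score_cart (Q label descriptions of length LQ, D documents of length LD,
    # token dimension d); attention masking and the [CLS]-token drop are omitted for brevity.
    # The helper name is illustrative and is not used anywhere else in this module.
    @staticmethod
    def _coil_scoring_sketch():
        Q, LQ, D, LD, d = 2, 4, 3, 5, 8
        qry_reps = torch.randn(Q, LQ, d)
        doc_reps = torch.randn(D, LD, d)
        qry_input_ids = torch.randint(0, 10, (Q, LQ))
        doc_input_ids = torch.randint(0, 10, (D, LD))
        # Q * LQ * D * LD mask: 1 where a query token id exactly matches a document token id
        exact_match = (doc_input_ids.unsqueeze(0).unsqueeze(1) == qry_input_ids.unsqueeze(2).unsqueeze(3)).float()
        scores = torch.matmul(qry_reps.view(-1, d), doc_reps.view(-1, d).transpose(0, 1)).view(Q, LQ, D, LD)
        scores, _ = (scores * exact_match).max(dim=3)  # best matching document token per query token
        return scores.sum(1)  # Q * D token-level relevance scores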
def coil_eval_forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
desc_input_ids = None,
desc_attention_mask = None,
lab_reps = None,
label_embeddings = None,
clustered_input_ids = None,
clustered_desc_ids = None,
):
outputs_doc, logits = self.forward_input_encoder(input_ids, attention_mask, token_type_ids)
doc_reps = self.tok_proj(outputs_doc.last_hidden_state) # D * LD * d
# lab_reps = self.tok_proj(outputs_lab.last_hidden_state @ self.label_projection.weight) # Q * LQ * d
if clustered_input_ids is None:
tok_scores = self.compute_tok_score_cart(
doc_reps, input_ids,
lab_reps, desc_input_ids.reshape(-1, desc_input_ids.shape[-1]), desc_attention_mask
)
else:
tok_scores = self.compute_tok_score_cart(
doc_reps, clustered_input_ids,
lab_reps, clustered_desc_ids.reshape(-1, clustered_desc_ids.shape[-1]), desc_attention_mask
)
logits = self.semsup_forward(logits, label_embeddings.reshape(desc_input_ids.shape[0], desc_input_ids.shape[1], -1).contiguous(), same_labels= True)
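        # tok_scores has shape (total label descriptions, batch): every document in the batch was
        # scored against the full flattened set of descriptions. The loop below keeps only the
        # block-diagonal entries, i.e. the scores of document i against its own `stride` label
        # descriptions, and adds them to the dense logits.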
new_tok_scores = torch.zeros(logits.shape, device = logits.device)
for i in range(tok_scores.shape[1]):
stride = tok_scores.shape[0]//tok_scores.shape[1]
new_tok_scores[i] = tok_scores[i*stride: i*stride + stride ,i]
logits += new_tok_scores.contiguous()
return logits
def coil_forward(self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
desc_input_ids: Optional[List[int]] = None,
desc_attention_mask: Optional[List[int]] = None,
desc_inputs_embeds: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
clustered_input_ids = None,
clustered_desc_ids = None,
ignore_label_embeddings_and_out_lab = None,
):
# print(desc_input_ids.shape, desc_attention_mask.shape, desc_inputs_embeds.shape)
outputs_doc, logits = self.forward_input_encoder(input_ids, attention_mask, token_type_ids)
        if ignore_label_embeddings_and_out_lab is not None:
            # Reuse the precomputed label encoder outputs and embeddings passed by the caller.
            outputs_lab, label_embeddings = ignore_label_embeddings_and_out_lab
else:
outputs_lab, label_embeddings, _, _ = self.forward_label_embeddings(None, None, desc_input_ids = desc_input_ids, desc_attention_mask = desc_attention_mask, return_hidden_states = True, desc_inputs_embeds = desc_inputs_embeds)
doc_reps = self.tok_proj(outputs_doc.last_hidden_state) # D * LD * d
lab_reps = self.tok_proj(outputs_lab.last_hidden_state @ self.label_projection.weight) # Q * LQ * d
if clustered_input_ids is None:
tok_scores = self.compute_tok_score_cart(
doc_reps, input_ids,
lab_reps, desc_input_ids.reshape(-1, desc_input_ids.shape[-1]), desc_attention_mask
)
else:
tok_scores = self.compute_tok_score_cart(
doc_reps, clustered_input_ids,
lab_reps, clustered_desc_ids.reshape(-1, clustered_desc_ids.shape[-1]), desc_attention_mask
)
logits = self.semsup_forward(logits, label_embeddings.reshape(desc_input_ids.shape[0], desc_input_ids.shape[1], -1).contiguous(), same_labels= True)
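        # As in coil_eval_forward: keep only the block-diagonal of tok_scores, i.e. the scores
        # pairing each document with its own label descriptions, and add them to the logits.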
new_tok_scores = torch.zeros(logits.shape, device = logits.device)
for i in range(tok_scores.shape[1]):
stride = tok_scores.shape[0]//tok_scores.shape[1]
new_tok_scores[i] = tok_scores[i*stride: i*stride + stride ,i]
logits += new_tok_scores.contiguous()
loss_fn = BCEWithLogitsLoss()
loss = loss_fn(logits, labels)
if not return_dict:
output = (logits,) + outputs_doc[2:] + (logits,)
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs_doc.hidden_states,
attentions=outputs_doc.attentions,
)
def semsup_forward(self, input_embeddings, label_embeddings, num_candidates = -1, list_to_set_mapping = None, same_labels = False):
        '''
        If same_labels is True, label_embeddings already holds one (num_candidates, hidden) matrix
        per example, and the logits come from a single batched matrix multiplication.
        Otherwise, num_candidates must not be -1 and list_to_set_mapping must not be None:
        each example is scored against its own candidate slice of the de-duplicated label embeddings.
        '''
if same_labels:
logits = torch.bmm(input_embeddings.unsqueeze(1), label_embeddings.transpose(2,1)).squeeze(1)
else:
# TODO: Can we optimize this? Perhaps torch.bmm?
logits = torch.stack(
# For each batch point, calculate corresponding product with label embeddings
[
logit @ label_embeddings[list_to_set_mapping[i*num_candidates: (i+1) * num_candidates]].T for i,logit in enumerate(input_embeddings)
]
)
return logits
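    # Shape sketch for the same_labels=True path (B = batch size, C = candidate labels per
    # example, H = label hidden size): input_embeddings is (B, H), label_embeddings is
    # (B, C, H), and torch.bmm((B, 1, H), (B, H, C)).squeeze(1) yields logits of shape (B, C).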
def forward_label_embeddings(self, all_candidate_labels, label_desc_ids, desc_input_ids = None, desc_attention_mask = None, desc_inputs_embeds = None, return_hidden_states = False):
        # Given the candidate labels and the index of the description chosen for each label,
        # returns the embeddings of the unique (description, label) pairs, together with a
        # mapping from the flattened candidate list back into that de-duplicated set.
if desc_attention_mask is None:
num_candidates = all_candidate_labels.shape[1]
# Create a set to perform minimal number of operations on common labels
label_desc_ids_list = list(zip(itertools.chain(*label_desc_ids.detach().cpu().tolist()), itertools.chain(*all_candidate_labels.detach().cpu().tolist())))
print('Original Length: ', len(label_desc_ids_list))
label_desc_ids_set = torch.tensor(list(set(label_desc_ids_list)))
print('New Length: ', label_desc_ids_set.shape)
m1 = {tuple(x):i for i, x in enumerate(label_desc_ids_set.tolist())}
list_to_set_mapping = torch.tensor([m1[x] for x in label_desc_ids_list])
descs = [
self.tokenizedDescriptions[self.config.id2label[desc_lb[1].item()]][desc_lb[0]] for desc_lb in label_desc_ids_set
]
label_input_ids = torch.cat([
desc['input_ids'] for desc in descs
])
label_attention_mask = torch.cat([
desc['attention_mask'] for desc in descs
])
label_token_type_ids = torch.cat([
desc['token_type_ids'] for desc in descs
])
label_input_ids = label_input_ids.to(label_desc_ids.device)
label_attention_mask = label_attention_mask.to(label_desc_ids.device)
label_token_type_ids = label_token_type_ids.to(label_desc_ids.device)
label_embeddings = self.label_model(
label_input_ids,
attention_mask=label_attention_mask,
token_type_ids=label_token_type_ids,
).pooler_output
else:
list_to_set_mapping = None
num_candidates = None
if desc_inputs_embeds is not None:
outputs = self.label_model(
inputs_embeds = desc_inputs_embeds.reshape(desc_inputs_embeds.shape[0] * desc_inputs_embeds.shape[1],desc_inputs_embeds.shape[2], desc_inputs_embeds.shape[3]).contiguous(),
attention_mask=desc_attention_mask.reshape(-1, desc_input_ids.shape[-1]).contiguous(),
)
else:
outputs = self.label_model(
desc_input_ids.reshape(-1, desc_input_ids.shape[-1]).contiguous(),
attention_mask=desc_attention_mask.reshape(-1, desc_input_ids.shape[-1]).contiguous(),
)
label_embeddings = outputs.pooler_output
if self.label_projection is not None:
if return_hidden_states:
return outputs, label_embeddings @ self.label_projection.weight, list_to_set_mapping, num_candidates
else:
return label_embeddings @ self.label_projection.weight, list_to_set_mapping, num_candidates
else:
return label_embeddings, list_to_set_mapping, num_candidates
    def forward_input_encoder(self, input_ids, attention_mask, token_type_ids):
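        '''
        Encode the input documents and return (encoder outputs, pooled features/logits).
        arch_type 1: concatenate the [CLS] vectors of the last five hidden layers
                     (LightXML-style pooling), then dropout and fc1.
        arch_type 2: use the encoder's pooled output (outputs[1]) directly.
        arch_type 3: use the pooled output, then dropout and fc1.
        '''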
outputs = self.encoder(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
output_hidden_states=True if self.arch_type == 1 else False,
)
# Currently, method specified in LightXML is used
if self.arch_type in [2,3]:
logits = outputs[1]
elif self.arch_type == 1:
logits = torch.cat([outputs.hidden_states[-i][:, 0] for i in range(1, 5+1)], dim=-1)
if self.arch_type in [1,3]:
logits = self.dropout(logits)
# No Sampling
if self.arch_type in [1,3]:
logits = self.fc1(logits)
return outputs, logits
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cluster_labels: Optional[torch.Tensor] = None,
all_candidate_labels: Optional[torch.Tensor] = None,
label_desc_ids: Optional[List[int]] = None,
desc_inputs_embeds : Optional[torch.Tensor] = None,
desc_input_ids: Optional[List[int]] = None,
desc_attention_mask: Optional[List[int]] = None,
label_embeddings : Optional[torch.Tensor] = None,
clustered_input_ids: Optional[torch.Tensor] = None,
clustered_desc_ids: Optional[torch.Tensor] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
if self.coil:
return self.coil_forward(
input_ids,
attention_mask,
token_type_ids,
labels,
desc_input_ids,
desc_attention_mask,
desc_inputs_embeds,
return_dict,
clustered_input_ids,
clustered_desc_ids,
)
# STEP 2: Forward pass through the input model
outputs, logits = self.forward_input_encoder(input_ids, attention_mask, token_type_ids)
if self.semsup:
if desc_input_ids is None:
all_candidate_labels = torch.arange(labels.shape[1]).repeat((labels.shape[0], 1))
label_embeddings, list_to_set_mapping, num_candidates = self.forward_label_embeddings(all_candidate_labels, label_desc_ids)
logits = self.semsup_forward(logits, label_embeddings, num_candidates, list_to_set_mapping)
else:
label_embeddings, _, _ = self.forward_label_embeddings(None, None, desc_input_ids = desc_input_ids, desc_attention_mask = desc_attention_mask, desc_inputs_embeds = desc_inputs_embeds)
logits = self.semsup_forward(logits, label_embeddings.reshape(desc_input_ids.shape[0], desc_input_ids.shape[1], -1).contiguous(), same_labels= True)
elif label_embeddings is not None:
logits = self.semsup_forward(logits, label_embeddings.contiguous() @ self.label_projection.weight, same_labels= True)
loss_fn = BCEWithLogitsLoss()
loss = loss_fn(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:] + (logits,)
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
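

if __name__ == '__main__':
    # Minimal construction sketch. The attribute names below are the ones read in
    # BertForSemanticEmbedding.__init__; the values are illustrative placeholders,
    # not the settings used in the SemSup experiments.
    from types import SimpleNamespace

    demo_config = SimpleNamespace(
        coil=False,
        colbert=False,
        arch_type=2,
        token_dim=16,
        encoder_model_type='bert',
        model_name_or_path='bert-base-uncased',
        negative_sampling='none',
        semsup=True,
        hidden_size=768,
        label_hidden_size=768,
        num_labels=4,
    )
    model = BertForSemanticEmbedding(demo_config)
    # The label encoder is not created in __init__; it is expected to be attached
    # separately (e.g. via getLabelModel) before forward_label_embeddings is called.
    model.label_model = AutoModel.from_pretrained('bert-base-uncased')
    print(model.label_projection)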