# beam_retriever_unofficial / sample_loading.py
import math
import random

import torch
import torch.nn as nn
import torch.utils.checkpoint  # used when gradient checkpointing is enabled
from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
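

# config.use_focal switches the per-hop loss to a focal loss, but the original
# repository's FocalLoss utility is not shipped in this file. The class below is
# a minimal sketch of a standard focal loss (cross entropy scaled by (1 - p_t) ** gamma);
# gamma=2.0 and the optional alpha weights are assumptions, not the authors' settings.
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha  # optional per-class weight tensor
        self.reduction = reduction

    def forward(self, logits, targets):
        # per-sample cross entropy, then down-weight easy (high-confidence) examples
        ce = nn.functional.cross_entropy(
            logits, targets, weight=self.alpha, reduction="none"
        )
        p_t = torch.exp(-ce)
        loss = (1.0 - p_t) ** self.gamma * ce
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss
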
class RetrieverConfig(PretrainedConfig):
model_type = "retriever"
def __init__(
self,
encoder_model_name="microsoft/deberta-v3-large",
max_seq_len=512,
mean_passage_len=70,
beam_size=1,
gradient_checkpointing=False,
use_label_order=False,
use_negative_sampling=False,
use_focal=False,
use_early_stop=True,
**kwargs
):
super().__init__(**kwargs)
self.encoder_model_name = encoder_model_name
self.max_seq_len = max_seq_len
self.mean_passage_len = mean_passage_len
self.beam_size = beam_size
self.gradient_checkpointing = gradient_checkpointing
self.use_label_order = use_label_order
self.use_negative_sampling = use_negative_sampling
self.use_focal = use_focal
self.use_early_stop = use_early_stop
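# Example (hedged): overriding a couple of defaults; beam_size=2 here is only an
# illustration, not a value taken from the original repository.
#   config = RetrieverConfig(encoder_model_name="microsoft/deberta-v3-large", beam_size=2)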
class Retriever(PreTrainedModel):
config_class = RetrieverConfig
def __init__(self, config):
super().__init__(config)
        encoder_config = AutoConfig.from_pretrained(config.encoder_model_name)
        self.encoder = AutoModel.from_pretrained(
            config.encoder_model_name, config=encoder_config
        )
        self.hop_classifier_layer = nn.Linear(encoder_config.hidden_size, 2)
        self.hop_n_classifier_layer = nn.Linear(encoder_config.hidden_size, 2)
        # forward() and get_negative_sampling_results() read these hyperparameters
        # from self, so mirror them from the config.
        self.max_seq_len = config.max_seq_len
        self.beam_size = config.beam_size
        self.gradient_checkpointing = config.gradient_checkpointing
        self.use_label_order = config.use_label_order
        self.use_negative_sampling = config.use_negative_sampling
        self.use_focal = config.use_focal
        self.use_early_stop = config.use_early_stop
        if config.gradient_checkpointing:
            self.encoder.gradient_checkpointing_enable()
        # Initialize weights and apply final processing
        self.post_init()
def get_negative_sampling_results(self, context_ids, current_preds, sf_idx):
closest_power_of_2 = 2 ** math.floor(math.log2(self.beam_size))
powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
slopes = torch.pow(0.5, powers)
each_sampling_nums = [max(1, int(len(context_ids) * item)) for item in slopes]
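        # e.g. with beam_size=4 and 10 candidate passages: slopes = [0.5, 0.25, 0.125, 0.0625],
        # so the per-beam negative-sample budgets are [5, 2, 1, 1]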
last_pred_idx = set()
sampled_set = {}
for i in range(self.beam_size):
last_pred_idx.add(current_preds[i][-1])
sampled_set[i] = []
for j in range(len(context_ids)):
if j in current_preds[i] or j in last_pred_idx:
continue
if set(current_preds[i] + [j]) == set(sf_idx):
continue
sampled_set[i].append(j)
random.shuffle(sampled_set[i])
sampled_set[i] = sampled_set[i][: each_sampling_nums[i]]
return sampled_set
def forward(self, q_codes, c_codes, sf_idx, hop=0):
"""
hop predefined
"""
device = q_codes[0].device
total_loss = torch.tensor(0.0, device=device, requires_grad=True)
        # input ids of the question plus the passages kept from the previous hop
last_prediction = None
pre_question_ids = None
loss_function = nn.CrossEntropyLoss()
focal_loss_function = None
if self.use_focal:
focal_loss_function = FocalLoss()
question_ids = q_codes[0]
context_ids = c_codes[0]
current_preds = []
if self.training:
sf_idx = sf_idx[0]
sf = sf_idx
hops = len(sf)
else:
hops = hop if hop > 0 else len(sf_idx[0])
if len(context_ids) <= hops or hops < 1:
return {"current_preds": [list(range(hops))], "loss": total_loss}
mean_passage_len = (self.max_seq_len - 2 - question_ids.shape[-1]) // hops
for idx in range(hops):
if idx == 0:
# first hop
qp_len = [
min(
self.max_seq_len - 2 - (hops - 1 - idx) * mean_passage_len,
question_ids.shape[-1] + c.shape[-1],
)
for c in context_ids
]
next_question_ids = []
hop1_qp_ids = torch.zeros(
[len(context_ids), max(qp_len) + 2], device=device, dtype=torch.long
)
hop1_qp_attention_mask = torch.zeros(
[len(context_ids), max(qp_len) + 2], device=device, dtype=torch.long
)
if self.training:
hop1_label = torch.zeros(
[len(context_ids)], dtype=torch.long, device=device
)
for i in range(len(context_ids)):
this_question_ids = torch.cat((question_ids, context_ids[i]))[
: qp_len[i]
]
hop1_qp_ids[i, 1 : qp_len[i] + 1] = this_question_ids.view(-1)
hop1_qp_ids[i, 0] = self.config.cls_token_id
hop1_qp_ids[i, qp_len[i] + 1] = self.config.sep_token_id
hop1_qp_attention_mask[i, : qp_len[i] + 1] = 1
if self.training:
if self.use_label_order:
if i == sf_idx[0]:
hop1_label[i] = 1
else:
if i in sf_idx:
hop1_label[i] = 1
next_question_ids.append(this_question_ids)
hop1_encoder_outputs = self.encoder(
input_ids=hop1_qp_ids, attention_mask=hop1_qp_attention_mask
)[0][
:, 0, :
] # [doc_num, hidden_size]
if self.training and self.gradient_checkpointing:
hop1_projection = torch.utils.checkpoint.checkpoint(
self.hop_classifier_layer, hop1_encoder_outputs
) # [doc_num, 2]
else:
hop1_projection = self.hop_classifier_layer(
hop1_encoder_outputs
) # [doc_num, 2]
if self.training:
total_loss = total_loss + loss_function(hop1_projection, hop1_label)
_, hop1_pred_documents = hop1_projection[:, 1].topk(
self.beam_size, dim=-1
)
last_prediction = (
hop1_pred_documents # used for taking new_question_ids
)
pre_question_ids = next_question_ids
current_preds = [
[item.item()] for item in hop1_pred_documents
                ]  # used for taking the original passage index of the current passage
else:
# set up the vectors outside the beam_size loop
qp_len_total = {}
max_qp_len = 0
last_pred_idx = set()
if self.training:
# stop predicting if the current hop's predictions are wrong
flag = False
for i in range(self.beam_size):
if self.use_label_order:
if current_preds[i][-1] == sf_idx[idx - 1]:
flag = True
break
else:
if set(current_preds[i]) == set(sf_idx[:idx]):
flag = True
break
if not flag and self.use_early_stop:
break
for i in range(self.beam_size):
                    # expand the search space; self.beam_size beams are kept from the last hop
                    pred_doc = last_prediction[i]
                    # avoid iterating over a duplicate passage, e.g. it should be 9+8 instead of 9+9
                    last_pred_idx.add(current_preds[i][-1])
new_question_ids = pre_question_ids[pred_doc]
qp_len = {}
# obtain the sequence length which can be formed into the vector
for j in range(len(context_ids)):
if j in current_preds[i] or j in last_pred_idx:
continue
qp_len[j] = min(
self.max_seq_len - 2 - (hops - 1 - idx) * mean_passage_len,
new_question_ids.shape[-1] + context_ids[j].shape[-1],
)
max_qp_len = max(max_qp_len, qp_len[j])
qp_len_total[i] = qp_len
if len(qp_len_total) < 1:
# skip if all the predictions in the last hop are wrong
break
if self.use_negative_sampling and self.training:
# deprecated
current_sf = [sf_idx[idx]] if self.use_label_order else sf_idx
sampled_set = self.get_negative_sampling_results(
context_ids, current_preds, sf_idx[: idx + 1]
)
vector_num = 1
for k in range(self.beam_size):
vector_num += len(sampled_set[k])
else:
vector_num = sum([len(v) for k, v in qp_len_total.items()])
# set up the vectors
hop_qp_ids = torch.zeros(
[vector_num, max_qp_len + 2], device=device, dtype=torch.long
)
hop_qp_attention_mask = torch.zeros(
[vector_num, max_qp_len + 2], device=device, dtype=torch.long
)
if self.training:
hop_label = torch.zeros(
[vector_num], dtype=torch.long, device=device
)
vec_idx = 0
pred_mapping = []
next_question_ids = []
last_pred_idx = set()
for i in range(self.beam_size):
                    # expand the search space; self.beam_size beams are kept from the last hop
                    pred_doc = last_prediction[i]
                    # avoid iterating over a duplicate passage, e.g. it should be 9+8 instead of 9+9
                    last_pred_idx.add(current_preds[i][-1])
new_question_ids = pre_question_ids[pred_doc]
for j in range(len(context_ids)):
if j in current_preds[i] or j in last_pred_idx:
continue
if self.training and self.use_negative_sampling:
if j not in sampled_set[i] and not (
set(current_preds[i] + [j]) == set(sf_idx[: idx + 1])
):
continue
# shuffle the order between documents
pre_context_ids = (
new_question_ids[question_ids.shape[-1] :].clone().detach()
)
context_list = [pre_context_ids, context_ids[j]]
if self.training:
random.shuffle(context_list)
this_question_ids = torch.cat(
(
question_ids,
torch.cat((context_list[0], context_list[1])),
)
)[: qp_len_total[i][j]]
next_question_ids.append(this_question_ids)
hop_qp_ids[
vec_idx, 1 : qp_len_total[i][j] + 1
] = this_question_ids
hop_qp_ids[vec_idx, 0] = self.config.cls_token_id
hop_qp_ids[
vec_idx, qp_len_total[i][j] + 1
] = self.config.sep_token_id
hop_qp_attention_mask[vec_idx, : qp_len_total[i][j] + 1] = 1
                        if self.training:
                            # With or without negative sampling, a candidate is
                            # positive when the chain so far matches the gold chain.
                            if set(current_preds[i] + [j]) == set(sf_idx[: idx + 1]):
                                hop_label[vec_idx] = 1
pred_mapping.append(current_preds[i] + [j])
vec_idx += 1
assert len(pred_mapping) == hop_qp_ids.shape[0]
hop_encoder_outputs = self.encoder(
input_ids=hop_qp_ids, attention_mask=hop_qp_attention_mask
)[0][
:, 0, :
] # [vec_num, hidden_size]
                # a single shared classifier scores every hop after the first
                hop_projection_func = self.hop_n_classifier_layer
if self.training and self.gradient_checkpointing:
hop_projection = torch.utils.checkpoint.checkpoint(
hop_projection_func, hop_encoder_outputs
) # [vec_num, 2]
else:
hop_projection = hop_projection_func(
hop_encoder_outputs
) # [vec_num, 2]
if self.training:
if not self.use_focal:
total_loss = total_loss + loss_function(
hop_projection, hop_label
)
else:
total_loss = total_loss + focal_loss_function(
hop_projection, hop_label
)
_, hop_pred_documents = hop_projection[:, 1].topk(
self.beam_size, dim=-1
)
last_prediction = hop_pred_documents
pre_question_ids = next_question_ids
current_preds = [
pred_mapping[hop_pred_documents[i].item()]
for i in range(self.beam_size)
]
res = {"current_preds": current_preds, "loss": total_loss}
return res
@staticmethod
def convert_from_torch_state_dict_to_hf(
state_dict_path, hf_checkpoint_path, config
):
"""
Converts a PyTorch state dict to a Hugging Face pretrained checkpoint.
:param state_dict_path: Path to the PyTorch state dict file.
:param hf_checkpoint_path: Path where the Hugging Face checkpoint will be saved.
:param config: An instance of RetrieverConfig or a dictionary for the model's configuration.
"""
# Load the configuration
if isinstance(config, dict):
config = RetrieverConfig(**config)
# Initialize the model
model = Retriever(config)
# Load the state dict
        state_dict = torch.load(state_dict_path, map_location="cpu")  # allow conversion without a GPU
model.load_state_dict(state_dict)
# Save as a Hugging Face checkpoint
model.save_pretrained(hf_checkpoint_path)
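
    # Example (hedged; the paths below are placeholders):
    #   Retriever.convert_from_torch_state_dict_to_hf(
    #       "retriever_state_dict.pt", "./beam_retriever_hf", RetrieverConfig()
    #   )
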
@staticmethod
def save_encoder_to_hf(state_dict_path, hf_checkpoint_path, config):
"""
Saves only the encoder part of the model to a specified Hugging Face checkpoint path.
:param model: An instance of the Retriever model.
:param hf_checkpoint_path: Path where the encoder checkpoint will be saved on Hugging Face.
"""
# Load the configuration
if isinstance(config, dict):
config = RetrieverConfig(**config)
# Initialize the model
model = Retriever(config)
# Load the state dict
        state_dict = torch.load(state_dict_path, map_location="cpu")  # allow conversion without a GPU
model.load_state_dict(state_dict)
# Extract the encoder
encoder = model.encoder
# Save the encoder using Hugging Face's save_pretrained method
encoder.save_pretrained(hf_checkpoint_path)
model = Retriever.from_pretrained("scholarly-shadows-syndicate/beam_retriever_unofficial")
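

# --- Hedged usage sketch -----------------------------------------------------
# Assumptions: the Hub checkpoint above loads as shown, no tokenizer is bundled
# with it (so we reuse the encoder's tokenizer), and the question/passages are
# made-up illustrations.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
# forward() reads CLS/SEP ids from the config; copy them from the tokenizer in
# case the saved config.json does not already carry them.
model.config.cls_token_id = tokenizer.cls_token_id
model.config.sep_token_id = tokenizer.sep_token_id

question = "Which country is the director of Titanic from?"
passages = [
    "Titanic was directed by James Cameron.",
    "James Cameron is a Canadian filmmaker.",
    "The Eiffel Tower is located in Paris.",
]

# forward() expects raw token ids without special tokens; [CLS]/[SEP] are added internally.
q_codes = [torch.tensor(tokenizer.encode(question, add_special_tokens=False))]
c_codes = [[torch.tensor(tokenizer.encode(p, add_special_tokens=False)) for p in passages]]

model.eval()
with torch.no_grad():
    out = model(q_codes, c_codes, sf_idx=[[]], hop=2)
print("predicted passage chains:", out["current_preds"])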