from argparse import Namespace

from torch.utils.checkpoint import checkpoint

from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast

from open_lm.utils.transformers.hf_config import OpenLMConfig
from open_lm.model import Transformer, create_params
from open_lm.attention import get_attn_func, xformers_attn, torch_attn
from open_lm.norms import get_norm_class

import torch
import torch.nn as nn

from typing import Union, Tuple, Optional, List

import os


class OpenLMModel(PreTrainedModel):
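    """Hugging Face ``PreTrainedModel`` wrapper around open_lm's ``Transformer``."""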
    config_class = OpenLMConfig

    def __init__(self, config, **kwargs):
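        # The config either carries ready-made model params, or the raw argument dict
        # from which they are built (resolving the norm and attention classes first).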
        if hasattr(config, "params"):
            params = config.params
        else:
            params_args_dict = config.params_args_dict
            if not params_args_dict.get("norm_type"):
                params_args_dict["norm_type"] = get_norm_class(params_args_dict["model_norm"])
            if not params_args_dict.get("attn_func"):
                params_args_dict["attn_func"] = get_attn_func(
                    params_args_dict["attn_name"],
                    params_args_dict["attn_activation"],
                    params_args_dict["attn_seq_scalar"],
                    params_args_dict["attn_seq_scalar_alpha"],
                )
            params = create_params(Namespace(**config.params_args_dict))
        config.set_params(params)
        super().__init__(config, **kwargs)

        self.supports_gradient_checkpointing = True
        self.model = Transformer(params)

    @property
    def gradient_checkpointing(self):
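        """Proxy the underlying open_lm Transformer's ``grad_checkpointing`` flag."""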
        return self.model.grad_checkpointing

    @gradient_checkpointing.setter
    def gradient_checkpointing(self, value):
        self.model.grad_checkpointing = value

    def forward(self, input_ids=None, inputs_embeds=None, **kwargs):
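        # Delegate directly to the wrapped open_lm Transformer.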
        return self.model(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)


class OpenLMforCausalLM(OpenLMModel):
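    """OpenLM model with a causal language modeling interface: loss computation, generation helpers, and checkpoint loading."""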
    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
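        # open_lm's Transformer owns the output projection, so no separate lm_head module is created here.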
        self.lm_head = None

        self.post_init()

    def get_input_embeddings(self):
        return self.model.tok_embeddings

    def set_input_embeddings(self, value):
        self.model.tok_embeddings = value

    def get_output_embeddings(self):
        return self.model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        raise NotImplementedError

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer

        >>> model = OpenLMforCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        assert position_ids is None, "Position IDs are not supported"

        logits, _, past_key_values = self.model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            attention_mask=attention_mask,
        )

        loss = None
        if labels is not None:
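            # Standard causal LM loss: predict token t+1 from tokens up to t, so shift
            # logits left and labels right by one position before the cross-entropy.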
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
            shift_labels = shift_labels.view(-1).to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        output = CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, loss=loss)
        return output

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
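        # With a KV cache, only the tokens that are not yet in the cache need to be passed to the model.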
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[1]

            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]
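        # `inputs_embeds` are only usable on the first forward pass; once a cache exists,
        # generation proceeds from the newly produced input ids.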
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
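        # Reorder the cached key/value tensors along the batch dimension to follow the selected beam indices.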
        reordered_cache = ()
        for layer_past in past_key_values:
            reordered_cache += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_cache

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
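        """Load weights from an open_lm training checkpoint (``config.checkpoint_file``), from a local
        directory containing ``pytorch_model.bin``, or through the standard Hugging Face loading path."""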
        if (
            os.path.isdir(pretrained_model_name_or_path)
            and kwargs.get("config", None) is not None
            and getattr(kwargs["config"], "checkpoint_file", None) is not None
        ):
            torch_dtype = getattr(kwargs["config"], "torch_dtype", None)
            if isinstance(torch_dtype, str):
                torch_dtype = getattr(torch, torch_dtype)
            if torch_dtype is not None:
                torch.set_default_dtype(torch_dtype)

            print("Loading checkpoint from file specified by config.checkpoint_file")
            checkpoint_path = kwargs["config"].checkpoint_file
            checkpoint = torch.load(checkpoint_path)

            state_dict = checkpoint["state_dict"]
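            # Strip the DistributedDataParallel "module." prefix and add the wrapper's "model." prefix
            # so the keys line up with this class's parameter names.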
            state_dict = {x.replace("module.", ""): y for x, y in state_dict.items()}
            state_dict = {f"model.{x}": y for x, y in state_dict.items()}

            return super().from_pretrained(None, state_dict=state_dict, **kwargs)
        elif os.path.isdir(pretrained_model_name_or_path):
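            # A plain local directory: load the HF-style pytorch_model.bin it contains.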
            print("Loading checkpoint from directory")
            checkpoint_path = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
            state_dict = torch.load(checkpoint_path)

            state_dict = {f"model.{x}" if "model." not in x else x: y for x, y in state_dict.items()}

            return super().from_pretrained(pretrained_model_name_or_path, state_dict=state_dict, **kwargs)
        else:
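            # Anything else (e.g. a hub model id) goes through the default Hugging Face loading path.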
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)