from typing import List, Optional, Tuple, Union

import torch
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.utils import (
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LlamaConfig"

LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:

            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
              cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""

class LlamaForCausalLMWithNumericalEmbedding(LlamaForCausalLM):
    """Llama causal LM whose input embeddings can be overwritten, at chosen token
    positions, with a learned projection of scalar numerical properties."""

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        # Projects a single scalar property into the model's hidden space.
        self.numerical_embedding = torch.nn.Linear(1, config.hidden_size, bias=True)

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        properties: Optional[List[List[float]]] = None,
        properties_index: Optional[List[List[int]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        properties (`List[List[float]]`):
            One list of scalar property values per batch element.
        properties_index (`List[List[int]]`):
            For each batch element, the token positions whose embeddings are replaced by the
            projected property values. Must have the same per-element length as `properties`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
        """
        batch_size, _ = input_ids.size()
        assert len(properties) == batch_size, "The number of property lists must equal the batch size."
        assert len(properties_index) == batch_size, "The number of index lists must equal the batch size."

        # Look up the regular token embeddings, then splice the projected
        # numerical properties in at the requested positions.
        embeddings = self.model.embed_tokens(input_ids)
        for i, (props, props_index) in enumerate(zip(properties, properties_index)):
            assert len(props) == len(props_index), "Each property needs exactly one target position."
            if len(props_index) == 0:
                continue
            # Shape (num_props, 1); match the embedding dtype so the linear
            # projection also works for fp16/bf16 checkpoints.
            props = torch.tensor(props, device=embeddings.device, dtype=embeddings.dtype).unsqueeze(1)
            num_embeds = self.numerical_embedding(props)  # (num_props, hidden_size)
            assert embeddings[i, props_index, :].shape == num_embeds.shape, (
                "The token embeddings and the numerical embeddings must have the same shape."
            )
            embeddings[i, props_index, :] = num_embeds

        # Delegate to the stock LlamaForCausalLM forward, feeding the patched
        # embeddings instead of input_ids.
        return super().forward(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=embeddings,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
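

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the model definition above. The checkpoint
# name, the prompt, and the choice of which token position to overwrite are
# illustrative assumptions; the extra `numerical_embedding` layer is randomly
# initialized by `from_pretrained` and would need fine-tuning before its
# outputs are meaningful.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoTokenizer

    checkpoint = "meta-llama/Llama-2-7b-hf"  # assumption: any Llama checkpoint
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = LlamaForCausalLMWithNumericalEmbedding.from_pretrained(checkpoint)
    model.eval()

    inputs = tokenizer("The measured density is", return_tensors="pt")
    # Hypothetical choice: replace the embedding of the final prompt token
    # with the projection of the scalar property 0.73.
    last_position = inputs["input_ids"].shape[1] - 1

    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            properties=[[0.73]],  # one scalar per overwritten position
            properties_index=[[last_position]],
        )
    print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)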