import os
import warnings  # used by the deprecated head_mask handling further below
from typing import *

import gdown
import numpy as np
import torch
from PIL import Image

import transformers
from transformers.models.t5.modeling_t5 import *
from transformers.models.t5.modeling_t5 import T5Stack
from transformers import T5ForConditionalGeneration, AutoTokenizer

from bertviz import model_view, head_view

from utils.config import Config
from src.feature_extraction import ViT, OCR
from src.image_visualization import plot_attention

_CONFIG_FOR_DOC = "T5Config"
_CHECKPOINT_FOR_DOC = "google-t5/t5-small"
class CustomT5Stack(T5Stack):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        images_embeds=None,
    ):
        # Model parallel
        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
            )

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError(
                    "You have to initialize the model with valid token embeddings"
                )
            inputs_embeds = self.embed_tokens(input_ids)
        # Custom behaviour: on the encoder side, append the precomputed image
        # embeddings after the text embeddings along the sequence dimension and
        # recompute the input shape so the masking logic below sees the full
        # (text + image) sequence.
        if not self.is_decoder and images_embeds is not None:
            inputs_embeds = torch.concat([inputs_embeds, images_embeds], dim=1)
            input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape

        # required mask seq length can be calculated via length of past
        mask_seq_length = (
            past_key_values[0][0].shape[2] + seq_length
            if past_key_values is not None
            else seq_length
        )

        if use_cache is True:
            if not self.is_decoder:
                raise ValueError(
                    f"`use_cache` can only be set to `True` if {self} is used as a decoder"
                )

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        if attention_mask is None:
            attention_mask = torch.ones(
                batch_size, mask_seq_length, device=inputs_embeds.device
            )

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(
            attention_mask, input_shape
        )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = (
                encoder_hidden_states.size()
            )
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
                )
            encoder_extended_attention_mask = self.invert_attention_mask(
                encoder_attention_mask
            )
        else:
            encoder_extended_attention_mask = None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(
            cross_attn_head_mask, self.config.num_layers
        )
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(
            zip(self.block, past_key_values)
        ):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure that attention_mask is always on the same device as hidden_states
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if position_bias is not None:
                    position_bias = position_bias.to(hidden_states.device)
                if encoder_hidden_states is not None:
                    encoder_hidden_states = encoder_hidden_states.to(
                        hidden_states.device
                    )
                if encoder_extended_attention_mask is not None:
                    encoder_extended_attention_mask = (
                        encoder_extended_attention_mask.to(hidden_states.device)
                    )
                if encoder_decoder_position_bias is not None:
                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(
                        hidden_states.device
                    )
                if layer_head_mask is not None:
                    layer_head_mask = layer_head_mask.to(hidden_states.device)
                if cross_attn_layer_head_mask is not None:
                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
                        hidden_states.device
                    )
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.forward,
                    hidden_states,
                    extended_attention_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                    use_cache,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            # We share the position biases between the layers - the first layer store them
            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[
                    4 if output_attentions else 3
                ]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + (
                    present_key_value_state,
                )

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

            # Model Parallel: If it's the last layer for that device, put things on the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )
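
# Shape sketch (illustrative only -- the real embeddings come from the ViT and
# ViT5 tokenizer used by Model below, and the sizes here are assumptions picked
# just to make the bookkeeping concrete). It shows what the custom encoder
# expects: text and image embeddings concatenated along the sequence dimension,
# with an attention mask that already covers both parts.
def _example_multimodal_shapes():
    batch, text_len, image_len, d_model = 1, 160, 197, 768
    text_embeds = torch.zeros(batch, text_len, d_model)
    image_embeds = torch.zeros(batch, image_len, d_model)
    fused = torch.concat([text_embeds, image_embeds], dim=1)  # (1, 357, 768)
    mask = torch.ones(batch, text_len + image_len)            # (1, 357)
    return fused.shape, mask.shape
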
class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        images_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
r""" | |
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): | |
Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., | |
config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for | |
labels in `[0, ..., config.vocab_size]` | |
Returns: | |
Examples: | |
```python | |
>>> from transformers import AutoTokenizer, T5ForConditionalGeneration | |
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") | |
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") | |
>>> # training | |
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids | |
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids | |
>>> outputs = model(input_ids=input_ids, labels=labels) | |
>>> loss = outputs.loss | |
>>> logits = outputs.logits | |
>>> # inference | |
>>> input_ids = tokenizer( | |
... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" | |
... ).input_ids # Batch size 1 | |
>>> outputs = model.generate(input_ids) | |
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | |
>>> # studies have shown that owning a dog is good for you. | |
```""" | |
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask.
        # The upstream __HEAD_MASK_WARNING_MSG constant is not available here (leading-underscore
        # names are skipped by `import *` and would be name-mangled inside this class), so an
        # equivalent message is inlined.
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(
                    "The input argument `head_mask` was split into two arguments `head_mask` and"
                    " `decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,"
                    " but this feature is deprecated. If you do not want to use any"
                    " `decoder_head_mask`, please set `decoder_head_mask = torch.ones(num_layers, num_heads)`.",
                    FutureWarning,
                )
                decoder_head_mask = head_mask
        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                images_embeds=images_embeds,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if (
            labels is not None
            and decoder_input_ids is None
            and decoder_inputs_embeds is None
        ):
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(
                    self.decoder.first_device
                )

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

# Monkey-patch the T5 implementation so that models constructed from here on use
# the multimodal variants defined above, then re-import the name so that
# `T5ForConditionalGeneration` below resolves to the patched class.
transformers.models.t5.modeling_t5.T5Stack = CustomT5Stack
transformers.models.t5.modeling_t5.T5ForConditionalGeneration = (
    CustomT5ForConditionalGeneration
)
transformers.T5ForConditionalGeneration = CustomT5ForConditionalGeneration
from transformers import T5ForConditionalGeneration
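# Quick sanity check (illustrative sketch, not required by the app): after the
# patch above, the re-imported name should resolve to the multimodal class, so
# any subsequent from_pretrained() call builds encoders that accept `images_embeds`.
assert T5ForConditionalGeneration is CustomT5ForConditionalGeneration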

class Model:
    def __init__(self) -> None:
        os.makedirs("storage", exist_ok=True)
        if not os.path.exists("storage/vlsp_transfomer_vietocr.pth"):
            print("DOWNLOADING model")
            gdown.download(
                Config.model_url, output="storage/vlsp_transfomer_vietocr.pth"
            )
        self.vit5_tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
        self.model = T5ForConditionalGeneration.from_pretrained(
            "truong-xuan-linh/VQA-vit5",
            revision=Config.revision,
            output_attentions=True,
        )
        self.model.to(Config.device)
        self.vit = ViT()
        self.ocr = OCR()

    def get_inputs(self, image_dir: str, question: str):
        # VIT
        image_feature, image_mask = self.vit.extraction(image_dir)
        ocr_content, groups_box, paragraph_boxes = self.ocr.extraction(image_dir)
        print("Input: ", question + " " + ocr_content)

        # VIT5
        input_ = self.vit5_tokenizer(
            question + " " + ocr_content,
            padding="max_length",
            truncation=True,
            max_length=Config.question_maxlen + Config.ocr_maxlen,
            return_tensors="pt",
        )
        input_ids = input_.input_ids
        attention_mask = input_.attention_mask
        mask = torch.cat((attention_mask, image_mask), 1)

        return {
            "input_ids": input_ids,
            "attention_mask": mask,
            "images_embeds": image_feature,
        }
    def inference(self, image_dir: str, question: str, explain: bool = False):
        inputs = self.get_inputs(image_dir, question)
        with torch.no_grad():
            input_ids = inputs["input_ids"]
            attention_mask = inputs["attention_mask"]
            images_embeds = inputs["images_embeds"]
            generated_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                images_embeds=images_embeds,
                num_beams=2,
                max_length=Config.answer_maxlen,
            )
        pred_answer = self.vit5_tokenizer.decode(
            generated_ids[0], skip_special_tokens=True
        )
        if not explain:
            return pred_answer, None, None

        with self.vit5_tokenizer.as_target_tokenizer():
            decoder_input_ids = self.vit5_tokenizer(
                pred_answer, return_tensors="pt", add_special_tokens=True
            ).input_ids
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                images_embeds=images_embeds,
                decoder_input_ids=decoder_input_ids,
            )

        encoder_text = self.vit5_tokenizer.convert_ids_to_tokens(input_ids[0])
        decoder_text = self.vit5_tokenizer.convert_ids_to_tokens(decoder_input_ids[0])
        while "<pad>" in encoder_text:
            encoder_text.remove("<pad>")

        text_encoder_attentions = [
            att[:, :, : len(encoder_text), : len(encoder_text)]
            for att in outputs.encoder_attentions
        ]
        text_cross_attentions = [
            att[:, :, :, : len(encoder_text)] for att in outputs.cross_attentions
        ]
        html_output = head_view(
            encoder_attention=text_encoder_attentions,
            decoder_attention=outputs.decoder_attentions,
            cross_attention=text_cross_attentions,
            encoder_tokens=encoder_text,
            decoder_tokens=decoder_text,
            # display_mode="light",
            html_action="return",
        )
        img = Image.open(image_dir).convert("RGB")
        image_dirs = []
        # Only the first decoder layer's cross-attention is visualized. The last
        # 197 encoder positions are the ViT image embeddings appended by
        # CustomT5Stack (197 presumably being the standard ViT-B/16 sequence of
        # 196 patch tokens plus [CLS]). `.cpu()` keeps the NumPy conversion
        # working when Config.device is a GPU.
        for i in range(len(outputs.cross_attentions[:1])):
            image_dir = f"visualization/test_image_visualize_{i}.jpg"
            image_dirs.append(image_dir)
            attention_plot = np.mean(
                outputs.cross_attentions[i][0, :, :, -197:].detach().cpu().numpy(),
                axis=0,
            )
            plot_attention(img, decoder_text, attention_plot, image_dir)
        return pred_answer, html_output.data, image_dirs
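
# Minimal usage sketch (assumptions: "example.jpg" is a local image and the
# question matches the Vietnamese ViT5 checkpoint; both are placeholders, not
# part of the original app). Kept behind the __main__ guard so importing this
# module is unaffected.
if __name__ == "__main__":
    os.makedirs("visualization", exist_ok=True)  # plot_attention is assumed to write here
    vqa_model = Model()
    answer, attention_html, attention_images = vqa_model.inference(
        image_dir="example.jpg",      # hypothetical path
        question="Bức ảnh có gì?",    # hypothetical question ("What is in the picture?")
        explain=True,
    )
    print("Answer:", answer)
    print("Attention visualizations:", attention_images)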