|
from constants import *  # provides NUM_BINS, IMAGE_SIZE, IMAGE_MEAN, IMAGE_STD
from transformers import AutoTokenizer
import torch
import numpy as np
from PIL import Image
from torchvision import transforms


def get_tokenizer():
    """Build a GPT-2 tokenizer extended with point/result markers and one
    token per coordinate bin, and return it with the new vocabulary size."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # One discrete token per coordinate bin: coord_bin_0 .. coord_bin_{NUM_BINS-1}.
    point_tokens = [f"coord_bin_{i}" for i in range(NUM_BINS)]
    new_tokens = [
        "<point_start>", "<point_end>", "<result_start>",
        "<result_end>", "<pointx_start>", "<pointx_end>",
        "<pointy_start>", "<pointy_end>",
        *point_tokens,
    ]
    tokenizer.add_tokens(new_tokens)

    # GPT-2 ships without a pad token; batched training needs one.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print(f"Tokenizer pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
    print(f"Tokenizer EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

    if tokenizer.pad_token_id is None:
        raise ValueError("Tokenizer pad token ID is not set!")

    # len(tokenizer) includes the added tokens; callers should resize model
    # embeddings to match.
    return tokenizer, len(tokenizer)
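

# Illustrative sketch, not part of the original pipeline: one plausible way to
# map a normalized coordinate in [0, 1] onto the coord_bin_* tokens added in
# get_tokenizer(). The exact binning scheme used elsewhere is an assumption.
def coord_to_token(value):
    """Quantize a coordinate in [0, 1] to its coord_bin_<i> token (sketch)."""
    bin_index = min(int(value * NUM_BINS), NUM_BINS - 1)  # clamp value == 1.0
    return f"coord_bin_{bin_index}"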
|
|
|
def image_to_tensor(image, image_size=IMAGE_SIZE):
    """Convert a PIL image to a normalized (3, image_size, image_size) tensor."""
    if image.mode != 'RGB':
        image = image.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD)
    ])
    return transform(image)
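

# Usage sketch: the returned tensor is CHW and normalized, so a model forward
# pass typically wants a stacked batch. The file path here is hypothetical.
#
#   img = Image.open("example.png")
#   batch = image_to_tensor(img).unsqueeze(0)  # shape (1, 3, IMAGE_SIZE, IMAGE_SIZE)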
|
|
|
def tensor_to_image(tensor):
    """Invert image_to_tensor: de-normalize a CHW tensor back to a PIL image."""
    tensor = tensor.clone().detach()
    if tensor.is_cuda:
        tensor = tensor.cpu()
    # Undo the Normalize step, then map [0, 1] floats to uint8 HWC pixels.
    mean = torch.tensor(IMAGE_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGE_STD).view(3, 1, 1)
    tensor = tensor * std + mean
    tensor = torch.clamp(tensor, 0, 1)
    image_np = tensor.numpy().transpose(1, 2, 0)
    image_np = (image_np * 255).astype(np.uint8)
    return Image.fromarray(image_np)
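

# Round-trip sketch: tensor_to_image undoes image_to_tensor up to resizing and
# uint8 quantization, which is handy for eyeballing preprocessed samples.
# Paths are hypothetical.
#
#   check = tensor_to_image(image_to_tensor(Image.open("example.png")))
#   check.save("preprocessed_check.png")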
|
|
|
# Shared instances built once at import time; vocab_size reflects the added tokens.
tokenizer, vocab_size = get_tokenizer()
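
# Downstream code can reuse these shared instances, e.g. (the module name
# "utils" and the model variable are assumptions):
#
#   from utils import tokenizer, vocab_size
#   model.resize_token_embeddings(vocab_size)  # align HF model embeddings with the new vocab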