# mbiswas's picture
# Upload 10 files
# b781107 verified
from constants import *
from transformers import AutoTokenizer
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
def get_tokenizer():
    """Load the "gpt2" tokenizer and extend it with point/coordinate tokens.

    The structural marker tokens are added first, followed by one
    ``coord_bin_<i>`` token for every ``i`` in ``range(NUM_BINS)``. A
    ``[PAD]`` special token is registered when the loaded tokenizer has no
    pad token. The pad and EOS tokens (with their IDs) are printed.

    Returns:
        tuple: ``(tokenizer, len(tokenizer))`` measured after all additions.

    Raises:
        ValueError: If the pad token ID is still unset after setup.
    """
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Marker tokens come first, then the coordinate bins, in ascending order.
    extra_tokens = [
        "<point_start>",
        "<point_end>",
        "<result_start>",
        "<result_end>",
        "<pointx_start>",
        "<pointx_end>",
        "<pointy_start>",
        "<pointy_end>",
    ]
    extra_tokens.extend(f"coord_bin_{i}" for i in range(NUM_BINS))
    tokenizer.add_tokens(extra_tokens)

    # GPT2 usually ships without a pad token, so register a dedicated one.
    # (Reusing the EOS token as padding would be the alternative.)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print(f"Tokenizer pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
    print(f"Tokenizer EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

    # Fail loudly rather than hand back a tokenizer that cannot pad.
    if tokenizer.pad_token_id is None:
        raise ValueError("Tokenizer pad token ID is not set!")

    vocab_total = len(tokenizer)
    return tokenizer, vocab_total
def image_to_tensor(image, image_size=IMAGE_SIZE):
    """Turn an image into a resized, normalized tensor.

    Args:
        image: Image object exposing ``.mode`` and ``.convert`` (presumably
            a ``PIL.Image.Image``). Non-RGB images are converted to RGB.
        image_size: Side length used for both height and width when
            resizing. Defaults to ``IMAGE_SIZE``.

    Returns:
        The result of applying Resize, ToTensor and Normalize (with
        ``IMAGE_MEAN`` / ``IMAGE_STD``) to the RGB image, in that order.
    """
    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

    # Deliberately no rotation or other geometric augmentation: that would
    # require recomputing the point coordinates. It can be added later.
    steps = (
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD),
    )

    result = rgb_image
    for step in steps:
        result = step(result)
    return result
def tensor_to_image(tensor):
    """Convert a normalized image tensor back into a PIL image.

    Reverses the ``IMAGE_MEAN`` / ``IMAGE_STD`` normalization, clamps the
    result to ``[0, 1]``, scales it to ``[0, 255]`` and builds an image from
    the resulting ``uint8`` array.

    Args:
        tensor: Torch tensor laid out as ``(3, H, W)`` (channels first), as
            implied by the ``(3, 1, 1)`` mean/std broadcast and the
            ``(1, 2, 0)`` transpose. It may live on any device and may be
            part of an autograd graph; the input itself is not modified.

    Returns:
        PIL.Image.Image: Image built from the ``(H, W, 3)`` ``uint8`` array.
    """
    # detach() drops autograd tracking; cpu() is a no-op for host tensors and
    # also covers non-CUDA accelerators (e.g. MPS), which .numpy() rejects.
    # No clone is needed because every operation below is out-of-place.
    tensor = tensor.detach().cpu()

    mean = torch.tensor(IMAGE_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGE_STD).view(3, 1, 1)

    # Undo the normalization, then clamp away any overshoot outside [0, 1].
    tensor = torch.clamp(tensor * std + mean, 0, 1)

    # Channels-first (C, H, W) -> channels-last (H, W, C) for PIL.
    image_np = tensor.numpy().transpose(1, 2, 0)

    # Round to the nearest integer before casting: a plain astype(uint8)
    # truncates, so float round-off (e.g. 127.99999) would darken a pixel by
    # one level on a normalize -> denormalize round trip.
    image_np = np.rint(image_np * 255).astype(np.uint8)
    return Image.fromarray(image_np)
# Initialize the tokenizer globally: get_tokenizer() is called once when this
# module is imported (its print output included), and the returned tokenizer
# and len(tokenizer) are exposed as the module-level names below.
tokenizer, vocab_size = get_tokenizer()