"""Tokenizer construction and image <-> tensor conversion helpers."""

# NOTE: the wildcard import stays first so that the explicit imports below
# take precedence over any same-named symbol exported by `constants`.
from constants import *
from transformers import AutoTokenizer
import torch
import numpy as np
from PIL import Image
from torchvision import transforms


def get_tokenizer():
    """Build the GPT-2 tokenizer extended with the project's extra tokens.

    Loads the pretrained "gpt2" tokenizer, adds one ``coord_bin_<i>`` token
    for every ``i`` in ``range(NUM_BINS)`` (plus the leading entries of
    ``new_tokens``), and registers ``[PAD]`` as the pad token when the
    tokenizer has none. The pad and EOS tokens are printed for inspection.

    Returns:
        A ``(tokenizer, vocab_size)`` tuple where ``vocab_size`` is
        ``len(tokenizer)`` measured after the additions above.

    Raises:
        ValueError: If the tokenizer still has no pad token ID afterwards.
    """
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    point_tokens = [f"coord_bin_{i}" for i in range(0, NUM_BINS)]
    # NOTE(review): the eight leading entries are empty strings in this copy
    # of the file. They look like special-token markers whose text was lost
    # (possibly stripped as markup) -- confirm against the original source.
    new_tokens = [
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        *point_tokens,
    ]
    tokenizer.add_tokens(new_tokens)

    # Ensure a pad token is set (GPT-2 usually doesn't have one by default).
    if tokenizer.pad_token is None:
        # Alternative if padding should be EOS instead of a new token:
        #   tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print(f"Tokenizer pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
    print(f"Tokenizer EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

    # Fail fast: the pad token ID must be valid before anything uses it.
    if tokenizer.pad_token_id is None:
        raise ValueError("Tokenizer pad token ID is not set!")

    return tokenizer, len(tokenizer)


def image_to_tensor(image, image_size=IMAGE_SIZE):
    """Convert a PIL image into a normalized tensor.

    Args:
        image: PIL image; converted to RGB first when its mode differs.
        image_size: Side length of the square the image is resized to.

    Returns:
        The output of the Resize -> ToTensor -> Normalize pipeline, using
        ``IMAGE_MEAN`` / ``IMAGE_STD`` for the normalization.
    """
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Geometric augmentations (rotation etc.) are deliberately left out for
    # now: they would require recomputing the changed coordinates.
    # Can be added later.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD),
    ])
    return transform(image)


def tensor_to_image(tensor):
    """Undo the ``IMAGE_MEAN`` / ``IMAGE_STD`` normalization and build a PIL image.

    Args:
        tensor: Channel-first image tensor with 3 channels, i.e. (3, H, W).
            It may live on any device and may require grad; it is never
            modified.

    Returns:
        A PIL image built from the de-normalized values, clamped to [0, 1]
        and scaled to 8-bit.
    """
    # Move to host memory unconditionally: the mean/std tensors below are
    # created on the CPU, so a tensor left on *any* accelerator (not only
    # CUDA) would hit a device mismatch. Every later op is out-of-place, so
    # the caller's tensor is left untouched without an explicit clone.
    tensor = tensor.detach().cpu()

    mean = torch.tensor(IMAGE_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGE_STD).view(3, 1, 1)

    tensor = torch.clamp(tensor * std + mean, 0, 1)

    # Channel-first -> channel-last, the layout Image.fromarray expects.
    image_np = tensor.numpy().transpose(1, 2, 0)
    # Round to nearest before the cast: a bare astype(uint8) truncates, so
    # float error from normalize/de-normalize (e.g. 127.99999) would shift a
    # pixel down by one intensity level.
    image_np = np.rint(image_np * 255).astype(np.uint8)

    return Image.fromarray(image_np)


tokenizer, vocab_size = get_tokenizer()  # Initialize tokenizer globally