from torchvision.io import read_image, ImageReadMode
import torch
import numpy as np
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from transformers import MBart50TokenizerFast
from PIL import Image


class Transform(torch.nn.Module):
    """CLIP-style image preprocessing: resize, center-crop, convert to float, normalize."""

    def __init__(self, image_size):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            # Mean and std used by CLIP's image preprocessing.
            Normalize(
                (0.48145466, 0.4578275, 0.40821073),
                (0.26862954, 0.26130258, 0.27577711),
            ),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            x = self.transforms(x)
        return x


transform = Transform(224)


def get_transformed_image(image):
    """Preprocess an image into a (1, 224, 224, 3) channels-last float numpy array."""
    # HWC numpy arrays are converted to the CHW layout expected by torchvision.
    if isinstance(image, np.ndarray) and image.shape[-1] == 3:
        image = image.transpose(2, 0, 1)
    image = torch.tensor(image)
    return transform(image).unsqueeze(0).permute(0, 2, 3, 1).numpy()


tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")

# Human-readable language names mapped to mBART-50 language codes.
language_mapping = {
    "english": "en_XX",
    "german": "de_DE",
    "french": "fr_XX",
    "spanish": "es_XX",
}


def generate_sequence(model, pixel_values, lang_code):
    """Generate a caption for `pixel_values` in the requested target language via beam search."""
    lang_code = language_mapping[lang_code]
    output_ids = model.generate(
        input_ids=pixel_values,
        decoder_start_token_id=tokenizer.lang_code_to_id[lang_code],
        max_length=64,
        num_beams=4,
    )
    # Decode the generated token ids back into text.
    output_sequence = tokenizer.batch_decode(output_ids[0], skip_special_tokens=True, max_length=64)
    return output_sequence
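

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original script). It
# assumes `model` is the captioning model loaded elsewhere in this project
# (a vision encoder paired with an mBART-50 decoder) exposing the `generate`
# call used above, and that "example.jpg" is a hypothetical image path.
# ---------------------------------------------------------------------------
def caption_image(model, image_path, language="english"):
    """Read an image from disk and return a caption in the requested language."""
    # Load an RGB image as an HWC uint8 numpy array (uses the PIL import above).
    image = np.asarray(Image.open(image_path).convert("RGB"))
    # Preprocess into a (1, 224, 224, 3) channels-last float array.
    pixel_values = get_transformed_image(image)
    # Delegate to the beam-search generation helper defined above.
    return generate_sequence(model, pixel_values, language)


# Example call (checkpoint loading and image path are hypothetical):
#     captions = caption_image(model, "example.jpg", "german")
#     print(captions)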