from transformers import CLIPTokenizer


class SDXLTokenizer:
    """Wrapper around HuggingFace tokenizers for SDXL.

    Tokenizes the prompt with both CLIP tokenizers and returns the joined output.

    Args:
        file_path_or_name (str): Name or path of the model's text encoders to load.
            Defaults to 'stabilityai/stable-diffusion-xl-base-1.0'.
    """

    def __init__(self, file_path_or_name='stabilityai/stable-diffusion-xl-base-1.0'):
        self.tokenizer = CLIPTokenizer.from_pretrained(file_path_or_name, subfolder='tokenizer')
        self.tokenizer_2 = CLIPTokenizer.from_pretrained(file_path_or_name, subfolder='tokenizer_2')

    @classmethod
    def from_pretrained(cls, file_path_or_name='stabilityai/stable-diffusion-xl-base-1.0', **kwargs):
        """Create a new instance of SDXLTokenizer with specified pretrained model parameters.

        Args:
            file_path_or_name (str): Name or path of the model's text encoders to load.
            **kwargs: Additional keyword arguments forwarded to the constructor.

        Returns:
            SDXLTokenizer: A new instance of SDXLTokenizer.
        """
        # Update arguments with any additional kwargs
        init_args = {'file_path_or_name': file_path_or_name}
        init_args.update(kwargs)

        # Create and return a new instance of SDXLTokenizer
        return cls(**init_args)

    def __call__(self, prompt, padding, truncation, return_tensors, max_length=None):
        # Run the prompt through both tokenizers, defaulting each to its own model_max_length.
        tokenized_output = self.tokenizer(
            prompt,
            padding=padding,
            max_length=self.tokenizer.model_max_length if max_length is None else max_length,
            truncation=truncation,
            return_tensors=return_tensors)
        tokenized_output_2 = self.tokenizer_2(
            prompt,
            padding=padding,
            max_length=self.tokenizer_2.model_max_length if max_length is None else max_length,
            truncation=truncation,
            return_tensors=return_tensors)

        # Pair each field from the first tokenizer with the matching field from the second,
        # so every key maps to [tokenizer_output, tokenizer_2_output].
        for key in tokenized_output.keys():
            tokenized_output[key] = [tokenized_output[key], tokenized_output_2[key]]

        return tokenized_output
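

# Usage sketch (not part of the original module; the prompt string and printout are
# illustrative assumptions). Shows that each key in the returned mapping holds a
# [tokenizer_output, tokenizer_2_output] pair. Downloads the SDXL tokenizer files
# from the HuggingFace Hub on first use.
if __name__ == '__main__':
    tokenizer = SDXLTokenizer.from_pretrained('stabilityai/stable-diffusion-xl-base-1.0')
    out = tokenizer('a photo of an astronaut riding a horse',
                    padding='max_length',
                    truncation=True,
                    return_tensors='pt')
    # Unpack the paired outputs; both CLIP tokenizers pad to 77 tokens by default.
    input_ids_1, input_ids_2 = out['input_ids']
    print(input_ids_1.shape, input_ids_2.shape)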