# Copyright (c) OpenMMLab. All rights reserved.
import collections
import os

from mmengine.fileio import list_from_file
from transformers import (AutoTokenizer, BartTokenizer, BasicTokenizer,
                          BertTokenizer, BertTokenizerFast, LlamaTokenizer,
                          WordpieceTokenizer)

from mmpretrain.registry import TOKENIZER
from .huggingface import register_hf_tokenizer

register_hf_tokenizer(AutoTokenizer)
register_hf_tokenizer(LlamaTokenizer)


@register_hf_tokenizer()
class BlipTokenizer(BertTokenizerFast):
    """BlipTokenizer inherits from BertTokenizerFast (fast, Rust-based)."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'

        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']})
        return tokenizer


@register_hf_tokenizer()
class Blip2Tokenizer(BertTokenizer):

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        return tokenizer


@register_hf_tokenizer()
class OFATokenizer(BartTokenizer):

    vocab_files_names = {
        'vocab_file': 'vocab.json',
        'merges_file': 'merges.txt'
    }

    pretrained_vocab_files_map = {
        'vocab_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/vocab.json',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/vocab.json',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/vocab.json',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/vocab.json',
        },
        'merges_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/merges.txt',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/merges.txt',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/merges.txt',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/merges.txt',
        },
    }

    max_model_input_sizes = {
        'OFA-Sys/OFA-tiny': 1024,
        'OFA-Sys/OFA-medium': 1024,
        'OFA-Sys/OFA-base': 1024,
        'OFA-Sys/OFA-large': 1024,
    }

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        num_bins = kwargs.pop('num_bins', 1000)
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        length = len(tokenizer)
        # Append 8192 `<code_*>` tokens followed by `num_bins` `<bin_*>`
        # tokens, and record the starting offset of each group.
        tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
        tokenizer.code_offset = length
        tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(num_bins)])
        tokenizer.bin_offset = length + 8192
        tokenizer.num_bins = num_bins
        return tokenizer
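

# A minimal usage sketch for ``OFATokenizer`` (an illustration only, not part
# of the module API; it assumes network access to the HuggingFace Hub and that
# the checkpoint name below resolves). Because ``add_tokens`` appends the new
# tokens consecutively, the id of ``<code_i>`` is ``code_offset + i`` and the
# id of ``<bin_i>`` is ``bin_offset + i``:
#
#     tokenizer = OFATokenizer.from_pretrained(
#         'OFA-Sys/OFA-base', num_bins=1000)
#     assert tokenizer.convert_tokens_to_ids('<bin_0>') == tokenizer.bin_offset
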
@TOKENIZER.register_module()
class FullTokenizer(BertTokenizer):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = self.load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token='[UNK]', max_input_chars_per_word=200)

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a dictionary."""
        vocab = collections.OrderedDict()
        index = 0
        vocab_list = list_from_file(vocab_file)
        for token in vocab_list:
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
        return vocab

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_by_vocab(self, vocab, items):
        """Converts a sequence of [tokens|ids] using the vocab."""
        output = []
        for item in items:
            output.append(vocab[item])
        return output

    def convert_tokens_to_ids(self, tokens):
        return self.convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return self.convert_by_vocab(self.inv_vocab, ids)

    @staticmethod
    def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
        """Converts a sequence of tokens (strings) into a single string."""

        def clean_up_tokenization(out_string):
            """Clean up simple English tokenization artifacts such as spaces
            before punctuation and abbreviated forms."""
            out_string = (
                out_string.replace(' .', '.').replace(' ?', '?')
                .replace(' !', '!').replace(' ,', ',').replace(" ' ", "'")
                .replace(" n't", "n't").replace(" 'm", "'m")
                .replace(" 's", "'s").replace(" 've", "'ve")
                .replace(" 're", "'re"))
            return out_string

        text = ' '.join(tokens).replace(' ##', '').strip()
        if clean_up_tokenization_spaces:
            clean_text = clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def vocab_size(self):
        return len(self.vocab)
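

# A minimal smoke test for ``FullTokenizer``, kept as a sketch: the vocabulary
# path below is a placeholder for any BERT-style WordPiece vocab file (one
# token per line, containing '[UNK]').
if __name__ == '__main__':
    tokenizer = FullTokenizer('bert_vocab.txt', do_lower_case=True)
    tokens = tokenizer.tokenize("OpenMMLab doesn't stop at ImageNet.")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # Round-trip the ids back to a detokenized string.
    recovered = FullTokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(ids))
    print(tokens)
    print(ids)
    print(recovered)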