"""This lobe enables the integration of huggingface pretrained GPT2LMHeadModel model plus the expanding embedding layer for additional tokens like BOS, EOS and Speakers . Transformer from HuggingFace needs to be installed: https://huggingface.co/transformers/installation.html Authors * Pooneh Mousavi 2023 """ import logging from torch import Tensor import torch import torch.nn as nn from speechbrain.lobes.models.huggingface_transformers.gpt import GPT try: from transformers import GPT2LMHeadModel from transformers import GPT2Tokenizer except ImportError: MSG = "Please install transformers from HuggingFace to use GPT2\n" MSG += "E.G. run: pip install transformers" raise ImportError(MSG) logger = logging.getLogger(__name__) class HuggingFaceGPT_expanded(GPT): """This lobe enables the integration of HuggingFace pretrained GPT model. Source paper whisper: https://life-extension.github.io/2020/05/27/GPT%E6%8A%80%E6%9C%AF%E5%88%9D%E6%8E%A2/language-models.pdf Transformer from HuggingFace needs to be installed: https://huggingface.co/transformers/installation.html The model can be finetuned. It will download automatically the model from HuggingFace or use a local path. Arguments --------- source : str HuggingFace hub name: e.g "gpt2" save_path : str Path (dir) of the downloaded model. freeze : bool (default: False) If True, the model is frozen. If False, the model will be trained alongside with the rest of the pipeline. Example ------- >>> model_hub = "gpt2" >>> save_path = "savedir" >>> model = HuggingFaceGPT(model_hub, save_path) >>> tokens = torch.tensor([[1, 1]]) >>> tokens_type = torch.tensor([[1, 1]]) >>> attention_mask = torch.tensor([[1, 1]]) >>> outputs = model(tokens, tokens_type, attention_mask) """ def __init__( self, *args, **kwrds ) -> None: super().__init__( *args, **kwrds) # Load tokenizer and add special tokens self.tokenizer = GPT2Tokenizer.from_pretrained(kwrds['source'], pad_token=None) # # Add special tokens to the tokenizer and resize model embedding # Special tokens bos_token = "BOS" eos_token="EOS" system_token= "SPK_1" user_token= "SPK_2" additional_special_tokens= [ system_token, user_token ] attr_to_special_tokens={"bos_token": bos_token, "eos_token": eos_token, "additional_special_tokens": additional_special_tokens} self.add_special_tokens_( attr_to_special_tokens ) def add_special_tokens_(self, attr_to_special_token,) -> None: orig_num_tokens = len(self.tokenizer.encoder) num_added_tokens = self.tokenizer.add_special_tokens( attr_to_special_token # type: ignore ) # doesn't add if they are already there if num_added_tokens > 0: self.model.resize_token_embeddings( new_num_tokens=orig_num_tokens + num_added_tokens )