poonehmousavi committed
Commit
8f5a2a0
1 Parent(s): d625330

Upload 2 files

Files changed (2)
  1. custom.py +94 -0
  2. hyperparams.yaml +68 -0
custom.py ADDED
@@ -0,0 +1,94 @@
+ """This lobe enables the integration of a HuggingFace pretrained GPT2LMHeadModel,
+ with its embedding layer expanded to cover additional special tokens such as
+ BOS, EOS, and speaker tokens.
+
+ Transformers from HuggingFace needs to be installed:
+ https://huggingface.co/transformers/installation.html
+
+ Authors
+  * Pooneh Mousavi 2023
+ """
+
+ import logging
+
+ import torch
+
+ from speechbrain.lobes.models.huggingface_gpt import HuggingFaceGPT
+
+ try:
+     from transformers import GPT2Tokenizer
+ except ImportError:
+     MSG = "Please install transformers from HuggingFace to use GPT2\n"
+     MSG += "E.G. run: pip install transformers"
+     raise ImportError(MSG)
+
+ logger = logging.getLogger(__name__)
+
+
+ class HuggingFaceGPT_expanded(HuggingFaceGPT):
+     """This lobe enables the integration of a HuggingFace pretrained GPT model.
+
+     Source paper:
+     https://life-extension.github.io/2020/05/27/GPT%E6%8A%80%E6%9C%AF%E5%88%9D%E6%8E%A2/language-models.pdf
+     Transformers from HuggingFace needs to be installed:
+     https://huggingface.co/transformers/installation.html
+
+     The model can be finetuned. It will download the model automatically from
+     HuggingFace or use a local path.
+
+     Arguments
+     ---------
+     source : str
+         HuggingFace hub name: e.g. "gpt2"
+     save_path : str
+         Path (dir) of the downloaded model.
+     freeze : bool (default: False)
+         If True, the model is frozen. If False, the model will be trained
+         alongside the rest of the pipeline.
+
+     Example
+     -------
+     >>> model_hub = "gpt2"
+     >>> save_path = "savedir"
+     >>> model = HuggingFaceGPT_expanded(source=model_hub, save_path=save_path)
+     >>> tokens = torch.tensor([[1, 1]])
+     >>> tokens_type = torch.tensor([[1, 1]])
+     >>> attention_mask = torch.tensor([[1, 1]])
+     >>> outputs = model(tokens, tokens_type, attention_mask)
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         super().__init__(*args, **kwargs)
+         # Load the tokenizer; the special tokens are added below.
+         self.tokenizer = GPT2Tokenizer.from_pretrained(
+             kwargs["source"], pad_token=None
+         )
+
+         # Special tokens: BOS/EOS plus one token per speaker.
+         bos_token = "BOS"
+         eos_token = "EOS"
+         system_token = "SPK_1"
+         user_token = "SPK_2"
+
+         additional_special_tokens = [system_token, user_token]
+
+         attr_to_special_tokens = {
+             "bos_token": bos_token,
+             "eos_token": eos_token,
+             "additional_special_tokens": additional_special_tokens,
+         }
+
+         # Register the tokens with the tokenizer and resize the model
+         # embedding matrix accordingly.
+         self.add_special_tokens_(attr_to_special_tokens)
+
+     def add_special_tokens_(self, attr_to_special_token) -> None:
+         """Add special tokens to the tokenizer and resize model embeddings."""
+         orig_num_tokens = len(self.tokenizer.encoder)
+         # add_special_tokens is a no-op (returns 0) if the tokens are
+         # already present in the tokenizer.
+         num_added_tokens = self.tokenizer.add_special_tokens(
+             attr_to_special_token  # type: ignore
+         )
+         if num_added_tokens > 0:
+             self.model.resize_token_embeddings(
+                 new_num_tokens=orig_num_tokens + num_added_tokens
+             )
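
For reference, a minimal sketch of exercising the expanded model outside the recipe. It assumes speechbrain and transformers are installed and that this custom.py is importable; the expected sizes assume the stock gpt2 vocabulary of 50257 entries, extended by the four tokens added above.

    import torch
    from custom import HuggingFaceGPT_expanded

    # source must be passed as a keyword, since __init__ reads kwargs["source"].
    model = HuggingFaceGPT_expanded(source="gpt2", save_path="savedir")

    # Tokenizer and embedding table should both have grown by 4 (50261 expected).
    print(len(model.tokenizer))
    print(model.model.get_input_embeddings().weight.shape[0])

    # The speaker tokens are now encoded as single ids.
    ids = model.tokenizer.encode("SPK_1 hello SPK_2 hi")
    print(model.tokenizer.convert_ids_to_tokens(ids))

Resizing only when `num_added_tokens > 0` avoids reallocating the embedding matrix when the tokens are already registered, e.g. on repeated loads.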
hyperparams.yaml ADDED
@@ -0,0 +1,68 @@
+ # ################################
+ # Model: GPT2LMHeadModel + NLL
+ # Authors:
+ # Pooneh Mousavi 2023
+ # ################################
+
+ # HuggingFace hub name for the gpt2 model
+ gpt_hub: gpt2
+ gpt_folder: gpt2_result/save/gpt_checkpoint
+
+ # Special tokens
+ bos_token: "BOS"
+ eos_token: "EOS"
+ system_token: "SPK_1"
+ user_token: "SPK_2"
+
+ tokenizer: !ref <gpt_hub>
+
+ additional_special_tokens: [
+     !ref <system_token>,
+     !ref <user_token>
+ ]
+
+ special_tokens: [
+     !ref <bos_token>,
+     !ref <eos_token>,
+     !ref <system_token>,
+     !ref <user_token>
+ ]
+
+ attr_to_special_tokens:
+     "bos_token": !ref <bos_token>
+     "eos_token": !ref <eos_token>
+     "additional_special_tokens": !ref <additional_special_tokens>
+
+ # history_window, i.e. how many user-system exchanges to consider as context
+ max_history: 5
+
+ # Decoder settings
+ freeze_gptmodel: True
+ num_beams: 3
+ max_new_tokens: 50
+ top_k: 45
+ top_p: 0.9
+
+ # GPT model
+ model: !new:custom.HuggingFaceGPT_expanded
+     source: !ref <gpt_hub>
+     freeze: !ref <freeze_gptmodel>
+     save_path: !ref <gpt_folder>
+     max_new_tokens: !ref <max_new_tokens>
+     num_beams: !ref <num_beams>
+     top_k: !ref <top_k>
+     top_p: !ref <top_p>
+
+ # Masks
+ padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         model: !ref <model>
+
+ modules:
+     model: !ref <model>