Sunbread committed on
Commit 1e53095
1 Parent(s): 0918216

update model & inference


- replace AMSGrad with SGD + momentum (momentum=0.9); see the setup sketch below
- set lr=0.2 and halve it every 32 epochs
- shrink h from 256 to 192
- separate the hidden and latent sizes, and set h_latent=64
- apply gradient clipping by L2 norm (max_norm=1)
- increase epochs from 160 to 192
- add a tqdm progress bar
- integrate incremental generation into inference
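The optimizer bullets amount to a small amount of glue code. Below is a minimal, self-contained sketch using the hyperparameter values from this commit; the nn.Linear model and the dummy loss are stand-ins for the repo's encoder/decoder and train_epoch loop, not code from the commit itself.

import torch
import torch.nn as nn
from torch import optim
from torch.optim.lr_scheduler import StepLR
from tqdm.auto import tqdm

# values from this commit
lr, momentum = 0.2, 0.9
lr_step_size, lr_decay = 32, 0.5
epochs, grad_max_norm = 192, 1

model = nn.Linear(8, 8)                      # stand-in for encoder + decoder
params = list(model.parameters())

optimizer = optim.SGD(params, lr=lr, momentum=momentum)                # SGD + momentum replaces AMSGrad
scheduler = StepLR(optimizer, step_size=lr_step_size, gamma=lr_decay)  # halve the lr every 32 epochs

with tqdm(range(epochs), desc='Training') as pbar:
    for epoch in pbar:
        loss = model(torch.randn(4, 8)).pow(2).mean()  # dummy loss in place of train_epoch()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(params, grad_max_norm, norm_type=2)   # L2 gradient clipping, max_norm=1
        optimizer.step()
        pbar.set_postfix(loss=loss.item())
        scheduler.step()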

Files changed (3)
  1. decoder.pt +2 -2
  2. inference.py +16 -7
  3. model.py +37 -19
decoder.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3887ea54b29b8d19669b4020dff46804593262ec49ef159816925274fc418572
-size 3925384
+oid sha256:deeea664de143a71c87e67ba2af78aa88320fcd401c2c12a40183060f78b0e15
+size 2078336
inference.py CHANGED
@@ -6,16 +6,21 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 class DecoderGRU(nn.Module):
-    def __init__(self, hidden_size, output_size):
+    def __init__(self, latent_size, hidden_size, output_size):
         super(DecoderGRU, self).__init__()
-        self.proj = nn.Linear(hidden_size, 2 * hidden_size)
+        self.proj1 = nn.Linear(latent_size, latent_size)
+        self.proj_activation = nn.ReLU()
+        self.proj2 = nn.Linear(latent_size, 2 * hidden_size)
         self.embedding = nn.Embedding(output_size, hidden_size)
         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, batch_first=True)
         self.out = nn.Linear(hidden_size, output_size)
 
     def forward(self, encoder_sample, target_tensor=None, max_length=16):
         batch_size = encoder_sample.size(0)
-        decoder_hidden = self.proj(encoder_sample).view(batch_size, 2, -1).permute(1, 0, 2).contiguous()
+        decoder_hidden = self.proj1(encoder_sample)
+        decoder_hidden = self.proj_activation(decoder_hidden)
+        decoder_hidden = self.proj2(decoder_hidden)
+        decoder_hidden = decoder_hidden.view(batch_size, 2, -1).permute(1, 0, 2).contiguous()
         if target_tensor is not None:
             decoder_input = target_tensor
             decoder_outputs, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
@@ -46,8 +51,9 @@ katakana = list('゠ァアィイゥウェエォオカガキギクグケゲコゴ
 vocab = ['<pad>', '<sos>', '<eos>'] + katakana
 vocab_dict = {v: k for k, v in enumerate(vocab)}
 
-h=256
+h_latent=64
 max_len=40
+names=16
 
 def detokenize(tokens):
     if EOS_token in tokens:
@@ -55,6 +61,9 @@ def detokenize(tokens):
     else:
         return None
 
-for name in [detokenize(seq) for seq in dec(torch.randn(16,h), max_length=max_len)[0].topk(1)[1].squeeze().tolist()]:
-    if name is not None:
-        print(name)
+while True:
+    print('generating names...')
+    for name in [detokenize(seq) for seq in dec(torch.randn(names,h_latent), max_length=max_len)[0].topk(1)[1].squeeze().tolist()]:
+        if name is not None:
+            print(name)
+    input("press enter to continue generation...")
model.py CHANGED
@@ -5,6 +5,8 @@ import torch
 import torch.nn as nn
 from torch import optim
 from torch.utils.data import DataLoader, Dataset
+from torch.optim.lr_scheduler import StepLR
+from tqdm.auto import tqdm
 import torch.nn.functional as F
 import pandas as pd
 
@@ -19,11 +21,16 @@ vocab_dict = {v: k for k, v in enumerate(vocab)}
 
 texts = pd.read_csv('rolename.txt', header=None)[0].tolist()
 vocab_size=len(vocab)
-h=256
+h=192
+h_latent=64
 max_len=40
 bs=128
-lr=5e-4
-epochs=160
+lr=0.2
+lr_step_size=32
+lr_decay=0.5
+momentum=0.9
+epochs=192
+grad_max_norm=1
 
 def tokenize(text):
     return [vocab_dict[ch] for ch in text]
@@ -50,15 +57,15 @@ class BatchNormVAE(nn.Module): # https://spaces.ac.cn/archives/7381/
         return mu*scale_mu, sigma*scale_sigma
 
 class EncoderVAEBiGRU(nn.Module):
-    def __init__(self, input_size, hidden_size, dropout_p=0.1):
+    def __init__(self, input_size, hidden_size, latent_size, dropout_p=0.1):
         super(EncoderVAEBiGRU, self).__init__()
         self.hidden_size = hidden_size
         self.embedding = nn.Embedding(input_size, hidden_size)
         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, batch_first=True, bidirectional=True)
-        self.proj_mu = nn.Linear(4 * hidden_size, hidden_size)
-        self.proj_sigma = nn.Linear(4 * hidden_size, hidden_size)
+        self.proj_mu = nn.Linear(4 * hidden_size, latent_size)
+        self.proj_sigma = nn.Linear(4 * hidden_size, latent_size)
         self.dropout = nn.Dropout(dropout_p)
-        self.bn = BatchNormVAE(hidden_size)
+        self.bn = BatchNormVAE(latent_size)
 
     def forward(self, input, input_lengths):
         input_lengths = input_lengths.to('cpu')
@@ -76,16 +83,21 @@ class EncoderVAEBiGRU(nn.Module):
         return eps * sigma + mu # var is sigma^2
 
 class DecoderGRU(nn.Module):
-    def __init__(self, hidden_size, output_size):
+    def __init__(self, latent_size, hidden_size, output_size):
         super(DecoderGRU, self).__init__()
-        self.proj = nn.Linear(hidden_size, 2 * hidden_size)
+        self.proj1 = nn.Linear(latent_size, latent_size)
+        self.proj_activation = nn.ReLU()
+        self.proj2 = nn.Linear(latent_size, 2 * hidden_size)
         self.embedding = nn.Embedding(output_size, hidden_size)
         self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, batch_first=True)
         self.out = nn.Linear(hidden_size, output_size)
 
     def forward(self, encoder_sample, target_tensor=None, max_length=16):
         batch_size = encoder_sample.size(0)
-        decoder_hidden = self.proj(encoder_sample).view(batch_size, 2, -1).permute(1, 0, 2).contiguous()
+        decoder_hidden = self.proj1(encoder_sample)
+        decoder_hidden = self.proj_activation(decoder_hidden)
+        decoder_hidden = self.proj2(decoder_hidden)
+        decoder_hidden = decoder_hidden.view(batch_size, 2, -1).permute(1, 0, 2).contiguous()
         if target_tensor is not None:
             decoder_input = target_tensor
             decoder_outputs, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
@@ -136,7 +148,7 @@ dataloader = DataLoader(
     generator=torch.Generator(device='cuda'),
 )
 
-def train_epoch(dataloader, encoder, decoder, optimizer):
+def train_epoch(dataloader, encoder, decoder, optimizer, max_norm, norm_p=2):
     total_loss = 0
     nll = nn.NLLLoss()
     for enc_text, enc_len, input_text, target_text in dataloader:
@@ -150,19 +162,25 @@ def train_epoch(dataloader, encoder, decoder, optimizer):
         loss = loss_recons + loss_kld
         loss.backward()
 
+        # gradient clipping by norm
+        nn.utils.clip_grad_norm_(list(encoder.parameters()) + list(decoder.parameters()), max_norm, norm_type=norm_p)
+
         optimizer.step()
 
         total_loss += loss.item()
     return total_loss / len(dataloader)
 
-enc = EncoderVAEBiGRU(vocab_size, h).train()
-dec = DecoderGRU(h, vocab_size).train()
-optimizer = optim.Adam(list(enc.parameters()) + list(dec.parameters()), lr=lr, amsgrad=True) # AMSGrad
+encoder = EncoderVAEBiGRU(vocab_size, h, h_latent).train()
+decoder = DecoderGRU(h_latent, h, vocab_size).train()
+optimizer = optim.SGD(list(encoder.parameters()) + list(decoder.parameters()), lr=lr, momentum=momentum) # momentum
+scheduler = StepLR(optimizer, step_size=lr_step_size, gamma=lr_decay)
 
-for i in range(epochs):
-    print('epoch=%d, loss=%f' % (i, train_epoch(dataloader, enc, dec, optimizer)))
+with tqdm(range(epochs), desc='Training') as pbar:
+    for i in pbar:
+        pbar.set_postfix(loss=train_epoch(dataloader, encoder, decoder, optimizer, grad_max_norm))
+        scheduler.step()
 
-dec.eval()
-for name in [detokenize(seq) for seq in dec(torch.randn(8,h), max_length=max_len)[0].topk(1)[1].squeeze().tolist()]:
+decoder.eval()
+for name in [detokenize(seq) for seq in decoder(torch.randn(8,h_latent), max_length=max_len)[0].topk(1)[1].squeeze().tolist()]:
     print(name)
-torch.save(dec, 'decoder.pt')
+torch.save(decoder, 'decoder.pt')
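As a sanity check on the new schedule: lr=0.2 with StepLR(step_size=32, gamma=0.5) over 192 epochs gives six 32-epoch blocks and ends at lr=0.00625. The snippet below is plain arithmetic, independent of the training code, just to make the implied schedule explicit.

lr, lr_step_size, lr_decay, epochs = 0.2, 32, 0.5, 192

for block in range(epochs // lr_step_size):
    first, last = block * lr_step_size, (block + 1) * lr_step_size - 1
    print(f'epochs {first:3d}-{last:3d}: lr = {lr * lr_decay ** block:g}')
# epochs   0- 31: lr = 0.2
# ...
# epochs 160-191: lr = 0.00625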