Uploaded config and code files
- config.json +26 -0
- configuration_ganbert.py +73 -0
- gan.py +76 -0
- ganbert.py +94 -0
config.json
ADDED
@@ -0,0 +1,26 @@
{
  "architectures": [
    "GAN"
  ],
  "auto_map": {
    "AutoConfig": "configuration_ganbert.GanBertConfig",
    "AutoModelForSequenceClassification": "ganbert.GAN"
  },
  "batch_size": 64,
  "device": "cuda",
  "epochs": 10,
  "epsilon": 1e-08,
  "learning_rate_discriminator": 1e-05,
  "learning_rate_generator": 1e-05,
  "model_number": -2,
  "model_type": "ganbert",
  "noise_size": 100,
  "num_hidden_layers_d": 1,
  "num_hidden_layers_g": 2,
  "num_train_examples": 77450,
  "out_dropout_rate": 0.4,
  "pos_class_weight": 10,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "warmup_proportion": 0.1
}
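The auto_map block above is what lets the generic Auto classes resolve to the custom code uploaded alongside this config. A minimal loading sketch; the repo id is a placeholder for wherever these files are hosted:

from transformers import AutoConfig

# "your-username/ganbert-clickbait" is a placeholder repo id.
config = AutoConfig.from_pretrained(
    "your-username/ganbert-clickbait",
    trust_remote_code=True,  # required: auto_map points at code shipped in the repo
)
print(config.model_type)  # "ganbert"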
configuration_ganbert.py
ADDED
@@ -0,0 +1,73 @@
import torch
from transformers import PretrainedConfig


class GanBertConfig(PretrainedConfig):
    model_type = "ganbert"

    def __init__(
        self,
        out_dropout_rate=0.4,
        num_hidden_layers_g=2,
        num_hidden_layers_d=1,
        pos_class_weight=10,
        batch_size=64,
        noise_size=100,
        num_train_examples=77450,
        epochs=10,
        epsilon=1e-08,
        learning_rate_discriminator=1e-05,
        learning_rate_generator=1e-05,
        warmup_proportion=0.1,
        model_number=-2,
        device='cuda',
        **kwargs,
    ):
        self.out_dropout_rate = out_dropout_rate
        self.num_hidden_layers_g = num_hidden_layers_g
        self.num_hidden_layers_d = num_hidden_layers_d
        self.pos_class_weight = pos_class_weight
        self.model_number = model_number
        self.learning_rate_discriminator = learning_rate_discriminator
        self.learning_rate_generator = learning_rate_generator
        self.warmup_proportion = warmup_proportion
        self.epsilon = epsilon
        self.num_train_examples = num_train_examples
        self.epochs = epochs
        self.batch_size = batch_size
        self.noise_size = noise_size
        # Honor the requested device, falling back to CPU when CUDA is unavailable.
        self.device = device if torch.cuda.is_available() else 'cpu'
        super().__init__(**kwargs)
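A round-trip sketch for the config class, assuming configuration_ganbert.py is importable from the working directory. register_for_auto_class() is what records the AutoConfig entry seen in the auto_map of config.json above; the output directory name is arbitrary:

from configuration_ganbert import GanBertConfig

GanBertConfig.register_for_auto_class()        # adds "AutoConfig" to auto_map on save
config = GanBertConfig(model_number=-2, pos_class_weight=10)
config.save_pretrained("ganbert-checkpoint")   # writes config.json
reloaded = GanBertConfig.from_pretrained("ganbert-checkpoint")
assert reloaded.noise_size == 100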
gan.py
ADDED
@@ -0,0 +1,76 @@
import torch.nn as nn

#------------------------------
# The Generator as in
# https://www.aclweb.org/anthology/2020.acl-main.191/
# https://github.com/crux82/ganbert
#------------------------------
class Generator(nn.Module):
    def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
        super(Generator, self).__init__()
        layers = []
        hidden_sizes = [noise_size] + hidden_sizes
        for i in range(len(hidden_sizes) - 1):
            layers.extend([
                nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]),
                nn.LeakyReLU(0.2, inplace=True),
                nn.Dropout(dropout_rate),
            ])

        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, noise):
        output_rep = self.layers(noise)
        return output_rep

#------------------------------
# The Discriminator
# https://www.aclweb.org/anthology/2020.acl-main.191/
# https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.3):
        super(Discriminator, self).__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        layers = []
        hidden_sizes = [input_size] + hidden_sizes
        for i in range(len(hidden_sizes) - 1):
            layers.extend([
                nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]),
                nn.LeakyReLU(0.2, inplace=True),
                nn.Dropout(dropout_rate),
            ])

        self.layers = nn.Sequential(*layers)  # stack of hidden layers
        self.logit = nn.Linear(hidden_sizes[-1], num_labels + 1)  # +1 for the probability of this sample being fake/real.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        input_rep = self.input_dropout(input_rep)
        last_rep = self.layers(input_rep)
        logits = self.logit(last_rep)
        probs = self.softmax(logits)
        return last_rep, logits, probs
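A quick shape check for the two modules, assuming gan.py is importable and using 768 to match the BERT-base hidden size that ganbert.py wires in (batch size and layer counts here are illustrative):

import torch
from gan import Generator, Discriminator

gen = Generator(noise_size=100, output_size=768, hidden_sizes=[768, 768])
disc = Discriminator(input_size=768, hidden_sizes=[768], num_labels=3)

noise = torch.rand(4, 100)                # batch of 4 noise vectors in [0, 1)
fake_rep = gen(noise)                     # (4, 768): stands in for transformer features
features, logits, probs = disc(fake_rep)
print(logits.shape)                       # torch.Size([4, 4]): 3 labels + 1 real/fake slot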
ganbert.py
ADDED
@@ -0,0 +1,94 @@
import torch
from transformers import PreTrainedModel, AutoConfig, AutoModel

# Note: these package-relative imports mean this file must be imported as part of
# a package (or via trust_remote_code) rather than run directly as a script.
from .configuration_ganbert import GanBertConfig
from .gan import Generator, Discriminator


class GAN(PreTrainedModel):
    config_class = GanBertConfig
    # Candidate Bengali/multilingual encoders; config.model_number indexes this list.
    all_checkpoints = ['bert-base-multilingual-cased',
                       'sagorsarker/bangla-bert-base',
                       'neuralspace-reverie/indic-transformers-bn-bert',
                       'neuralspace-reverie/indic-transformers-bn-roberta',
                       'distilbert-base-multilingual-cased',
                       'neuralspace-reverie/indic-transformers-bn-distilbert',
                       'monsoon-nlp/bangla-electra',
                       'csebuetnlp/banglabert',
                       'neuralspace-reverie/indic-transformers-bn-xlmroberta']

    def __init__(self, config):
        super().__init__(config)

        self.model_name = self.all_checkpoints[config.model_number]  # -2 selects 'csebuetnlp/banglabert'
        self.parent_config = AutoConfig.from_pretrained(self.model_name)
        self.hidden_size = int(self.parent_config.hidden_size)
        self.ns = config.noise_size
        self.dv = config.device
        # Define the number and width of hidden layers
        self.hidden_levels_g = [self.hidden_size for i in range(0, config.num_hidden_layers_g)]
        self.hidden_levels_d = [self.hidden_size for i in range(0, config.num_hidden_layers_d)]
        self.label_list = [0, 1, 2]
        self.class_weight = torch.tensor([10, config.pos_class_weight, 5], device=config.device)
        #-------------------------------------------------
        # Instantiate the Generator and Discriminator
        #-------------------------------------------------
        self.generator = Generator(noise_size=config.noise_size, output_size=self.hidden_size,
                                   hidden_sizes=self.hidden_levels_g, dropout_rate=config.out_dropout_rate)
        self.discriminator = Discriminator(input_size=self.hidden_size, hidden_sizes=self.hidden_levels_d,
                                           num_labels=len(self.label_list), dropout_rate=config.out_dropout_rate)
        self.transformer = AutoModel.from_pretrained(self.model_name, output_attentions=True)

        # Put everything on the GPU if available
        if config.device == 'cuda':
            self.generator.cuda()
            self.discriminator.cuda()
            self.transformer.cuda()

    def forward(self, b_input_ids, b_input_mask):
        # Encode real data with the transformer
        real_batch_size = b_input_ids.shape[0]
        model_outputs = self.transformer(b_input_ids, attention_mask=b_input_mask)

        # Mean-pool the token representations into one vector per example
        hidden_states = torch.mean(model_outputs[0], dim=1)
        # Sample uniform noise and generate fake representations
        noise = torch.zeros(real_batch_size, self.ns, device=self.dv).uniform_(0, 1)
        gen_rep = self.generator(noise)
        # Score real and fake representations together; the discriminator outputs are
        # computed here, but only the token-level hidden states are returned.
        discriminator_input = torch.cat([hidden_states, gen_rep], dim=0)
        features, logits, probs = self.discriminator(discriminator_input)
        return model_outputs[0]


if __name__ == '__main__':
    ganconfig = GanBertConfig()
    clickbaitmodel = GAN(ganconfig)
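Once uploaded with matching weights, the model should be loadable through the Auto API thanks to the auto_map in config.json. A hedged sketch; the repo id is a placeholder, and the first call also downloads the selected backbone checkpoint:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo = "your-username/ganbert-clickbait"  # placeholder repo id
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model.model_name)  # tokenizer of the backbone

batch = tokenizer(["sample headline"], return_tensors="pt", padding=True)
with torch.no_grad():
    hidden = model(batch["input_ids"], batch["attention_mask"])
print(hidden.shape)  # (1, seq_len, hidden_size): forward returns token-level states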