Motahar committed on
Commit
45b5843
1 Parent(s): 5b6ada5

Uploaded config and code files

Files changed (4)
  1. config.json +26 -0
  2. configuration_ganbert.py +41 -0
  3. gan.py +48 -0
  4. ganbert.py +65 -0
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "GAN"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_ganbert.GanBertConfig",
+     "AutoModelForSequenceClassification": "ganbert.GAN"
+   },
+   "batch_size": 64,
+   "device": "cuda",
+   "epochs": 10,
+   "epsilon": 1e-08,
+   "learning_rate_discriminator": 1e-05,
+   "learning_rate_generator": 1e-05,
+   "model_number": -2,
+   "model_type": "ganbert",
+   "noise_size": 100,
+   "num_hidden_layers_d": 1,
+   "num_hidden_layers_g": 2,
+   "num_train_examples": 77450,
+   "out_dropout_rate": 0.4,
+   "pos_class_weight": 10,
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.1",
+   "warmup_proportion": 0.1
+ }
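
The auto_map entries above register the custom classes, so once these files are on the Hub the model can be loaded with trust_remote_code. A minimal loading sketch (the repo id below is a hypothetical placeholder):

from transformers import AutoConfig, AutoModelForSequenceClassification

# "Motahar/ganbert" is a hypothetical repo id; substitute the actual repository.
config = AutoConfig.from_pretrained("Motahar/ganbert", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("Motahar/ganbert", trust_remote_code=True)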
configuration_ganbert.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+ from transformers import PretrainedConfig
+ 
+ 
+ class GanBertConfig(PretrainedConfig):
+     model_type = "ganbert"
+ 
+     def __init__(
+         self,
+         out_dropout_rate=0.4,
+         num_hidden_layers_g=2,
+         num_hidden_layers_d=1,
+         pos_class_weight=10,
+         batch_size=64,
+         noise_size=100,
+         num_train_examples=77450,
+         epochs=10,
+         epsilon=1e-08,
+         learning_rate_discriminator=1e-05,
+         learning_rate_generator=1e-05,
+         warmup_proportion=0.1,
+         model_number=-2,
+         device='cuda',
+         **kwargs,
+     ):
+         self.out_dropout_rate = out_dropout_rate
+         self.num_hidden_layers_g = num_hidden_layers_g
+         self.num_hidden_layers_d = num_hidden_layers_d
+         self.pos_class_weight = pos_class_weight
+         self.model_number = model_number
+         self.learning_rate_discriminator = learning_rate_discriminator
+         self.learning_rate_generator = learning_rate_generator
+         self.warmup_proportion = warmup_proportion
+         self.epsilon = epsilon
+         self.num_train_examples = num_train_examples
+         self.epochs = epochs
+         self.batch_size = batch_size
+         self.noise_size = noise_size
+         # Fall back to CPU when CUDA is unavailable, whatever device was requested.
+         self.device = device if torch.cuda.is_available() else 'cpu'
+         super().__init__(**kwargs)
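
A quick round-trip sketch for the config class, assuming it is run from the repository root:

from configuration_ganbert import GanBertConfig

cfg = GanBertConfig()                       # defaults mirror the committed config.json
print(cfg.to_json_string(use_diff=False))   # full serialized config, comparable to config.json above
cfg.save_pretrained("ganbert-out")          # writes ganbert-out/config.json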
gan.py ADDED
@@ -0,0 +1,48 @@
+ import torch.nn as nn
+ 
+ 
+ # ------------------------------
+ # The Generator, as in
+ # https://www.aclweb.org/anthology/2020.acl-main.191/
+ # https://github.com/crux82/ganbert
+ # ------------------------------
+ class Generator(nn.Module):
+     def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
+         super(Generator, self).__init__()
+         layers = []
+         hidden_sizes = [noise_size] + hidden_sizes
+         for i in range(len(hidden_sizes) - 1):
+             layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])
+ 
+         layers.append(nn.Linear(hidden_sizes[-1], output_size))
+         self.layers = nn.Sequential(*layers)
+ 
+     def forward(self, noise):
+         output_rep = self.layers(noise)
+         return output_rep
+ 
+ 
+ # ------------------------------
+ # The Discriminator, as in
+ # https://www.aclweb.org/anthology/2020.acl-main.191/
+ # https://github.com/crux82/ganbert
+ # ------------------------------
+ class Discriminator(nn.Module):
+     def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.3):
+         super(Discriminator, self).__init__()
+         self.input_dropout = nn.Dropout(p=dropout_rate)
+         layers = []
+         hidden_sizes = [input_size] + hidden_sizes
+         for i in range(len(hidden_sizes) - 1):
+             layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])
+ 
+         self.layers = nn.Sequential(*layers)  # for the flatten
+         self.logit = nn.Linear(hidden_sizes[-1], num_labels + 1)  # +1 for the probability of this sample being fake/real
+         self.softmax = nn.Softmax(dim=-1)
+ 
+     def forward(self, input_rep):
+         input_rep = self.input_dropout(input_rep)
+         last_rep = self.layers(input_rep)
+         logits = self.logit(last_rep)
+         probs = self.softmax(logits)
+         return last_rep, logits, probs
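
A shape-level smoke test for the two modules (a sketch; the sizes below are illustrative, not taken from the commit):

import torch
from gan import Generator, Discriminator

g = Generator(noise_size=100, output_size=768, hidden_sizes=[768, 768])
d = Discriminator(input_size=768, hidden_sizes=[768], num_labels=3)

noise = torch.rand(8, 100)      # batch of 8 noise vectors drawn uniformly from [0, 1)
fake = g(noise)                 # -> (8, 768)
feats, logits, probs = d(fake)  # logits, probs -> (8, 4): 3 labels + 1 real/fake slot
print(fake.shape, logits.shape, probs.shape)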
ganbert.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ from transformers import AutoConfig, AutoModel, PreTrainedModel
+ 
+ from .configuration_ganbert import GanBertConfig
+ from .gan import Generator, Discriminator
+ 
+ 
+ class GAN(PreTrainedModel):
+     config_class = GanBertConfig
+     all_checkpoints = ['bert-base-multilingual-cased',
+                        'sagorsarker/bangla-bert-base',
+                        'neuralspace-reverie/indic-transformers-bn-bert',
+                        'neuralspace-reverie/indic-transformers-bn-roberta',
+                        'distilbert-base-multilingual-cased',
+                        'neuralspace-reverie/indic-transformers-bn-distilbert',
+                        'monsoon-nlp/bangla-electra',
+                        'csebuetnlp/banglabert',
+                        'neuralspace-reverie/indic-transformers-bn-xlmroberta']
+ 
+     def __init__(self, config):
+         super().__init__(config)
+ 
+         self.model_name = self.all_checkpoints[config.model_number]
+         self.parent_config = AutoConfig.from_pretrained(self.model_name)
+         self.hidden_size = int(self.parent_config.hidden_size)
+         self.ns = config.noise_size
+         self.dv = config.device
+         # Define the number and width of the hidden layers.
+         self.hidden_levels_g = [self.hidden_size for _ in range(config.num_hidden_layers_g)]
+         self.hidden_levels_d = [self.hidden_size for _ in range(config.num_hidden_layers_d)]
+         self.label_list = [0, 1, 2]
+         self.class_weight = torch.tensor([10, config.pos_class_weight, 5], device=config.device)
+         # -------------------------------------------------
+         # Instantiate the Generator and Discriminator
+         # -------------------------------------------------
+         self.generator = Generator(noise_size=config.noise_size, output_size=self.hidden_size, hidden_sizes=self.hidden_levels_g, dropout_rate=config.out_dropout_rate)
+         self.discriminator = Discriminator(input_size=self.hidden_size, hidden_sizes=self.hidden_levels_d, num_labels=len(self.label_list), dropout_rate=config.out_dropout_rate)
+         self.transformer = AutoModel.from_pretrained(self.model_name, output_attentions=True)
+ 
+         # Put everything on the GPU if available.
+         if config.device == 'cuda':
+             self.generator.cuda()
+             self.discriminator.cuda()
+             self.transformer.cuda()
+ 
+     def forward(self, b_input_ids, b_input_mask):
+         # Encode the real data with the transformer.
+         real_batch_size = b_input_ids.shape[0]
+         model_outputs = self.transformer(b_input_ids, attention_mask=b_input_mask)
+ 
+         # Mean-pool the token embeddings, draw uniform noise for the generator,
+         # and feed real and generated representations through the discriminator.
+         hidden_states = torch.mean(model_outputs[0], dim=1)
+         noise = torch.zeros(real_batch_size, self.ns, device=self.dv).uniform_(0, 1)
+         gen_rep = self.generator(noise)
+         discriminator_input = torch.cat([hidden_states, gen_rep], dim=0)
+         features, logits, probs = self.discriminator(discriminator_input)
+         # Note: only the transformer's token embeddings are returned here;
+         # the discriminator outputs are computed but not returned.
+         return model_outputs[0]
+ 
+ 
+ if __name__ == '__main__':
+     ganconfig = GanBertConfig()
+     clickbaitmodel = GAN(ganconfig)
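
Finally, an end-to-end forward-pass sketch. It assumes the three modules sit in a package (the relative imports in ganbert.py require one; "ganbert_pkg" is a hypothetical name) and uses the default model_number=-2, which selects csebuetnlp/banglabert:

from transformers import AutoTokenizer
from ganbert_pkg.configuration_ganbert import GanBertConfig  # "ganbert_pkg" is hypothetical
from ganbert_pkg.ganbert import GAN

config = GanBertConfig()             # picks 'cuda' or 'cpu' automatically
model = GAN(config)                  # downloads the csebuetnlp/banglabert weights
tokenizer = AutoTokenizer.from_pretrained(model.model_name)

batch = tokenizer(["example headline", "another one"], padding=True, return_tensors="pt").to(config.device)
hidden = model(batch["input_ids"], batch["attention_mask"])
print(hidden.shape)                  # (batch_size, seq_len, hidden_size)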