meow committed on
Commit
ab2adfb
1 Parent(s): 85a9328

Add application file

Files changed (10)
  1. .gitignore +146 -0
  2. README.md +1 -12
  3. app.py +103 -0
  4. app_local.py +125 -0
  5. lstm_model_new.py +193 -0
  6. max_ent_model.py +139 -0
  7. pre-requirements.txt +4 -0
  8. requirements.txt +27 -0
  9. svm_model.py +210 -0
  10. trainer.py +358 -0
.gitignore ADDED
@@ -0,0 +1,146 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ *.csv
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ *.npy
+ *.pth
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # /root/diffsim/Yelp-Review-Sentiment-Analysis/yelp_review_polarity_csv
+ ./yelp_review_polarity_csv/*
+ # /root/diffsim/Yelp-Review-Sentiment-Analysis/preprocessed_data
+ ./preprocessed_data/*
+
+ */*.npy
+ */*.csv
+ */*.zip
+ */*.txt
+
+ */*.model
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: Text Classification
- emoji: 📉
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 4.32.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Text Classification
app.py ADDED
@@ -0,0 +1,103 @@
+ import numpy as np
+
+ import gradio as gr
+
+
+ import os
+
+ import tempfile
+
+ import shutil
+
+ from trainer import Trainer
+
+ def predict(input_text):
+     predicted_label = trainer.predict(input_text)
+     return str(predicted_label)
+     # pass
+
+ def predict_maxent(input_text):
+     predicted_label = trainer_maxent.predict_maxent(input_text)
+     return str(predicted_label)
+     # pass
+
+ def predict_svm(input_text):
+     predicted_label = trainer_svm.predict_svm(input_text)
+     return str(predicted_label)
+     # pass
+
+
+ def create_demo():
+
+     USAGE = """## Text Classification
+
+     """
+
+
+     with gr.Blocks() as demo:
+
+         gr.Markdown(USAGE)
+
+         # demo =
+         # gr.Interface(
+         #     predict,
+         #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
+         #     gr.File(type="filepath"),
+         #     gr.File(type="filepath"),
+         #     cache_examples=False
+         # )
+
+         # input_file = gr.File(type="filepath")
+         # output_file = gr.File(type="filepath")
+
+         gr.Interface(fn=predict, inputs="textbox", outputs="textbox")
+
+         gr.Interface(fn=predict_maxent, inputs="textbox", outputs="textbox")
+
+         gr.Interface(fn=predict_svm, inputs="textbox", outputs="textbox")
+
+         # gr.Interface(
+         #     predict,
+         #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
+         #     input_file,
+         #     output_file,
+         #     cache_examples=False
+         # )
+
+         # inputs = input_file
+         # outputs = output_file
+         # gr.Examples(
+         #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
+         #     inputs=inputs,
+         #     fn=predict,
+         #     outputs=outputs,
+         # )
+
+
+     return demo
+
+ if __name__ == "__main__":
+
+     vocab_size = 8000
+     sequence_len = 150
+
+     # batch_size = 1024
+     batch_size = 256
+     nn_epochs = 20
+     model_type = "lstm"
+
+     # model_type = "bilstm"
+
+     trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+     model_type = "max_ent"
+     trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+
+     model_type = "svm"
+     trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+
+
+     demo = create_demo()
+     demo.launch()
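The three predictors can also be exercised without the Gradio UI. Below is a minimal sketch (not part of the commit) that builds the same `Trainer` objects `app.py` does; it assumes the checkpoints the trainer loads (`lstm.pth`, `max_ent_ckpt.npy`, `svm_weights.npy`) and `yelp_dictionary.npy` are present in the working directory.

```python
# Hedged sketch: query the predictors directly from Python, same setup as app.py.
from trainer import Trainer

trainer_lstm = Trainer(vocab_size=8000, sequence_len=150, batch_size=256,
                       nn_epochs=20, model_type="lstm")
trainer_maxent = Trainer(vocab_size=8000, sequence_len=150, batch_size=256,
                         nn_epochs=20, model_type="max_ent")

review = "The food was amazing and the staff were friendly."
print(trainer_lstm.predict(review))           # returns "Positive" or "Negative"
print(trainer_maxent.predict_maxent(review))  # same label space
```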
app_local.py ADDED
@@ -0,0 +1,125 @@
+ import numpy as np
+
+ # import gradio as gr
+
+
+ import os
+
+ import tempfile
+
+ import shutil
+
+ from trainer import Trainer
+
+ def predict(input_text, model_type):
+     if model_type in ['lstm', 'bilstm']:
+         predicted_label = trainer.predict(input_text)
+     elif model_type == 'max_ent':
+         predicted_label = trainer.predict_maxent(input_text)
+     elif model_type == 'svm':
+         predicted_label = trainer.predict_svm(input_text)
+
+     return str(predicted_label)
+     # pass
+
+ def predict_omni(input_text, model_type):
+     predicted_label_net = trainer.predict(input_text)
+     predicted_label_maxent = trainer_maxent.predict_maxent(input_text)
+     predicted_label_svm = trainer_svm.predict_svm(input_text)
+     # if model_type in ['lstm', 'bilstm']:
+     #     predicted_label = trainer.predict(input_text)
+     # elif model_type == 'max_ent':
+     #     predicted_label = trainer.predict_maxent(input_text)
+     # elif model_type == 'svm':
+     #     predicted_label = trainer.predict_svm(input_text)
+     predicted_text = f"LSTM: {predicted_label_net}, Max Ent: {predicted_label_maxent}, SVM: {predicted_label_svm}"
+     return predicted_text
+     # pass
+
+
+ def create_demo():
+
+     USAGE = """## Text Classification
+
+     """
+
+
+     with gr.Blocks() as demo:
+
+         gr.Markdown(USAGE)
+
+         # demo =
+         # gr.Interface(
+         #     predict,
+         #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
+         #     gr.File(type="filepath"),
+         #     gr.File(type="filepath"),
+         #     cache_examples=False
+         # )
+
+         input_file = gr.File(type="filepath")
+         output_file = gr.File(type="filepath")
+
+         gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
+
+         # gr.Interface(
+         #     predict,
+         #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
+         #     input_file,
+         #     output_file,
+         #     cache_examples=False
+         # )
+
+         # inputs = input_file
+         # outputs = output_file
+         # gr.Examples(
+         #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
+         #     inputs=inputs,
+         #     fn=predict,
+         #     outputs=outputs,
+         # )
+
+
+     return demo
+
+ if __name__ == "__main__":
+
+     vocab_size = 8000
+     sequence_len = 150
+
+     # batch_size = 1024
+     batch_size = 256
+     nn_epochs = 20
+     model_type = "lstm"
+
+     # model_type = "bilstm"
+
+     # model_type = "max_ent"
+
+     # trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+     # print(f"Trainer loaded")
+
+
+     model_type = "lstm"
+
+     trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+     model_type = "max_ent"
+     trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+     model_type = "svm"
+     trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+
+
+     while True:
+         input_text = input()
+         # if model_type in ["lstm", "bilstm"]:
+         #     label = predict(input_text, model_type)
+         label = predict_omni(input_text, model_type)
+         # elif model_type in ["max_ent"]:
+         #     label =
+         print(label)
+
+     # demo = create_demo()
+     # demo.launch()
+     # python app_local.py
lstm_model_new.py ADDED
@@ -0,0 +1,193 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from torch.autograd import Variable
+ import torch.distributed as dist
+
+ import math
+
+
+ class LSTMCell(nn.Module):
+
+     def __init__(self, input_size, hidden_size, bias=True):
+         super(LSTMCell, self).__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.bias = bias
+         self.x2h = nn.Linear(input_size, 4 * hidden_size, bias=bias)
+         self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
+         self.reset_parameters()
+
+
+
+     def reset_parameters(self):
+         std = 1.0 / math.sqrt(self.hidden_size)
+         for w in self.parameters():
+             w.data.uniform_(-std, std)
+
+     def forward(self, x, hidden):
+
+         hx, cx = hidden
+
+         x = x.view(-1, x.size(1))
+
+         gates = self.x2h(x) + self.h2h(hx)
+
+         # print(f"gates: {gates.shape}")
+
+         # gates = gates.squeeze()
+
+
+
+         ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
+
+         ingate = F.sigmoid(ingate)
+         forgetgate = F.sigmoid(forgetgate)
+         cellgate = F.tanh(cellgate)
+         outgate = F.sigmoid(outgate)
+
+
+         cy = torch.mul(cx, forgetgate) + torch.mul(ingate, cellgate)
+
+         hy = torch.mul(outgate, F.tanh(cy))
+
+         return (hy, cy)
+
+ class LSTMModel(nn.Module):
+     def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
+         super(LSTMModel, self).__init__()
+         # Hidden dimensions
+         self.hidden_dim = hidden_dim
+
+         # Number of hidden layers
+         self.layer_dim = layer_dim
+
+         self.lstm = LSTMCell(input_dim, hidden_dim, layer_dim)
+
+         self.fc = nn.Linear(hidden_dim, output_dim)
+
+
+
+     def forward(self, x):
+
+         # Initialize hidden state with zeros
+         #######################
+         #  USE GPU FOR MODEL  #
+         #######################
+         # print(x.shape, "x.shape")  # 100, 28, 28
+         if torch.cuda.is_available():
+             h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
+         else:
+             h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
+
+         # Initialize cell state
+         if torch.cuda.is_available():
+             c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
+         else:
+             c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
+
+
+
+         outs = []
+
+         cn = c0[0,:,:]
+         hn = h0[0,:,:]
+
+         for seq in range(x.size(1)):
+             hn, cn = self.lstm(x[:,seq,:], (hn,cn))
+             outs.append(hn)
+
+
+
+         out = outs[-1]  # .squeeze()
+
+         out = self.fc(out)
+         # out.size() --> 100, 10
+         return out
+
+
+ class LSTM_model(nn.Module):
+     def __init__(self, vocab_size, n_hidden):
+         super(LSTM_model, self).__init__()
+
+         self.embedding = nn.Embedding(vocab_size, n_hidden)
+
+
+         self.lstm = LSTMModel(n_hidden, n_hidden, n_hidden, n_hidden)
+         self.fc_output = nn.Linear(n_hidden, 1)
+
+
+         self.loss = nn.BCEWithLogitsLoss()
+
+     def forward(self, X, t, train=True):
+
+         embed = self.embedding(X)  # batch_size, time_steps, features
+         no_of_timesteps = embed.shape[1]
+         n_hidden = embed.shape[2]
+
+         input = embed
+
+         # print(f"input: {input.shape}")
+
+         fc_out = self.lstm(input)  ## bsz x nnhidden_dim
+
+         # print(f"fc_out: {fc_out.size()}")
+         h = self.fc_output(fc_out)
+         # print(f"h: {h.size()}")
+
+         return self.loss(h[:, 0], t), h[:, 0]
+
+ class BiLSTM(nn.Module):
+     def __init__(self, input_size, hidden_size, bias=True):
+         super(BiLSTM, self).__init__()
+         self.forward_cell = LSTMCell(input_size, hidden_size, bias)
+         self.backward_cell = LSTMCell(input_size, hidden_size, bias)
+
+     def forward(self, input_seq):
+         forward_outputs = []
+         backward_outputs = []
+
+         forward_hidden = (torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device),
+                           torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device))
+         backward_hidden = (torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device),
+                            torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device))
+
+         for t in range(input_seq.size(1)):
+             forward_hidden = self.forward_cell(input_seq[:, t], forward_hidden)
+             forward_outputs.append(forward_hidden[0])
+
+         for t in range(input_seq.size(1)-1, -1, -1):
+             backward_hidden = self.backward_cell(input_seq[:, t], backward_hidden)
+             backward_outputs.append(backward_hidden[0])
+
+         forward_outputs = torch.stack(forward_outputs, dim=1)
+         backward_outputs = torch.stack(backward_outputs, dim=1)
+
+         outputs = torch.cat((forward_outputs, backward_outputs), dim=2)
+
+         return outputs
+
+ class BiLSTMModel(nn.Module):
+     def __init__(self, vocab_size, n_hidden):
+         super(BiLSTMModel, self).__init__()
+
+         self.embedding = nn.Embedding(vocab_size, n_hidden)
+         self.bilstm = BiLSTM(n_hidden, n_hidden)
+         self.fc_output = nn.Linear(2*n_hidden, 1)
+         self.loss = nn.BCEWithLogitsLoss()
+
+     def forward(self, X, t, train=True):
+         embed = self.embedding(X)  # batch_size, time_steps, features
+         no_of_timesteps = embed.shape[1]
+         n_hidden = embed.shape[2]
+
+         input = embed
+         bilstm_out = self.bilstm(input)  ## bsz x nnhidden_dim
+         bilstm_out = bilstm_out[:, -1, :]
+         h = self.fc_output(bilstm_out)
+         # print(f"bilstm_out: {bilstm_out.shape}, h: {h.shape}, t: {t.shape}")
+         return self.loss(h[:,0], t), h[:, 0]
+
+
max_ent_model.py ADDED
@@ -0,0 +1,139 @@
+ import numpy as np
+ import os
+
+
+ import math
+
+
+
+
+
+ class MaxEntropyModel:
+
+     def __init__(self, ):
+         self.train_set = []
+         self.features = {}
+         self.labels = {}
+         self.labels = {
+             '1': 1, '2': 1
+         }
+
+     def load_data(self, fn):
+         with open(fn, "r") as rf:
+             for line in rf:
+                 label, review = line.strip().split(',')
+                 label = label[1: -1]
+                 review = review.split(' ')
+                 fields = [str(int(label))] + review
+                 if review != '':
+                     label = str(int(label))
+                     self.labels[label] = 1
+                     for s in set(fields[1:]):
+                         if (label, s) not in self.features:
+                             self.features[(label, s)] = 1
+                         else:
+                             self.features[(label, s)] += 1
+                     self.train_set.append(fields)
+         rf.close()
+
+     def initialize_parameters(self, ):
+         self.train_set_size = len(self.train_set)
+         self.M = max([len(record)-1 for record in self.train_set])
+         self.ep = [0.0 for _ in range(len(self.features))]
+
+         for i_f, feat in enumerate(self.features):
+             self.ep[i_f] = float(self.features[feat]) / float(self.train_set_size)
+             self.features[feat] = i_f
+
+         self.weights = [0.0 for _ in range(len(self.features))]
+         self.last_weights = self.weights
+
+
+
+     def get_prob_weight(self, features, label):
+         weight = 0.0
+         for feat in features:
+             # print(label, feat)
+             if (label, feat) in self.features:
+                 weight += self.weights[self.features[(label, feat)]]
+         prob_weight = math.exp(weight)
+         # print(f"label: {label}, prob_weight: {prob_weight}")
+         return prob_weight
+
+     def get_expected_features(self, ):
+         expected_features = [0.0 for _ in range(len(self.features))]
+         for record in self.train_set:
+             features = record[1:]
+             prob = self.calculate_probability(features)
+             for feat in features:
+                 for w, l in prob:
+                     if (l, feat) in self.features:
+                         idx = self.features[(l, feat)]
+                         expected_features[idx] += w * (1.0 / self.train_set_size)
+         return expected_features
+
+     def calculate_probability(self, features):
+         weights = [(self.get_prob_weight(features, l), l) for l in self.labels]
+         tot_weights = [w for w, l in weights]
+
+         Z = sum(tot_weights)
+
+         prob = [(w / Z, l) for w, l in weights]
+         return prob
+
+     def train(self, max_iter=10000):
+         self.initialize_parameters()
+         for i in range(max_iter):
+             print(f"[Training] iter {i + 1} ...")
+             self.new_ep = self.get_expected_features()
+             self.last_weights = self.weights[:]
+             for i, w in enumerate(self.weights):
+                 delta = 1.0 / self.M * math.log(self.ep[i] / self.new_ep[i])
+                 self.weights[i] = self.weights[i] + delta
+             if i % 10 == 0:
+                 test_data_path = "../preprocessed_data/yelp_test.txt"
+                 print(f"Start testing...")
+                 self.test(test_data_path)
+
+     def test(self, test_data_path):
+         f = open(file=test_data_path)
+         tot_test_nn = 0
+         correct_test_nn = 0
+         for line in f:
+             label, review = line.strip().split(',')
+             label = label[1: -1]
+             review = review.split(' ')
+
+             # fields = [str(int(label))] + review ## get split review ## #
+
+             # input text: review #
+             # output: label #
+             # review #
+
+             prob = self.calculate_probability(review)
+             prob.sort(reverse=True)
+             print(label, prob)
+
+             ##### Calculate whether the prediction is correct #####
+             maxx_prob_idx = int(prob[0][1])
+             label_idx = int(label)
+             if maxx_prob_idx == label_idx:
+                 correct_test_nn += 1
+             tot_test_nn += 1
+             ##### Calculate whether the prediction is correct #####
+
+         f.close()
+         acc = float(correct_test_nn) / float(tot_test_nn)
+         print(f"[Test] Acc: {acc}")
+
+     def save_ckpt(self, sv_ckpt_path):
+         sv_features = self.features
+         sv_weights = self.last_weights
+         sv_ckpt = {
+             'features': sv_features,
+             'weights': sv_weights
+         }
+         np.save(sv_ckpt_path, sv_ckpt)
+         print(f"ckpt with features and weights saved to {sv_ckpt_path}")
+
+
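To make the feature bookkeeping concrete, here is a hedged toy illustration (not part of the commit) of how `calculate_probability` scores the two Yelp polarity labels, where '1' is treated as negative and '2' as positive, as assumed elsewhere in the repo; the feature indices and weight values below are made up for the example.

```python
from max_ent_model import MaxEntropyModel

m = MaxEntropyModel()
m.features = {('1', 'terrible'): 0, ('2', 'great'): 1}  # (label, token) -> weight index
m.weights = [1.5, 2.0]                                   # hypothetical trained weights

# Only ('2', 'great') fires, so label '2' scores exp(2.0) against exp(0.0) for '1':
print(m.calculate_probability(['great', 'food']))
# -> roughly [(0.12, '1'), (0.88, '2')]
```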
pre-requirements.txt ADDED
@@ -0,0 +1,4 @@
+ # pip==23.3.2
+ # torch==2.2.0
+ -i https://download.pytorch.org/whl/cpu
+ torch==2.2.0
requirements.txt ADDED
@@ -0,0 +1,27 @@
+ # -f https://download.pytorch.org/whl/cpu/torch_stable.html
+ # -f https://data.pyg.org/whl/torch-2.2.0%2Bcpu.html
+ # -i https://download.pytorch.org/whl/cpu
+ # pip==20.2.4
+ # torch==2.2.0
+ # torchvision==0.13.1
+ # torchaudio==0.12.1
+
+ tqdm
+ nltk
+ scikit-learn
+ scipy
+
+
+ # blobfile==2.0.1
+ # manopth @ git+https://github.com/hassony2/manopth.git
+ # numpy==1.23.1
+ # psutil==5.9.2
+ # scikit-learn
+ # scipy==1.9.3
+ # tensorboard
+ # tensorboardx
+ # tqdm
+ # trimesh
+ # clip
+ # chumpy
+ # opencv-python
svm_model.py ADDED
@@ -0,0 +1,210 @@
+ import numpy as np
+ import re
+ import time
+ from nltk.corpus import stopwords
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ # from sklearn.linear_model import LogisticRegression
+ # from sklearn.svm import SVC
+ import ssl
+ import os
+ import nltk
+ try:
+     _create_unverified_https_context = ssl._create_unverified_context
+ except AttributeError:
+     pass
+ else:
+     ssl._create_default_https_context = _create_unverified_https_context
+ # print(f"nltk version: {nltk.__version__}")
+ # nltk.download('stopwords')
+
+ #
+ class SVMModel:
+     def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=1000):
+         self.learning_rate = learning_rate
+         self.lambda_param = lambda_param
+         self.n_iters = n_iters
+         self.w = None
+         self.b = None
+
+         self.X_train = None
+         self.X_test = None
+         self.y_train = None
+         self.y_test = None
+
+     def fit(self, X, y):
+         n_samples, n_features = X.shape
+         y_ = np.where(y <= 0, -1, 1)  # Convert labels to -1 and 1
+
+         print(f"y_ max: {np.max(y_)}, y_ min: {np.min(y_)}")
+
+         self.w = np.zeros(n_features)
+         self.b = 0
+
+         self.lambda_param = 1.0 / float(n_samples)
+
+         for _ in range(self.n_iters):
+             print(f"Epoch: {_}")
+             for idx, x_i in enumerate(X):
+                 condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
+                 if condition:
+                     self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w)
+                 else:
+                     self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
+                     self.b = self.b - self.learning_rate * y_[idx]
+             if _ % 1 == 0:
+                 # print(f"Iteration: {_}")
+                 st_time = time.time()
+                 self.test()
+                 print(f"Time: {time.time() - st_time}")
+
+     def predict(self, X):
+
+         linear_output = np.matmul(X, self.w[:, None]) - self.b  # []
+         return np.sign(linear_output[:, 0])
+
+     def test(self, ):
+         # test_ours(self, ):
+         linear_output = self.predict(self.X_test)
+         print(f"linear_output: {linear_output.shape}, self.X_test: {self.X_test.shape}")
+         acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
+         print(f"Test Acc: {acc}")
+         return linear_output
+
+     # weights_dict = self.svm_model.get_weights_dict()
+     def get_weights_dict(self, ):
+         weights_dict = {
+             'w': self.w,
+             'b': self.b
+         }
+         return weights_dict
+
+ class SVM:
+     def __init__(self, ):
+         # file_path =
+         self.x_train = []
+         self.y_train = []
+         self.x_test = []
+         self.y_test = []
+
+         self.data_folder = '.'
+
+         print(f"Start loading data")
+         self._load_data()
+
+         print(f"Setting vectorizer")
+         self.vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
+
+         print(f"Start preprocessing data")
+         self._preprocess_data()
+
+         # self.setup_model()
+         self.setup_model_ours()
+
+         pass
+
+     def _load_data(self, ):
+
+         file_path = '.'
+         x_train = []
+         y_train = []
+         with open(os.path.join(self.data_folder, 'train.csv'), "r") as f:
+             for line in f:
+                 l = line.strip().split(',')
+                 senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
+                 x_train.append(text)
+                 y_train.append(int(senti[1]) - 1)
+         f.close()
+
+         x_test = []
+         y_test = []
+         with open(os.path.join(self.data_folder, 'test.csv'), "r") as f:
+             for line in f:
+                 l = line.strip().split(',')
+                 senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
+                 x_test.append(text)
+                 y_test.append(int(senti[1]) - 1)
+         f.close()
+         self.x_train = x_train
+         self.x_test = x_test
+         self.y_train = np.array(y_train, dtype=np.int32)
+         self.y_test = np.array(y_test, dtype=np.int32)
+         print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")
+
+     def _preprocess_data(self, ):
+         self.X_train = self.vectorizer.fit_transform(self.x_train).toarray()
+         self.X_test = self.vectorizer.transform(self.x_test).toarray()
+
+
+
+
+     def setup_model_ours(self, ):
+         self.svm_model = SVMModel()
+
+     def train_ours(self, ):
+         self.y_train = self.y_train.astype(np.float32)
+         self.y_test = self.y_test.astype(np.float32)
+         self.y_train = self.y_train * 2 - 1.0
+         self.y_test = self.y_test * 2 - 1.0
+
+         print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")
+
+         self.svm_model.X_train = self.X_train
+         self.svm_model.X_test = self.X_test
+         self.svm_model.y_train = self.y_train
+         self.svm_model.y_test = self.y_test
+
+         self.svm_model.fit(self.X_train, self.y_train)
+
+     def test_ours(self, ):
+         linear_output = self.svm_model.test()
+         acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
+         print(f"Test Acc: {acc}")
+
+         weights_dict = self.svm_model.get_weights_dict()
+         np.save("svm_weights.npy", weights_dict)
+         print(f"svm weights saved to svm_weights.npy")
+
+
+
+     # def setup_model(self, ):
+     #     self.svc = SVC()
+
+     # def train(self, ):
+     #     self.svc.fit(self.X_train, self.y_train)
+
+     # def test(self, ):
+     #     self.train_acc = self.svc.score(self.X_train, self.y_train)
+     #     self.test_acc = self.svc.score(self.X_test, self.y_test)
+
+     #     print(f'Train Acc: {self.train_acc * 100}\n', f'Test Acc: {self.test_acc * 100}\n')
+
+
+
+ # CUDA_VISIBLE_DEVICES=2 python log_reg.py
+
+ # y_train = np.asarray(y_train)
+ # y_test = np.asarray(y_test)
+
+ # print(f"After getting data")
+
+ # start_time = time.time()
+ # vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
+
+ # print(f"After setting the vectorizer")
+ # X_train = vectorizer.fit_transform(x_train).toarray()
+ # X_test = vectorizer.transform(x_test).toarray()
+
+ # print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
+
+ # # lr_classfier = LogisticRegression()
+ # # lr_classfier.fit(X_train,y_train)
+ # # train_acc = lr_classfier.score(X_train,y_train)
+ # # test_acc = lr_classfier.score(X_test,y_test)
+
+ # svc = SVC()
+ # svc.fit(X_train,y_train)
+ # train_acc = svc.score(X_train,y_train)
+ # test_acc = svc.score(X_test,y_test)
+
+ # print('Train Acc: %.2f' % float(train_acc*100), 'Test Acc: %.2f' % float(test_acc*100),'Time: %.4f' % float(time.time()-start_time))
+ # # CUDA_VISIBLE_DEVICES=2 python log_reg.py
trainer.py ADDED
@@ -0,0 +1,358 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ # import torch.nn.functional as F
+ import torch.optim as optim
+ # from torch.autograd import Variable
+ #import torch.distributed as dist
+
+ # import time
+ import os
+ import re
+ # import sys
+ # import io
+
+ from tqdm import tqdm
+ import nltk
+
+ from lstm_model_new import LSTM_model, BiLSTMModel
+ from max_ent_model import MaxEntropyModel
+ from svm_model import SVM
+
+
+ class Trainer(object):
+     def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):
+
+         # vocab_size = 8000
+         # sequence_len = 150
+
+         self.vocab_size = vocab_size
+         self.vocab_sizeb = self.vocab_size + 1
+
+         self.sequence_len = sequence_len
+         self.model_type = model_type
+
+         self.batch_size = batch_size
+         self.nn_epochs = nn_epochs
+
+         self.processed_data_folder = "../preprocessed_data/"
+
+         self._load_data()
+
+         self._get_model()
+
+         # self._setup_optimizer()
+
+
+         pass
+
+
+
+     def _load_data(self, ):
+
+         dict_fn = "yelp_dictionary.npy"
+
+         id_to_word = np.load(dict_fn, allow_pickle=True)  # .item()
+
+         print(type(id_to_word))
+         print(id_to_word[0], len(id_to_word))
+
+         word_to_id = {
+             id_to_word[idx]: idx for idx in range(len(id_to_word))
+         }
+
+         # word_to_id = {v: k for k, v in id_to_word.items()}
+         self.word_to_id = word_to_id
+
+         # x_train = np.load('../preprocessed_data/x_train.npy')
+         # y_train = np.load('../preprocessed_data/y_train.npy')
+
+         # #x_train = x_train[:10000]
+         # #y_train = y_train[:10000]
+         # x_test = np.load('../preprocessed_data/x_test.npy')
+         # y_test = np.load('../preprocessed_data/y_test.npy')
+
+
+         # x_train_path = os.path.join(self.processed_data_folder, "x_train.npy")
+         # y_train_path = os.path.join(self.processed_data_folder, "y_train.npy")
+         # x_test_path = os.path.join(self.processed_data_folder, "x_test.npy")
+         # y_test_path = os.path.join(self.processed_data_folder, "y_test.npy")
+
+         # x_train = np.load(x_train_path)
+         # y_train = np.load(y_train_path)
+         # x_test = np.load(x_test_path)
+         # y_test = np.load(y_test_path)
+         # self.x_train = x_train
+         # self.y_train = y_train
+         # self.x_test = x_test
+         # self.y_test = y_test
+
+     def _get_model(self, ):
+         if self.model_type == "lstm":
+             self.model = LSTM_model(self.vocab_sizeb, 800)
+         elif self.model_type == "bilstm":
+             self.model = BiLSTMModel(self.vocab_sizeb, 800)
+         elif self.model_type == "max_ent":
+             self.model = MaxEntropyModel()
+         elif self.model_type == "svm":
+             self.model = SVM()
+         else:
+             raise ValueError("Model type not supported")
+
+         # self.model.cuda()
+
+         if self.model_type in ['lstm', 'bilstm']:
+             # self.model = self.model.cuda()
+
+             model_ckpt_fn = f"{self.model_type}.pth"
+             self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=torch.device('cpu')))
+         elif self.model_type in ['max_ent']:
+             model_ckpt_fn = f"{self.model_type}_ckpt.npy"  # max_ent #
+             model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
+             features = model_params["features"]
+             weights = model_params["weights"]
+
+             self.model.weights = weights  # .tolist()
+             # print(f"self.model.weights: {self.model.weights[:10]}")
+             self.model.last_weights = weights  # .tolist()
+
+             self.model.features = features
+             # print(f"self.model.features: {list(self.model.features.keys())[:10]}")
+
+         elif self.model_type in ['svm']:
+             model_ckpt_fn = f"{self.model_type}_weights.npy"
+             model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
+             w = model_params['w']
+             b = model_params['b']
+             self.model.svm_model.w = w
+             self.model.svm_model.b = b
+
+         else:
+             raise ValueError("Model type not supported")
+
+
+
+
+
+     def _setup_optimizer(self, ):
+         self.lr = 0.001
+         self.opt = optim.Adam(self.model.parameters(), lr=self.lr)
+
+     def _train(self, ):
+         train_losses = []
+         train_accs = []
+         test_accs = [0.0]
+
+         for epoch in range(self.nn_epochs):
+             print(f"Epoch: {epoch}")
+             self.model.train()
+
+             nn_acc = 0
+             nn_total = 0
+             epoch_loss = 0.0
+
+
+             train_permutation_idxes = np.random.permutation(self.y_train.shape[0])
+
+             for i in tqdm(range(0, len(self.y_train), self.batch_size)):
+                 batched_x = self.x_train[train_permutation_idxes[i: i + self.batch_size]]
+                 batched_y = self.y_train[train_permutation_idxes[i: i + self.batch_size]]
+
+                 data = torch.from_numpy(batched_x).long().cuda()
+                 target = torch.from_numpy(batched_y).float().cuda()
+
+                 self.opt.zero_grad()
+                 loss, predicted_labels = self.model(data, target)
+                 loss.backward()
+
+                 norm = nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
+                 self.opt.step()
+
+                 predicted_labels = predicted_labels >= 0
+                 gts = target >= 0.5
+                 acc = torch.sum((predicted_labels == gts).float()).item()
+
+                 nn_acc += acc
+                 epoch_loss += loss.item()
+                 nn_total += len(batched_y)
+
+             train_acc = float(nn_acc) / float(nn_total)
+             train_loss = epoch_loss / float(self.batch_size)
+
+             train_losses.append(train_loss)
+             train_accs.append(train_acc)
+
+             print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")
+
+             self._test()
+
+
+     def _process_text(self, input_text):
+         text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
+         tokens = nltk.word_tokenize(text)
+         token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
+         token_ids = np.array(token_ids)
+
+         token_ids[token_ids > self.vocab_size] = 0
+         if token_ids.shape[0] > self.sequence_len:
+             start_index = np.random.randint(token_ids.shape[0] - self.sequence_len + 1)
+             token_ids = token_ids[start_index: (start_index + self.sequence_len)]
+         else:
+             token_ids = np.concatenate([token_ids, np.zeros(self.sequence_len - token_ids.shape[0])])
+         return token_ids
+
+     def _process_text_maxent(self, input_text):
+         text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
+         tokens = nltk.word_tokenize(text)
+         token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
+         # token_ids = np.array(token_ids)
+         token_ids = [ str(word_idx) for word_idx in token_ids ]
+
+         return token_ids
+
+         # token_ids[token_ids > self.vocab_size] = 0
+         # return token_ids
+
+     def _process_text_svm(self, input_text):
+         text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
+         tokens = self.model.vectorizer.transform([text]).toarray()
+         # tokens = nltk.word_tokenize(text)
+         # token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
+         # # token_ids = np.array(token_ids)
+         # token_ids = [ str(word_idx) for word_idx in token_ids ]
+
+         return tokens
+
+     def predict_maxent(self, input_text):
+
+         text_ids = self._process_text_maxent(input_text)
+
+         prob = self.model.calculate_probability(text_ids)
+         prob.sort(reverse=True)
+         # print(label, prob)
+         print(prob)
+         ##### Calculate whether the prediction is correct #####
+         maxx_prob_idx = int(prob[0][1])
+
+         # data = torch.from_numpy(text_ids).long()  # .cuda()
+         # data = data.unsqueeze(0)
+
+
+         # target = torch.zeros((data.size(0), ), dtype=torch.float)
+
+         # # print(f"data: {data.shape}, target: {target.shape}")
+
+         # with torch.no_grad():
+         #     loss, predicted_labels = self.model(data, target)
+         #     predicted_labels = predicted_labels >= 0
+
+         if maxx_prob_idx == 2:
+             return "Positive"
+         else:
+             return "Negative"
+
+     def predict_svm(self, input_text):
+
+         text_ids = self._process_text_svm(input_text)
+
+         predicted_label = self.model.svm_model.predict(text_ids)
+
+         if float(predicted_label[0]) > 0:
+             return "Positive"
+         else:
+             return "Negative"
+
+         # prob = self.model.calculate_probability(text_ids)
+         # prob.sort(reverse=True)
+         # # print(label, prob)
+         # print(prob)
+         # ##### Calculate whether the prediction is correct #####
+         # maxx_prob_idx = int(prob[0][1])
+
+         # # data = torch.from_numpy(text_ids).long()  # .cuda()
+         # # data = data.unsqueeze(0)
+
+
+         # # target = torch.zeros((data.size(0), ), dtype=torch.float)
+
+         # # # print(f"data: {data.shape}, target: {target.shape}")
+
+         # # with torch.no_grad():
+         # #     loss, predicted_labels = self.model(data, target)
+         # #     predicted_labels = predicted_labels >= 0
+
+         # if maxx_prob_idx == 2:
+         #     return "Positive"
+         # else:
+         #     return "Negative"
+
+
+     def predict(self, input_text):
+
+         text_ids = self._process_text(input_text)
+
+         data = torch.from_numpy(text_ids).long()  # .cuda()
+         data = data.unsqueeze(0)
+
+
+         target = torch.zeros((data.size(0), ), dtype=torch.float)
+
+         # print(f"data: {data.shape}, target: {target.shape}")
+
+         with torch.no_grad():
+             loss, predicted_labels = self.model(data, target)
+             predicted_labels = predicted_labels >= 0
+
+         if predicted_labels.item():
+             return "Positive"
+         else:
+             return "Negative"
+
+         # return predicted_labels.item()
+
+
+     def _test(self, ):
+         self.model.eval()
+
+         nn_acc = 0
+         loss = 0
+
+         nn_total = 0
+
+         test_permutation_idxes = np.random.permutation(self.y_test.shape[0])
+         for i in tqdm(range(0, len(self.y_test), self.batch_size)):
+             batched_x = self.x_test[test_permutation_idxes[i: i + self.batch_size]]
+             batched_y = self.y_test[test_permutation_idxes[i: i + self.batch_size]]
+
+             data = torch.from_numpy(batched_x).long().cuda()
+             target = torch.from_numpy(batched_y).float().cuda()
+
+             with torch.no_grad():
+                 loss, predicted_labels = self.model(data, target)
+
+             predicted_labels = predicted_labels >= 0
+             gts = target >= 0.5
+             acc = torch.sum((predicted_labels == gts).float()).item()
+
+             nn_acc += acc
+             nn_total += len(batched_y)
+
+         acc = float(nn_acc) / float(nn_total)
+         print(f"Test Acc: {acc}")
+
+ if __name__=='__main__':
+
+     vocab_size = 8000
+     sequence_len = 150
+
+     # batch_size = 1024
+     batch_size = 256
+     nn_epochs = 20
+     model_type = "lstm"
+
+     model_type = "bilstm"
+
+     trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
+     trainer._train()
+
+     # CUDA_VISIBLE_DEVICES=0 python trainer.py
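For reference, a hedged standalone sketch (not part of the commit) of the tokenisation that `Trainer._process_text` applies before the LSTM sees a review. It assumes `yelp_dictionary.npy` from this repo is available and that NLTK's 'punkt' tokenizer data has been downloaded; unlike the repo code, it crops long reviews from the start rather than from a random window.

```python
import re
import numpy as np
import nltk

vocab_size, sequence_len = 8000, 150
id_to_word = np.load("yelp_dictionary.npy", allow_pickle=True)
word_to_id = {w: i for i, w in enumerate(id_to_word)}

# Strip everything except letters, spaces, and apostrophes, then tokenize.
text = re.sub("[^a-zA-Z ']", "", "Great place, would visit again!").lower()
token_ids = np.array([word_to_id.get(t, -1) + 1 for t in nltk.word_tokenize(text)])
token_ids[token_ids > vocab_size] = 0                    # out-of-vocabulary ids map to 0
if token_ids.shape[0] > sequence_len:                    # crop long reviews
    token_ids = token_ids[:sequence_len]
else:                                                    # zero-pad short ones to sequence_len
    token_ids = np.concatenate([token_ids, np.zeros(sequence_len - token_ids.shape[0])])
print(token_ids.shape)  # (150,)
```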