meow committed
Commit ab2adfb • 1 Parent(s): 85a9328
Add application file
- .gitignore +146 -0
- README.md +1 -12
- app.py +103 -0
- app_local.py +125 -0
- lstm_model_new.py +193 -0
- max_ent_model.py +139 -0
- pre-requirements.txt +4 -0
- requirements.txt +27 -0
- svm_model.py +210 -0
- trainer.py +358 -0
.gitignore
ADDED
@@ -0,0 +1,146 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

*.csv

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

*.npy
*.pth

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# /root/diffsim/Yelp-Review-Sentiment-Analysis/yelp_review_polarity_csv
./yelp_review_polarity_csv/*
# /root/diffsim/Yelp-Review-Sentiment-Analysis/preprocessed_data
./preprocessed_data/*

*/*.npy
*/*.csv
*/*.zip
*/*.txt

*/*.model

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
README.md
CHANGED
@@ -1,12 +1 @@
- ---
- title: Text Classification
- emoji: 📉
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 4.32.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Text Classification
app.py
ADDED
@@ -0,0 +1,103 @@
import numpy as np

import gradio as gr

import os
import tempfile
import shutil

from trainer import Trainer


def predict(input_text):
    predicted_label = trainer.predict(input_text)
    return str(predicted_label)
    # pass


def predict_maxent(input_text):
    predicted_label = trainer_maxent.predict_maxent(input_text)
    return str(predicted_label)
    # pass


def predict_svm(input_text):
    predicted_label = trainer_svm.predict_svm(input_text)
    return str(predicted_label)
    # pass


def create_demo():

    USAGE = """## Text Classification

    """

    with gr.Blocks() as demo:

        gr.Markdown(USAGE)

        # demo =
        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     gr.File(type="filepath"),
        #     gr.File(type="filepath"),
        #     cache_examples=False
        # )

        # input_file = gr.File(type="filepath")
        # output_file = gr.File(type="filepath")

        gr.Interface(fn=predict, inputs="textbox", outputs="textbox")

        gr.Interface(fn=predict_maxent, inputs="textbox", outputs="textbox")

        gr.Interface(fn=predict_svm, inputs="textbox", outputs="textbox")

        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     input_file,
        #     output_file,
        #     cache_examples=False
        # )

        # inputs = input_file
        # outputs = output_file
        # gr.Examples(
        #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
        #     inputs=inputs,
        #     fn=predict,
        #     outputs=outputs,
        # )

    return demo


if __name__ == "__main__":

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"
    # model_type = "bilstm"

    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "max_ent"
    trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "svm"
    trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    demo = create_demo()
    demo.launch()
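A quick illustration, separate from the committed file: the Gradio handlers above only forward a textbox string to a Trainer object, so the same path can be exercised without the UI, assuming the checkpoints (lstm.pth, max_ent_ckpt.npy, svm_weights.npy) and yelp_dictionary.npy are present where trainer.py expects them:

    # Sketch only; the review text below is made up.
    from trainer import Trainer

    lstm_trainer = Trainer(vocab_size=8000, sequence_len=150, batch_size=256,
                           nn_epochs=20, model_type="lstm")
    print(lstm_trainer.predict("the food was great and the staff were friendly"))
    # -> "Positive" or "Negative"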
app_local.py
ADDED
@@ -0,0 +1,125 @@
import numpy as np

# import gradio as gr

import os
import tempfile
import shutil

from trainer import Trainer


def predict(input_text, model_type):
    if model_type in ['lstm', 'bilstm']:
        predicted_label = trainer.predict(input_text)
    elif model_type == 'max_ent':
        predicted_label = trainer.predict_maxent(input_text)
    elif model_type == 'svm':
        predicted_label = trainer.predict_svm(input_text)

    return str(predicted_label)
    # pass


def predict_omni(input_text, model_type):
    predicted_label_net = trainer.predict(input_text)
    predicted_label_maxent = trainer_maxent.predict_maxent(input_text)
    predicted_label_svm = trainer_svm.predict_svm(input_text)
    # if model_type in ['lstm', 'bilstm']:
    #     predicted_label = trainer.predict(input_text)
    # elif model_type == 'max_ent':
    #     predicted_label = trainer.predict_maxent(input_text)
    # elif model_type == 'svm':
    #     predicted_label = trainer.predict_svm(input_text)
    predicted_text = f"LSTM: {predicted_label_net}, Max Ent: {predicted_label_maxent}, SVM: {predicted_label_svm}"
    return predicted_text
    # pass


# NOTE: create_demo() is unused in this local script; as written it still refers
# to the gradio module (import commented out above) and to an undefined `greet`.
def create_demo():

    USAGE = """## Text Classification

    """

    with gr.Blocks() as demo:

        gr.Markdown(USAGE)

        # demo =
        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     gr.File(type="filepath"),
        #     gr.File(type="filepath"),
        #     cache_examples=False
        # )

        input_file = gr.File(type="filepath")
        output_file = gr.File(type="filepath")

        gr.Interface(fn=greet, inputs="textbox", outputs="textbox")

        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     input_file,
        #     output_file,
        #     cache_examples=False
        # )

        # inputs = input_file
        # outputs = output_file
        # gr.Examples(
        #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
        #     inputs=inputs,
        #     fn=predict,
        #     outputs=outputs,
        # )

    return demo


if __name__ == "__main__":

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"
    # model_type = "bilstm"
    # model_type = "max_ent"

    # trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    # print(f"Trainer loaded")

    model_type = "lstm"
    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "max_ent"
    trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "svm"
    trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    while True:
        input_text = input()
        # if model_type in ["lstm", "bilstm"]:
        #     label = predict(input_text, model_type)
        label = predict_omni(input_text, model_type)
        # elif model_type in ["max_ent"]:
        #     label =
        print(label)

    # demo = create_demo()
    # demo.launch()

# python app_local.py
lstm_model_new.py
ADDED
@@ -0,0 +1,193 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.distributed as dist

import math


class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.x2h = nn.Linear(input_size, 4 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, hidden):

        hx, cx = hidden

        x = x.view(-1, x.size(1))

        gates = self.x2h(x) + self.h2h(hx)

        # print(f"gates: {gates.shape}")
        # gates = gates.squeeze()

        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.tanh(cellgate)
        outgate = F.sigmoid(outgate)

        cy = torch.mul(cx, forgetgate) + torch.mul(ingate, cellgate)

        hy = torch.mul(outgate, F.tanh(cy))

        return (hy, cy)


class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        self.lstm = LSTMCell(input_dim, hidden_dim, layer_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):

        # Initialize hidden state with zeros
        #######################
        #  USE GPU FOR MODEL  #
        #######################
        # print(x.shape,"x.shape")100, 28, 28
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))

        # Initialize cell state
        if torch.cuda.is_available():
            c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))

        outs = []

        cn = c0[0, :, :]
        hn = h0[0, :, :]

        for seq in range(x.size(1)):
            hn, cn = self.lstm(x[:, seq, :], (hn, cn))
            outs.append(hn)

        out = outs[-1]  # .squeeze()

        out = self.fc(out)
        # out.size() --> 100, 10
        return out


class LSTM_model(nn.Module):
    def __init__(self, vocab_size, n_hidden):
        super(LSTM_model, self).__init__()

        self.embedding = nn.Embedding(vocab_size, n_hidden)

        self.lstm = LSTMModel(n_hidden, n_hidden, n_hidden, n_hidden)
        self.fc_output = nn.Linear(n_hidden, 1)

        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X, t, train=True):

        embed = self.embedding(X)  # batch_size, time_steps, features
        no_of_timesteps = embed.shape[1]
        n_hidden = embed.shape[2]

        input = embed

        # print(f"input: {input.shape}")

        fc_out = self.lstm(input)  ## bsz x nnhidden_dim

        # print(f"fc_out: {fc_out.size()}")
        h = self.fc_output(fc_out)
        # print(f"h: {h.size()}")

        return self.loss(h[:, 0], t), h[:, 0]


class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(BiLSTM, self).__init__()
        self.forward_cell = LSTMCell(input_size, hidden_size, bias)
        self.backward_cell = LSTMCell(input_size, hidden_size, bias)

    def forward(self, input_seq):
        forward_outputs = []
        backward_outputs = []

        forward_hidden = (torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device),
                          torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device))
        backward_hidden = (torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device),
                           torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device))

        for t in range(input_seq.size(1)):
            forward_hidden = self.forward_cell(input_seq[:, t], forward_hidden)
            forward_outputs.append(forward_hidden[0])

        for t in range(input_seq.size(1) - 1, -1, -1):
            backward_hidden = self.backward_cell(input_seq[:, t], backward_hidden)
            backward_outputs.append(backward_hidden[0])

        forward_outputs = torch.stack(forward_outputs, dim=1)
        backward_outputs = torch.stack(backward_outputs, dim=1)

        outputs = torch.cat((forward_outputs, backward_outputs), dim=2)

        return outputs


class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, n_hidden):
        super(BiLSTMModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, n_hidden)
        self.bilstm = BiLSTM(n_hidden, n_hidden)
        self.fc_output = nn.Linear(2 * n_hidden, 1)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X, t, train=True):
        embed = self.embedding(X)  # batch_size, time_steps, features
        no_of_timesteps = embed.shape[1]
        n_hidden = embed.shape[2]

        input = embed
        bilstm_out = self.bilstm(input)  ## bsz x nnhidden_dim
        bilstm_out = bilstm_out[:, -1, :]
        h = self.fc_output(bilstm_out)
        # print(f"bilstm_out: {bilstm_out.shape}, h: {h.shape}, t: {t.shape}")
        return self.loss(h[:, 0], t), h[:, 0]
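As a shape sketch (not in the repo): LSTMCell computes all four gates with one fused linear map and splits them with chunk(4, 1), so a single step maps a (batch, input_size) input and a pair of (batch, hidden_size) states to a new pair of states. With illustrative dimensions:

    import torch
    from lstm_model_new import LSTMCell

    batch, input_size, hidden_size = 4, 32, 64   # made-up sizes
    cell = LSTMCell(input_size, hidden_size)
    x = torch.randn(batch, input_size)
    h0 = torch.zeros(batch, hidden_size)
    c0 = torch.zeros(batch, hidden_size)
    h1, c1 = cell(x, (h0, c0))
    print(h1.shape, c1.shape)   # torch.Size([4, 64]) torch.Size([4, 64])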
max_ent_model.py
ADDED
@@ -0,0 +1,139 @@
import os

import math

import numpy as np  # added: save_ckpt() below uses np.save


class MaxEntropyModel:

    def __init__(self, ):
        self.train_set = []
        self.features = {}
        self.labels = {}
        self.labels = {
            '1': 1, '2': 1
        }

    def load_data(self, fn):
        with open(fn, "r") as rf:
            for line in rf:
                label, review = line.strip().split(',')
                label = label[1: -1]
                review = review.split(' ')
                fields = [str(int(label))] + review
                if review != '':
                    label = str(int(label))
                    self.labels[label] = 1
                    for s in set(fields[1:]):
                        if (label, s) not in self.features:
                            self.features[(label, s)] = 1
                        else:
                            self.features[(label, s)] += 1
                    self.train_set.append(fields)
        rf.close()

    def initialize_parameters(self, ):
        self.train_set_size = len(self.train_set)
        self.M = max([len(record) - 1 for record in self.train_set])
        self.ep = [0.0 for _ in range(len(self.features))]

        for i_f, feat in enumerate(self.features):
            self.ep[i_f] = float(self.features[feat]) / float(self.train_set_size)
            self.features[feat] = i_f

        self.weights = [0.0 for _ in range(len(self.features))]
        self.last_weights = self.weights

    def get_prob_weight(self, features, label):
        weight = 0.0
        for feat in features:
            # print(label, feat)
            if (label, feat) in self.features:
                weight += self.weights[self.features[(label, feat)]]
        prob_weight = math.exp(weight)
        # print(f"label: {label}, prob_weight: {prob_weight}")
        return prob_weight

    def get_expected_features(self, ):
        expected_features = [0.0 for _ in range(len(self.features))]
        for record in self.train_set:
            features = record[1:]
            prob = self.calculate_probability(features)
            for feat in features:
                for w, l in prob:
                    if (l, feat) in self.features:
                        idx = self.features[(l, feat)]
                        expected_features[idx] += w * (1.0 / self.train_set_size)
        return expected_features

    def calculate_probability(self, features):
        weights = [(self.get_prob_weight(features, l), l) for l in self.labels]
        tot_weights = [w for w, l in weights]

        Z = sum(tot_weights)

        prob = [(w / Z, l) for w, l in weights]
        return prob

    def train(self, max_iter=10000):
        self.initialize_parameters()
        for i in range(max_iter):
            print(f"[Training] iter {i + 1} ...")
            self.new_ep = self.get_expected_features()
            self.last_weights = self.weights[:]
            for i, w in enumerate(self.weights):
                delta = 1.0 / self.M * math.log(self.ep[i] / self.new_ep[i])
                self.weights[i] = self.weights[i] + delta
            if i % 10 == 0:
                test_data_path = "../preprocessed_data/yelp_test.txt"
                print(f"Start testing...")
                self.test(test_data_path)

    def test(self, test_data_path):
        f = open(file=test_data_path)
        tot_test_nn = 0
        correct_test_nn = 0
        for line in f:
            label, review = line.strip().split(',')
            label = label[1: -1]
            review = review.split(' ')

            # fields = [str(int(label))] + review  ## get split review ## #

            # input text: review #
            # output: label #
            # review #

            prob = self.calculate_probability(review)
            prob.sort(reverse=True)
            print(label, prob)

            ##### Calculate whether the prediction is correct #####
            maxx_prob_idx = int(prob[0][1])
            label_idx = int(label)
            if maxx_prob_idx == label_idx:
                correct_test_nn += 1
            tot_test_nn += 1
            ##### Calculate whether the prediction is correct #####

        f.close()
        acc = float(correct_test_nn) / float(tot_test_nn)
        print(f"[Test] Acc: {acc}")

    def save_ckpt(self, sv_ckpt_path):
        sv_features = self.features
        sv_weights = self.last_weights
        sv_ckpt = {
            'features': sv_features,
            'weights': sv_weights
        }
        np.save(sv_ckpt_path, sv_ckpt)
        print(f"ckpt with features and weights saved to {sv_ckpt_path}")
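The train() loop above is Generalized Iterative Scaling: each weight moves by delta = (1/M) * log(empirical_expectation / model_expectation). A worked example with made-up numbers:

    import math

    M = 50             # largest number of active features in any training record
    empirical = 0.012  # self.ep[i]: frequency of feature i in the data
    expected = 0.008   # self.new_ep[i]: expectation of feature i under the current model
    delta = (1.0 / M) * math.log(empirical / expected)
    print(delta)       # ~0.0081, the increment added to self.weights[i]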
pre-requirements.txt
ADDED
@@ -0,0 +1,4 @@
# pip==23.3.2
# torch==2.2.0
-i https://download.pytorch.org/whl/cpu
torch==2.2.0
requirements.txt
ADDED
@@ -0,0 +1,27 @@
# -f https://download.pytorch.org/whl/cpu/torch_stable.html
# -f https://data.pyg.org/whl/torch-2.2.0%2Bcpu.html
# -i https://download.pytorch.org/whl/cpu
# pip==20.2.4
# torch==2.2.0
# torchvision==0.13.1
# torchaudio==0.12.1

tqdm
nltk
scikit-learn
scipy

# blobfile==2.0.1
# manopth @ git+https://github.com/hassony2/manopth.git
# numpy==1.23.1
# psutil==5.9.2
# scikit-learn
# scipy==1.9.3
# tensorboard
# tensorboardx
# tqdm
# trimesh
# clip
# chumpy
# opencv-python
svm_model.py
ADDED
@@ -0,0 +1,210 @@
import numpy as np
import re
import time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
import ssl
import os
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# print(f"nltk version: {nltk.__version__}")
# nltk.download('stopwords')


class SVMModel:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)  # Convert labels to -1 and 1

        print(f"y_ max: {np.max(y_)}, y_ min: {np.min(y_)}")

        self.w = np.zeros(n_features)
        self.b = 0

        self.lambda_param = 1.0 / float(n_samples)

        for _ in range(self.n_iters):
            print(f"Epoch: {_}")
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b = self.b - self.learning_rate * y_[idx]
            if _ % 1 == 0:
                # print(f"Iteration: {_}")
                st_time = time.time()
                self.test()
                print(f"Time: {time.time() - st_time}")

    def predict(self, X):

        linear_output = np.matmul(X, self.w[:, None]) - self.b  # []
        return np.sign(linear_output[:, 0])

    def test(self, ):
        # test_ours(self, ):
        linear_output = self.predict(self.X_test)
        print(f"linear_output: {linear_output.shape}, self.X_test: {self.X_test.shape}")
        acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
        print(f"Test Acc: {acc}")
        return linear_output

    # weights_dict = self.svm_model.get_weights_dict()
    def get_weights_dict(self, ):
        weights_dict = {
            'w': self.w,
            'b': self.b
        }
        return weights_dict


class SVM:
    def __init__(self, ):
        # file_path =
        self.x_train = []
        self.y_train = []
        self.x_test = []
        self.y_test = []

        self.data_folder = '.'

        print(f"Start loading data")
        self._load_data()

        print(f"Setting vectorizer")
        self.vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

        print(f"Start preprocessing data")
        self._preprocess_data()

        # self.setup_model()
        self.setup_model_ours()

        pass

    def _load_data(self, ):

        file_path = '.'
        x_train = []
        y_train = []
        with open(os.path.join(self.data_folder, 'train.csv'), "r") as f:
            for line in f:
                l = line.strip().split(',')
                senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
                x_train.append(text)
                y_train.append(int(senti[1]) - 1)
        f.close()

        x_test = []
        y_test = []
        with open(os.path.join(self.data_folder, 'test.csv'), "r") as f:
            for line in f:
                l = line.strip().split(',')
                senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
                x_test.append(text)
                y_test.append(int(senti[1]) - 1)
        f.close()
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = np.array(y_train, dtype=np.int32)
        self.y_test = np.array(y_test, dtype=np.int32)
        print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")

    def _preprocess_data(self, ):
        self.X_train = self.vectorizer.fit_transform(self.x_train).toarray()
        self.X_test = self.vectorizer.transform(self.x_test).toarray()

    def setup_model_ours(self, ):
        self.svm_model = SVMModel()

    def train_ours(self, ):
        self.y_train = self.y_train.astype(np.float32)
        self.y_test = self.y_test.astype(np.float32)
        self.y_train = self.y_train * 2 - 1.0
        self.y_test = self.y_test * 2 - 1.0

        print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")

        self.svm_model.X_train = self.X_train
        self.svm_model.X_test = self.X_test
        self.svm_model.y_train = self.y_train
        self.svm_model.y_test = self.y_test

        self.svm_model.fit(self.X_train, self.y_train)

    def test_ours(self, ):
        linear_output = self.svm_model.test()
        acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
        print(f"Test Acc: {acc}")

        weights_dict = self.svm_model.get_weights_dict()
        np.save("svm_weights.npy", weights_dict)
        print(f"svm weights saved to svm_weights.npy")

    # def setup_model(self, ):
    #     self.svc = SVC()

    # def train(self, ):
    #     self.svc.fit(self.X_train, self.y_train)

    # def test(self, ):
    #     self.train_acc = self.svc.score(self.X_train, self.y_train)
    #     self.test_acc = self.svc.score(self.X_test, self.y_test)

    #     print(f'Train Acc: {self.train_acc * 100}\n', f'Test Acc: {self.test_acc * 100}\n')


# CUDA_VISIBLE_DEVICES=2 python log_reg.py

# y_train = np.asarray(y_train)
# y_test = np.asarray(y_test)

# print(f"After getting data")

# start_time = time.time()
# vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

# print(f"After setting the vectorizer")
# X_train = vectorizer.fit_transform(x_train).toarray()
# X_test = vectorizer.transform(x_test).toarray()

# print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")

# # lr_classfier = LogisticRegression()
# # lr_classfier.fit(X_train,y_train)
# # train_acc = lr_classfier.score(X_train,y_train)
# # test_acc = lr_classfier.score(X_test,y_test)

# svc = SVC()
# svc.fit(X_train,y_train)
# train_acc = svc.score(X_train,y_train)
# test_acc = svc.score(X_test,y_test)

# print('Train Acc: %.2f' % float(train_acc*100), 'Test Acc: %.2f' % float(test_acc*100),'Time: %.4f' % float(time.time()-start_time))
# # CUDA_VISIBLE_DEVICES=2 python log_reg.py
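SVMModel.fit above is plain sub-gradient descent on the soft-margin hinge loss: if a sample already satisfies the margin, only the L2 regularizer shrinks w; otherwise w and b also move along that sample. The per-sample rule in isolation, with toy numbers:

    import numpy as np

    lr, lam = 0.01, 0.1                       # toy learning rate and regularizer
    w, b = np.array([0.2, -0.1]), 0.0
    x_i, y_i = np.array([1.0, 2.0]), 1.0      # one (feature vector, label in {-1, +1}) pair

    if y_i * (np.dot(x_i, w) - b) >= 1:       # margin satisfied: regularize only
        w = w - lr * (2 * lam * w)
    else:                                     # margin violated: regularize and move toward x_i
        w = w - lr * (2 * lam * w - y_i * x_i)
        b = b - lr * y_i
    print(w, b)                               # [0.2096 -0.0798] -0.01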
trainer.py
ADDED
@@ -0,0 +1,358 @@
import numpy as np
import torch
import torch.nn as nn
# import torch.nn.functional as F
import torch.optim as optim
# from torch.autograd import Variable
# import torch.distributed as dist

# import time
import os
import re
# import sys
# import io

from tqdm import tqdm
import nltk

from lstm_model_new import LSTM_model, BiLSTMModel
from max_ent_model import MaxEntropyModel
from svm_model import SVM


class Trainer(object):
    def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):

        # vocab_size = 8000
        # sequence_len = 150

        self.vocab_size = vocab_size
        self.vocab_sizeb = self.vocab_size + 1

        self.sequence_len = sequence_len
        self.model_type = model_type

        self.batch_size = batch_size
        self.nn_epochs = nn_epochs

        self.processed_data_folder = "../preprocessed_data/"

        self._load_data()

        self._get_model()

        # self._setup_optimizer()

        pass

    def _load_data(self, ):

        dict_fn = "yelp_dictionary.npy"

        id_to_word = np.load(dict_fn, allow_pickle=True)  # .item()

        print(type(id_to_word))
        print(id_to_word[0], len(id_to_word))

        word_to_id = {
            id_to_word[idx]: idx for idx in range(len(id_to_word))
        }

        # word_to_id = {v: k for k, v in id_to_word.items()}
        self.word_to_id = word_to_id

        # x_train = np.load('../preprocessed_data/x_train.npy')
        # y_train = np.load('../preprocessed_data/y_train.npy')

        # #x_train = x_train[:10000]
        # #y_train = y_train[:10000]
        # x_test = np.load('../preprocessed_data/x_test.npy')
        # y_test = np.load('../preprocessed_data/y_test.npy')

        # x_train_path = os.path.join(self.processed_data_folder, "x_train.npy")
        # y_train_path = os.path.join(self.processed_data_folder, "y_train.npy")
        # x_test_path = os.path.join(self.processed_data_folder, "x_test.npy")
        # y_test_path = os.path.join(self.processed_data_folder, "y_test.npy")

        # x_train = np.load(x_train_path)
        # y_train = np.load(y_train_path)
        # x_test = np.load(x_test_path)
        # y_test = np.load(y_test_path)
        # self.x_train = x_train
        # self.y_train = y_train
        # self.x_test = x_test
        # self.y_test = y_test

    def _get_model(self, ):
        if self.model_type == "lstm":
            self.model = LSTM_model(self.vocab_sizeb, 800)
        elif self.model_type == "bilstm":
            self.model = BiLSTMModel(self.vocab_sizeb, 800)
        elif self.model_type == "max_ent":
            self.model = MaxEntropyModel()
        elif self.model_type == "svm":
            self.model = SVM()
        else:
            raise ValueError("Model type not supported")

        # self.model.cuda()

        if self.model_type in ['lstm', 'bilstm']:
            # self.model = self.model.cuda()

            model_ckpt_fn = f"{self.model_type}.pth"
            self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=torch.device('cpu')))
        elif self.model_type in ['max_ent']:
            model_ckpt_fn = f"{self.model_type}_ckpt.npy"  # max_ent #
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            features = model_params["features"]
            weights = model_params["weights"]

            self.model.weights = weights  # .tolist()
            # print(f"self.model.weights: {self.model.weights[:10]}")
            self.model.last_weights = weights  # .tolist()

            self.model.features = features
            # print(f"self.model.features: {list(self.model.features.keys())[:10]}")

        elif self.model_type in ['svm']:
            model_ckpt_fn = f"{self.model_type}_weights.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            w = model_params['w']
            b = model_params['b']
            self.model.svm_model.w = w
            self.model.svm_model.b = b

        else:
            raise ValueError("Model type not supported")

    def _setup_optimizer(self, ):
        self.lr = 0.001
        self.opt = optim.Adam(self.model.parameters(), lr=self.lr)

    def _train(self, ):
        train_losses = []
        train_accs = []
        test_accs = [0.0]

        for epoch in range(self.nn_epochs):
            print(f"Epoch: {epoch}")
            self.model.train()

            nn_acc = 0
            nn_total = 0
            epoch_loss = 0.0

            train_permutation_idxes = np.random.permutation(self.y_train.shape[0])

            for i in tqdm(range(0, len(self.y_train), self.batch_size)):
                batched_x = self.x_train[train_permutation_idxes[i: i + self.batch_size]]
                batched_y = self.y_train[train_permutation_idxes[i: i + self.batch_size]]

                data = torch.from_numpy(batched_x).long().cuda()
                target = torch.from_numpy(batched_y).float().cuda()

                self.opt.zero_grad()
                loss, predicted_labels = self.model(data, target)
                loss.backward()

                norm = nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
                self.opt.step()

                predicted_labels = predicted_labels >= 0
                gts = target >= 0.5
                acc = torch.sum((predicted_labels == gts).float()).item()

                nn_acc += acc
                epoch_loss += loss.item()
                nn_total += len(batched_y)

            train_acc = float(nn_acc) / float(nn_total)
            train_loss = epoch_loss / float(self.batch_size)

            train_losses.append(train_loss)
            train_accs.append(train_acc)

            print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")

            self._test()

    def _process_text(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = np.array(token_ids)

        token_ids[token_ids > self.vocab_size] = 0
        if token_ids.shape[0] > self.sequence_len:
            start_index = np.random.randint(token_ids.shape[0] - self.sequence_len + 1)
            token_ids = token_ids[start_index: (start_index + self.sequence_len)]
        else:
            token_ids = np.concatenate([token_ids, np.zeros(self.sequence_len - token_ids.shape[0])])
        return token_ids

    def _process_text_maxent(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        # token_ids = np.array(token_ids)
        token_ids = [str(word_idx) for word_idx in token_ids]

        return token_ids

        # token_ids[token_ids > self.vocab_size] = 0
        # return token_ids

    def _process_text_svm(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = self.model.vectorizer.transform([text]).toarray()
        # tokens = nltk.word_tokenize(text)
        # token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
        # # token_ids = np.array(token_ids)
        # token_ids = [ str(word_idx) for word_idx in token_ids ]

        return tokens

    def predict_maxent(self, input_text):

        text_ids = self._process_text_maxent(input_text)

        prob = self.model.calculate_probability(text_ids)
        prob.sort(reverse=True)
        # print(label, prob)
        print(prob)
        ##### Calculate whether the prediction is correct #####
        maxx_prob_idx = int(prob[0][1])

        # data = torch.from_numpy(text_ids).long()  # .cuda()
        # data = data.unsqueeze(0)

        # target = torch.zeros((data.size(0), ), dtype=torch.float)

        # # print(f"data: {data.shape}, target: {target.shape}")

        # with torch.no_grad():
        #     loss, predicted_labels = self.model(data, target)
        #     predicted_labels = predicted_labels >= 0

        if maxx_prob_idx == 2:
            return "Positive"
        else:
            return "Negative"

    def predict_svm(self, input_text):

        text_ids = self._process_text_svm(input_text)

        predicted_label = self.model.svm_model.predict(text_ids)

        if float(predicted_label[0]) > 0:
            return "Positive"
        else:
            return "Negative"

        # prob = self.model.calculate_probability(text_ids)
        # prob.sort(reverse=True)
        # # print(label, prob)
        # print(prob)
        # ##### Calculate whether the prediction is correct #####
        # maxx_prob_idx = int(prob[0][1])

        # # data = torch.from_numpy(text_ids).long()  # .cuda()
        # # data = data.unsqueeze(0)

        # # target = torch.zeros((data.size(0), ), dtype=torch.float)

        # # # print(f"data: {data.shape}, target: {target.shape}")

        # # with torch.no_grad():
        # #     loss, predicted_labels = self.model(data, target)
        # #     predicted_labels = predicted_labels >= 0

        # if maxx_prob_idx == 2:
        #     return "Positive"
        # else:
        #     return "Negative"

    def predict(self, input_text):

        text_ids = self._process_text(input_text)

        data = torch.from_numpy(text_ids).long()  # .cuda()
        data = data.unsqueeze(0)

        target = torch.zeros((data.size(0), ), dtype=torch.float)

        # print(f"data: {data.shape}, target: {target.shape}")

        with torch.no_grad():
            loss, predicted_labels = self.model(data, target)
            predicted_labels = predicted_labels >= 0

        if predicted_labels.item():
            return "Positive"
        else:
            return "Negative"

        # return predicted_labels.item()

    def _test(self, ):
        self.model.eval()

        nn_acc = 0
        loss = 0

        nn_total = 0

        test_permutation_idxes = np.random.permutation(self.y_test.shape[0])
        for i in tqdm(range(0, len(self.y_test), self.batch_size)):
            batched_x = self.x_test[test_permutation_idxes[i: i + self.batch_size]]
            batched_y = self.y_test[test_permutation_idxes[i: i + self.batch_size]]

            data = torch.from_numpy(batched_x).long().cuda()
            target = torch.from_numpy(batched_y).float().cuda()

            with torch.no_grad():
                loss, predicted_labels = self.model(data, target)

            predicted_labels = predicted_labels >= 0
            gts = target >= 0.5
            acc = torch.sum((predicted_labels == gts).float()).item()

            nn_acc += acc
            nn_total += len(batched_y)

        acc = float(nn_acc) / float(nn_total)
        print(f"Test Acc: {acc}")


if __name__ == '__main__':

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"

    model_type = "bilstm"

    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    trainer._train()

# CUDA_VISIBLE_DEVICES=0 python trainer.py
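Trainer is the glue: it rebuilds word_to_id from yelp_dictionary.npy, loads the checkpoint matching model_type, and exposes predict / predict_maxent / predict_svm for a single review string. The LSTM preprocessing in _process_text maps tokens to ids, zeroes out-of-vocabulary ids, and pads or crops to sequence_len; sketched below with a tiny stand-in vocabulary (not the real dictionary):

    import numpy as np

    word_to_id = {"good": 10, "food": 25}     # stand-in for the yelp_dictionary mapping
    vocab_size, sequence_len = 8000, 150

    tokens = ["good", "food", "unknownword"]
    token_ids = np.array([word_to_id.get(t, -1) + 1 for t in tokens])   # OOV -> 0
    token_ids[token_ids > vocab_size] = 0
    if token_ids.shape[0] < sequence_len:     # pad with zeros up to sequence_len
        token_ids = np.concatenate([token_ids, np.zeros(sequence_len - token_ids.shape[0])])
    print(token_ids[:5], token_ids.shape)     # [11. 26.  0.  0.  0.] (150,)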