meow committed
Commit ab2adfb • 1 Parent(s): 85a9328
Add application file
- .gitignore +146 -0
- README.md +1 -12
- app.py +103 -0
- app_local.py +125 -0
- lstm_model_new.py +193 -0
- max_ent_model.py +139 -0
- pre-requirements.txt +4 -0
- requirements.txt +27 -0
- svm_model.py +210 -0
- trainer.py +358 -0
.gitignore
ADDED
@@ -0,0 +1,146 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

*.csv

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

*.npy
*.pth

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# /root/diffsim/Yelp-Review-Sentiment-Analysis/yelp_review_polarity_csv
./yelp_review_polarity_csv/*
# /root/diffsim/Yelp-Review-Sentiment-Analysis/preprocessed_data
./preprocessed_data/*

*/*.npy
*/*.csv
*/*.zip
*/*.txt

*/*.model

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
README.md
CHANGED
@@ -1,12 +1 @@
- ---
- title: Text Classification
- emoji: 📉
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 4.32.2
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Text Classification
app.py
ADDED
@@ -0,0 +1,103 @@
import numpy as np

import gradio as gr

import os
import tempfile
import shutil

from trainer import Trainer


def predict(input_text):
    predicted_label = trainer.predict(input_text)
    return str(predicted_label)
    # pass


def predict_maxent(input_text):
    predicted_label = trainer_maxent.predict_maxent(input_text)
    return str(predicted_label)
    # pass


def predict_svm(input_text):
    predicted_label = trainer_svm.predict_svm(input_text)
    return str(predicted_label)
    # pass


def create_demo():

    USAGE = """## Text Classification

    """

    with gr.Blocks() as demo:

        gr.Markdown(USAGE)

        # demo =
        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     gr.File(type="filepath"),
        #     gr.File(type="filepath"),
        #     cache_examples=False
        # )

        # input_file = gr.File(type="filepath")
        # output_file = gr.File(type="filepath")

        gr.Interface(fn=predict, inputs="textbox", outputs="textbox")

        gr.Interface(fn=predict_maxent, inputs="textbox", outputs="textbox")

        gr.Interface(fn=predict_svm, inputs="textbox", outputs="textbox")

        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     input_file,
        #     output_file,
        #     cache_examples=False
        # )

        # inputs = input_file
        # outputs = output_file
        # gr.Examples(
        #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
        #     inputs=inputs,
        #     fn=predict,
        #     outputs=outputs,
        # )

    return demo


if __name__ == "__main__":

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"
    # model_type = "bilstm"

    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "max_ent"
    trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "svm"
    trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    demo = create_demo()
    demo.launch()
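A quick illustration, separate from the committed file: the Gradio handlers above only forward a textbox string to a Trainer object, so the same path can be exercised without the UI, assuming the checkpoints (lstm.pth, max_ent_ckpt.npy, svm_weights.npy) and yelp_dictionary.npy are present where trainer.py expects them:

    # Sketch only; the review text below is made up.
    from trainer import Trainer

    lstm_trainer = Trainer(vocab_size=8000, sequence_len=150, batch_size=256,
                           nn_epochs=20, model_type="lstm")
    print(lstm_trainer.predict("the food was great and the staff were friendly"))
    # -> "Positive" or "Negative"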
app_local.py
ADDED
@@ -0,0 +1,125 @@
import numpy as np

# import gradio as gr

import os
import tempfile
import shutil

from trainer import Trainer


def predict(input_text, model_type):
    if model_type in ['lstm', 'bilstm']:
        predicted_label = trainer.predict(input_text)
    elif model_type == 'max_ent':
        predicted_label = trainer.predict_maxent(input_text)
    elif model_type == 'svm':
        predicted_label = trainer.predict_svm(input_text)

    return str(predicted_label)
    # pass


def predict_omni(input_text, model_type):
    predicted_label_net = trainer.predict(input_text)
    predicted_label_maxent = trainer_maxent.predict_maxent(input_text)
    predicted_label_svm = trainer_svm.predict_svm(input_text)
    # if model_type in ['lstm', 'bilstm']:
    #     predicted_label = trainer.predict(input_text)
    # elif model_type == 'max_ent':
    #     predicted_label = trainer.predict_maxent(input_text)
    # elif model_type == 'svm':
    #     predicted_label = trainer.predict_svm(input_text)
    predicted_text = f"LSTM: {predicted_label_net}, Max Ent: {predicted_label_maxent}, SVM: {predicted_label_svm}"
    return predicted_text
    # pass


# NOTE: create_demo() is unused in this local script; as written it still refers
# to the gradio module (import commented out above) and to an undefined `greet`.
def create_demo():

    USAGE = """## Text Classification

    """

    with gr.Blocks() as demo:

        gr.Markdown(USAGE)

        # demo =
        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     gr.File(type="filepath"),
        #     gr.File(type="filepath"),
        #     cache_examples=False
        # )

        input_file = gr.File(type="filepath")
        output_file = gr.File(type="filepath")

        gr.Interface(fn=greet, inputs="textbox", outputs="textbox")

        # gr.Interface(
        #     predict,
        #     # gr.Dataframe(type="numpy", datatype="number", row_count=5, col_count=3),
        #     input_file,
        #     output_file,
        #     cache_examples=False
        # )

        # inputs = input_file
        # outputs = output_file
        # gr.Examples(
        #     examples=[os.path.join(os.path.dirname(__file__), "./gradio_inter/20231104_017.pkl")],
        #     inputs=inputs,
        #     fn=predict,
        #     outputs=outputs,
        # )

    return demo


if __name__ == "__main__":

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"
    # model_type = "bilstm"
    # model_type = "max_ent"

    # trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    # print(f"Trainer loaded")

    model_type = "lstm"
    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "max_ent"
    trainer_maxent = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    model_type = "svm"
    trainer_svm = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)

    while True:
        input_text = input()
        # if model_type in ["lstm", "bilstm"]:
        #     label = predict(input_text, model_type)
        label = predict_omni(input_text, model_type)
        # elif model_type in ["max_ent"]:
        #     label =
        print(label)

    # demo = create_demo()
    # demo.launch()

# python app_local.py
lstm_model_new.py
ADDED
@@ -0,0 +1,193 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.distributed as dist

import math


class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.x2h = nn.Linear(input_size, 4 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, hidden):

        hx, cx = hidden

        x = x.view(-1, x.size(1))

        gates = self.x2h(x) + self.h2h(hx)

        # print(f"gates: {gates.shape}")
        # gates = gates.squeeze()

        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.tanh(cellgate)
        outgate = F.sigmoid(outgate)

        cy = torch.mul(cx, forgetgate) + torch.mul(ingate, cellgate)

        hy = torch.mul(outgate, F.tanh(cy))

        return (hy, cy)


class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        self.lstm = LSTMCell(input_dim, hidden_dim, layer_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):

        # Initialize hidden state with zeros
        #######################
        #  USE GPU FOR MODEL  #
        #######################
        # print(x.shape,"x.shape")100, 28, 28
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))

        # Initialize cell state
        if torch.cuda.is_available():
            c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))

        outs = []

        cn = c0[0, :, :]
        hn = h0[0, :, :]

        for seq in range(x.size(1)):
            hn, cn = self.lstm(x[:, seq, :], (hn, cn))
            outs.append(hn)

        out = outs[-1]  # .squeeze()

        out = self.fc(out)
        # out.size() --> 100, 10
        return out


class LSTM_model(nn.Module):
    def __init__(self, vocab_size, n_hidden):
        super(LSTM_model, self).__init__()

        self.embedding = nn.Embedding(vocab_size, n_hidden)

        self.lstm = LSTMModel(n_hidden, n_hidden, n_hidden, n_hidden)
        self.fc_output = nn.Linear(n_hidden, 1)

        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X, t, train=True):

        embed = self.embedding(X)  # batch_size, time_steps, features
        no_of_timesteps = embed.shape[1]
        n_hidden = embed.shape[2]

        input = embed

        # print(f"input: {input.shape}")

        fc_out = self.lstm(input)  ## bsz x nnhidden_dim

        # print(f"fc_out: {fc_out.size()}")
        h = self.fc_output(fc_out)
        # print(f"h: {h.size()}")

        return self.loss(h[:, 0], t), h[:, 0]


class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(BiLSTM, self).__init__()
        self.forward_cell = LSTMCell(input_size, hidden_size, bias)
        self.backward_cell = LSTMCell(input_size, hidden_size, bias)

    def forward(self, input_seq):
        forward_outputs = []
        backward_outputs = []

        forward_hidden = (torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device),
                          torch.zeros(input_seq.size(0), self.forward_cell.hidden_size).to(input_seq.device))
        backward_hidden = (torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device),
                           torch.zeros(input_seq.size(0), self.backward_cell.hidden_size).to(input_seq.device))

        for t in range(input_seq.size(1)):
            forward_hidden = self.forward_cell(input_seq[:, t], forward_hidden)
            forward_outputs.append(forward_hidden[0])

        for t in range(input_seq.size(1) - 1, -1, -1):
            backward_hidden = self.backward_cell(input_seq[:, t], backward_hidden)
            backward_outputs.append(backward_hidden[0])

        forward_outputs = torch.stack(forward_outputs, dim=1)
        backward_outputs = torch.stack(backward_outputs, dim=1)

        outputs = torch.cat((forward_outputs, backward_outputs), dim=2)

        return outputs


class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, n_hidden):
        super(BiLSTMModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, n_hidden)
        self.bilstm = BiLSTM(n_hidden, n_hidden)
        self.fc_output = nn.Linear(2 * n_hidden, 1)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X, t, train=True):
        embed = self.embedding(X)  # batch_size, time_steps, features
        no_of_timesteps = embed.shape[1]
        n_hidden = embed.shape[2]

        input = embed
        bilstm_out = self.bilstm(input)  ## bsz x nnhidden_dim
        bilstm_out = bilstm_out[:, -1, :]
        h = self.fc_output(bilstm_out)
        # print(f"bilstm_out: {bilstm_out.shape}, h: {h.shape}, t: {t.shape}")
        return self.loss(h[:, 0], t), h[:, 0]
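As a shape sketch (not in the repo): LSTMCell computes all four gates with one fused linear map and splits them with chunk(4, 1), so a single step maps a (batch, input_size) input and a pair of (batch, hidden_size) states to a new pair of states. With illustrative dimensions:

    import torch
    from lstm_model_new import LSTMCell

    batch, input_size, hidden_size = 4, 32, 64   # made-up sizes
    cell = LSTMCell(input_size, hidden_size)
    x = torch.randn(batch, input_size)
    h0 = torch.zeros(batch, hidden_size)
    c0 = torch.zeros(batch, hidden_size)
    h1, c1 = cell(x, (h0, c0))
    print(h1.shape, c1.shape)   # torch.Size([4, 64]) torch.Size([4, 64])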
max_ent_model.py
ADDED
@@ -0,0 +1,139 @@
import os

import math

import numpy as np  # added: save_ckpt() below uses np.save


class MaxEntropyModel:

    def __init__(self, ):
        self.train_set = []
        self.features = {}
        self.labels = {}
        self.labels = {
            '1': 1, '2': 1
        }

    def load_data(self, fn):
        with open(fn, "r") as rf:
            for line in rf:
                label, review = line.strip().split(',')
                label = label[1: -1]
                review = review.split(' ')
                fields = [str(int(label))] + review
                if review != '':
                    label = str(int(label))
                    self.labels[label] = 1
                    for s in set(fields[1:]):
                        if (label, s) not in self.features:
                            self.features[(label, s)] = 1
                        else:
                            self.features[(label, s)] += 1
                    self.train_set.append(fields)
        rf.close()

    def initialize_parameters(self, ):
        self.train_set_size = len(self.train_set)
        self.M = max([len(record) - 1 for record in self.train_set])
        self.ep = [0.0 for _ in range(len(self.features))]

        for i_f, feat in enumerate(self.features):
            self.ep[i_f] = float(self.features[feat]) / float(self.train_set_size)
            self.features[feat] = i_f

        self.weights = [0.0 for _ in range(len(self.features))]
        self.last_weights = self.weights

    def get_prob_weight(self, features, label):
        weight = 0.0
        for feat in features:
            # print(label, feat)
            if (label, feat) in self.features:
                weight += self.weights[self.features[(label, feat)]]
        prob_weight = math.exp(weight)
        # print(f"label: {label}, prob_weight: {prob_weight}")
        return prob_weight

    def get_expected_features(self, ):
        expected_features = [0.0 for _ in range(len(self.features))]
        for record in self.train_set:
            features = record[1:]
            prob = self.calculate_probability(features)
            for feat in features:
                for w, l in prob:
                    if (l, feat) in self.features:
                        idx = self.features[(l, feat)]
                        expected_features[idx] += w * (1.0 / self.train_set_size)
        return expected_features

    def calculate_probability(self, features):
        weights = [(self.get_prob_weight(features, l), l) for l in self.labels]
        tot_weights = [w for w, l in weights]

        Z = sum(tot_weights)

        prob = [(w / Z, l) for w, l in weights]
        return prob

    def train(self, max_iter=10000):
        self.initialize_parameters()
        for i in range(max_iter):
            print(f"[Training] iter {i + 1} ...")
            self.new_ep = self.get_expected_features()
            self.last_weights = self.weights[:]
            for i, w in enumerate(self.weights):
                delta = 1.0 / self.M * math.log(self.ep[i] / self.new_ep[i])
                self.weights[i] = self.weights[i] + delta
            if i % 10 == 0:
                test_data_path = "../preprocessed_data/yelp_test.txt"
                print(f"Start testing...")
                self.test(test_data_path)

    def test(self, test_data_path):
        f = open(file=test_data_path)
        tot_test_nn = 0
        correct_test_nn = 0
        for line in f:
            label, review = line.strip().split(',')
            label = label[1: -1]
            review = review.split(' ')

            # fields = [str(int(label))] + review  ## get split review ## #

            # input text: review #
            # output: label #
            # review #

            prob = self.calculate_probability(review)
            prob.sort(reverse=True)
            print(label, prob)

            ##### Calculate whether the prediction is correct #####
            maxx_prob_idx = int(prob[0][1])
            label_idx = int(label)
            if maxx_prob_idx == label_idx:
                correct_test_nn += 1
            tot_test_nn += 1
            ##### Calculate whether the prediction is correct #####

        f.close()
        acc = float(correct_test_nn) / float(tot_test_nn)
        print(f"[Test] Acc: {acc}")

    def save_ckpt(self, sv_ckpt_path):
        sv_features = self.features
        sv_weights = self.last_weights
        sv_ckpt = {
            'features': sv_features,
            'weights': sv_weights
        }
        np.save(sv_ckpt_path, sv_ckpt)
        print(f"ckpt with features and weights saved to {sv_ckpt_path}")
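The train() loop above is Generalized Iterative Scaling: each weight moves by delta = (1/M) * log(empirical_expectation / model_expectation). A worked example with made-up numbers:

    import math

    M = 50             # largest number of active features in any training record
    empirical = 0.012  # self.ep[i]: frequency of feature i in the data
    expected = 0.008   # self.new_ep[i]: expectation of feature i under the current model
    delta = (1.0 / M) * math.log(empirical / expected)
    print(delta)       # ~0.0081, the increment added to self.weights[i]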
pre-requirements.txt
ADDED
@@ -0,0 +1,4 @@
# pip==23.3.2
# torch==2.2.0
-i https://download.pytorch.org/whl/cpu
torch==2.2.0
requirements.txt
ADDED
@@ -0,0 +1,27 @@
# -f https://download.pytorch.org/whl/cpu/torch_stable.html
# -f https://data.pyg.org/whl/torch-2.2.0%2Bcpu.html
# -i https://download.pytorch.org/whl/cpu
# pip==20.2.4
# torch==2.2.0
# torchvision==0.13.1
# torchaudio==0.12.1

tqdm
nltk
scikit-learn
scipy

# blobfile==2.0.1
# manopth @ git+https://github.com/hassony2/manopth.git
# numpy==1.23.1
# psutil==5.9.2
# scikit-learn
# scipy==1.9.3
# tensorboard
# tensorboardx
# tqdm
# trimesh
# clip
# chumpy
# opencv-python
svm_model.py
ADDED
@@ -0,0 +1,210 @@
import numpy as np
import re
import time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
import ssl
import os
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# print(f"nltk version: {nltk.__version__}")
# nltk.download('stopwords')


class SVMModel:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)  # Convert labels to -1 and 1

        print(f"y_ max: {np.max(y_)}, y_ min: {np.min(y_)}")

        self.w = np.zeros(n_features)
        self.b = 0

        self.lambda_param = 1.0 / float(n_samples)

        for _ in range(self.n_iters):
            print(f"Epoch: {_}")
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    self.w = self.w - self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b = self.b - self.learning_rate * y_[idx]
            if _ % 1 == 0:
                # print(f"Iteration: {_}")
                st_time = time.time()
                self.test()
                print(f"Time: {time.time() - st_time}")

    def predict(self, X):

        linear_output = np.matmul(X, self.w[:, None]) - self.b  # []
        return np.sign(linear_output[:, 0])

    def test(self, ):
        # test_ours(self, ):
        linear_output = self.predict(self.X_test)
        print(f"linear_output: {linear_output.shape}, self.X_test: {self.X_test.shape}")
        acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
        print(f"Test Acc: {acc}")
        return linear_output

    # weights_dict = self.svm_model.get_weights_dict()
    def get_weights_dict(self, ):
        weights_dict = {
            'w': self.w,
            'b': self.b
        }
        return weights_dict


class SVM:
    def __init__(self, ):
        # file_path =
        self.x_train = []
        self.y_train = []
        self.x_test = []
        self.y_test = []

        self.data_folder = '.'

        print(f"Start loading data")
        self._load_data()

        print(f"Setting vectorizer")
        self.vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

        print(f"Start preprocessing data")
        self._preprocess_data()

        # self.setup_model()
        self.setup_model_ours()

        pass

    def _load_data(self, ):

        file_path = '.'
        x_train = []
        y_train = []
        with open(os.path.join(self.data_folder, 'train.csv'), "r") as f:
            for line in f:
                l = line.strip().split(',')
                senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
                x_train.append(text)
                y_train.append(int(senti[1]) - 1)
        f.close()

        x_test = []
        y_test = []
        with open(os.path.join(self.data_folder, 'test.csv'), "r") as f:
            for line in f:
                l = line.strip().split(',')
                senti, text = l[0], re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(l[1:]))).lower()
                x_test.append(text)
                y_test.append(int(senti[1]) - 1)
        f.close()
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = np.array(y_train, dtype=np.int32)
        self.y_test = np.array(y_test, dtype=np.int32)
        print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")

    def _preprocess_data(self, ):
        self.X_train = self.vectorizer.fit_transform(self.x_train).toarray()
        self.X_test = self.vectorizer.transform(self.x_test).toarray()

    def setup_model_ours(self, ):
        self.svm_model = SVMModel()

    def train_ours(self, ):
        self.y_train = self.y_train.astype(np.float32)
        self.y_test = self.y_test.astype(np.float32)
        self.y_train = self.y_train * 2 - 1.0
        self.y_test = self.y_test * 2 - 1.0

        print(f"max_y_train: {np.max(self.y_train)}, min_y_train: {np.min(self.y_train)}")

        self.svm_model.X_train = self.X_train
        self.svm_model.X_test = self.X_test
        self.svm_model.y_train = self.y_train
        self.svm_model.y_test = self.y_test

        self.svm_model.fit(self.X_train, self.y_train)

    def test_ours(self, ):
        linear_output = self.svm_model.test()
        acc = np.mean((linear_output == np.sign(self.y_test)).astype(np.float32))
        print(f"Test Acc: {acc}")

        weights_dict = self.svm_model.get_weights_dict()
        np.save("svm_weights.npy", weights_dict)
        print(f"svm weights saved to svm_weights.npy")

    # def setup_model(self, ):
    #     self.svc = SVC()

    # def train(self, ):
    #     self.svc.fit(self.X_train, self.y_train)

    # def test(self, ):
    #     self.train_acc = self.svc.score(self.X_train, self.y_train)
    #     self.test_acc = self.svc.score(self.X_test, self.y_test)

    #     print(f'Train Acc: {self.train_acc * 100}\n', f'Test Acc: {self.test_acc * 100}\n')


# CUDA_VISIBLE_DEVICES=2 python log_reg.py

# y_train = np.asarray(y_train)
# y_test = np.asarray(y_test)

# print(f"After getting data")

# start_time = time.time()
# vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

# print(f"After setting the vectorizer")
# X_train = vectorizer.fit_transform(x_train).toarray()
# X_test = vectorizer.transform(x_test).toarray()

# print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")

# # lr_classfier = LogisticRegression()
# # lr_classfier.fit(X_train,y_train)
# # train_acc = lr_classfier.score(X_train,y_train)
# # test_acc = lr_classfier.score(X_test,y_test)

# svc = SVC()
# svc.fit(X_train,y_train)
# train_acc = svc.score(X_train,y_train)
# test_acc = svc.score(X_test,y_test)

# print('Train Acc: %.2f' % float(train_acc*100), 'Test Acc: %.2f' % float(test_acc*100),'Time: %.4f' % float(time.time()-start_time))
# # CUDA_VISIBLE_DEVICES=2 python log_reg.py
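SVMModel.fit above is plain sub-gradient descent on the soft-margin hinge loss: if a sample already satisfies the margin, only the L2 regularizer shrinks w; otherwise w and b also move along that sample. The per-sample rule in isolation, with toy numbers:

    import numpy as np

    lr, lam = 0.01, 0.1                       # toy learning rate and regularizer
    w, b = np.array([0.2, -0.1]), 0.0
    x_i, y_i = np.array([1.0, 2.0]), 1.0      # one (feature vector, label in {-1, +1}) pair

    if y_i * (np.dot(x_i, w) - b) >= 1:       # margin satisfied: regularize only
        w = w - lr * (2 * lam * w)
    else:                                     # margin violated: regularize and move toward x_i
        w = w - lr * (2 * lam * w - y_i * x_i)
        b = b - lr * y_i
    print(w, b)                               # [0.2096 -0.0798] -0.01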
trainer.py
ADDED
@@ -0,0 +1,358 @@
import numpy as np
import torch
import torch.nn as nn
# import torch.nn.functional as F
import torch.optim as optim
# from torch.autograd import Variable
# import torch.distributed as dist

# import time
import os
import re
# import sys
# import io

from tqdm import tqdm
import nltk

from lstm_model_new import LSTM_model, BiLSTMModel
from max_ent_model import MaxEntropyModel
from svm_model import SVM


class Trainer(object):
    def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):

        # vocab_size = 8000
        # sequence_len = 150

        self.vocab_size = vocab_size
        self.vocab_sizeb = self.vocab_size + 1

        self.sequence_len = sequence_len
        self.model_type = model_type

        self.batch_size = batch_size
        self.nn_epochs = nn_epochs

        self.processed_data_folder = "../preprocessed_data/"

        self._load_data()

        self._get_model()

        # self._setup_optimizer()

        pass

    def _load_data(self, ):

        dict_fn = "yelp_dictionary.npy"

        id_to_word = np.load(dict_fn, allow_pickle=True)  # .item()

        print(type(id_to_word))
        print(id_to_word[0], len(id_to_word))

        word_to_id = {
            id_to_word[idx]: idx for idx in range(len(id_to_word))
        }

        # word_to_id = {v: k for k, v in id_to_word.items()}
        self.word_to_id = word_to_id

        # x_train = np.load('../preprocessed_data/x_train.npy')
        # y_train = np.load('../preprocessed_data/y_train.npy')

        # #x_train = x_train[:10000]
        # #y_train = y_train[:10000]
        # x_test = np.load('../preprocessed_data/x_test.npy')
        # y_test = np.load('../preprocessed_data/y_test.npy')

        # x_train_path = os.path.join(self.processed_data_folder, "x_train.npy")
        # y_train_path = os.path.join(self.processed_data_folder, "y_train.npy")
        # x_test_path = os.path.join(self.processed_data_folder, "x_test.npy")
        # y_test_path = os.path.join(self.processed_data_folder, "y_test.npy")

        # x_train = np.load(x_train_path)
        # y_train = np.load(y_train_path)
        # x_test = np.load(x_test_path)
        # y_test = np.load(y_test_path)
        # self.x_train = x_train
        # self.y_train = y_train
        # self.x_test = x_test
        # self.y_test = y_test

    def _get_model(self, ):
        if self.model_type == "lstm":
            self.model = LSTM_model(self.vocab_sizeb, 800)
        elif self.model_type == "bilstm":
            self.model = BiLSTMModel(self.vocab_sizeb, 800)
        elif self.model_type == "max_ent":
            self.model = MaxEntropyModel()
        elif self.model_type == "svm":
            self.model = SVM()
        else:
            raise ValueError("Model type not supported")

        # self.model.cuda()

        if self.model_type in ['lstm', 'bilstm']:
            # self.model = self.model.cuda()

            model_ckpt_fn = f"{self.model_type}.pth"
            self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=torch.device('cpu')))
        elif self.model_type in ['max_ent']:
            model_ckpt_fn = f"{self.model_type}_ckpt.npy"  # max_ent #
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            features = model_params["features"]
            weights = model_params["weights"]

            self.model.weights = weights  # .tolist()
            # print(f"self.model.weights: {self.model.weights[:10]}")
            self.model.last_weights = weights  # .tolist()

            self.model.features = features
            # print(f"self.model.features: {list(self.model.features.keys())[:10]}")

        elif self.model_type in ['svm']:
            model_ckpt_fn = f"{self.model_type}_weights.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            w = model_params['w']
            b = model_params['b']
            self.model.svm_model.w = w
            self.model.svm_model.b = b

        else:
            raise ValueError("Model type not supported")

    def _setup_optimizer(self, ):
        self.lr = 0.001
        self.opt = optim.Adam(self.model.parameters(), lr=self.lr)

    def _train(self, ):
        train_losses = []
        train_accs = []
        test_accs = [0.0]

        for epoch in range(self.nn_epochs):
            print(f"Epoch: {epoch}")
            self.model.train()

            nn_acc = 0
            nn_total = 0
            epoch_loss = 0.0

            train_permutation_idxes = np.random.permutation(self.y_train.shape[0])

            for i in tqdm(range(0, len(self.y_train), self.batch_size)):
                batched_x = self.x_train[train_permutation_idxes[i: i + self.batch_size]]
                batched_y = self.y_train[train_permutation_idxes[i: i + self.batch_size]]

                data = torch.from_numpy(batched_x).long().cuda()
                target = torch.from_numpy(batched_y).float().cuda()

                self.opt.zero_grad()
                loss, predicted_labels = self.model(data, target)
                loss.backward()

                norm = nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
                self.opt.step()

                predicted_labels = predicted_labels >= 0
                gts = target >= 0.5
                acc = torch.sum((predicted_labels == gts).float()).item()

                nn_acc += acc
                epoch_loss += loss.item()
                nn_total += len(batched_y)

            train_acc = float(nn_acc) / float(nn_total)
            train_loss = epoch_loss / float(self.batch_size)

            train_losses.append(train_loss)
            train_accs.append(train_acc)

            print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")

            self._test()

    def _process_text(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = np.array(token_ids)

        token_ids[token_ids > self.vocab_size] = 0
        if token_ids.shape[0] > self.sequence_len:
            start_index = np.random.randint(token_ids.shape[0] - self.sequence_len + 1)
            token_ids = token_ids[start_index: (start_index + self.sequence_len)]
        else:
            token_ids = np.concatenate([token_ids, np.zeros(self.sequence_len - token_ids.shape[0])])
        return token_ids

    def _process_text_maxent(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        # token_ids = np.array(token_ids)
        token_ids = [str(word_idx) for word_idx in token_ids]

        return token_ids

        # token_ids[token_ids > self.vocab_size] = 0
        # return token_ids

    def _process_text_svm(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = self.model.vectorizer.transform([text]).toarray()
        # tokens = nltk.word_tokenize(text)
        # token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
        # # token_ids = np.array(token_ids)
        # token_ids = [ str(word_idx) for word_idx in token_ids ]

        return tokens

    def predict_maxent(self, input_text):

        text_ids = self._process_text_maxent(input_text)

        prob = self.model.calculate_probability(text_ids)
        prob.sort(reverse=True)
        # print(label, prob)
        print(prob)
        ##### Calculate whether the prediction is correct #####
        maxx_prob_idx = int(prob[0][1])

        # data = torch.from_numpy(text_ids).long()  # .cuda()
        # data = data.unsqueeze(0)

        # target = torch.zeros((data.size(0), ), dtype=torch.float)

        # # print(f"data: {data.shape}, target: {target.shape}")

        # with torch.no_grad():
        #     loss, predicted_labels = self.model(data, target)
        #     predicted_labels = predicted_labels >= 0

        if maxx_prob_idx == 2:
            return "Positive"
        else:
            return "Negative"

    def predict_svm(self, input_text):

        text_ids = self._process_text_svm(input_text)

        predicted_label = self.model.svm_model.predict(text_ids)

        if float(predicted_label[0]) > 0:
            return "Positive"
        else:
            return "Negative"

        # prob = self.model.calculate_probability(text_ids)
        # prob.sort(reverse=True)
        # # print(label, prob)
        # print(prob)
        # ##### Calculate whether the prediction is correct #####
        # maxx_prob_idx = int(prob[0][1])

        # # data = torch.from_numpy(text_ids).long()  # .cuda()
        # # data = data.unsqueeze(0)

        # # target = torch.zeros((data.size(0), ), dtype=torch.float)

        # # # print(f"data: {data.shape}, target: {target.shape}")

        # # with torch.no_grad():
        # #     loss, predicted_labels = self.model(data, target)
        # #     predicted_labels = predicted_labels >= 0

        # if maxx_prob_idx == 2:
        #     return "Positive"
        # else:
        #     return "Negative"

    def predict(self, input_text):

        text_ids = self._process_text(input_text)

        data = torch.from_numpy(text_ids).long()  # .cuda()
        data = data.unsqueeze(0)

        target = torch.zeros((data.size(0), ), dtype=torch.float)

        # print(f"data: {data.shape}, target: {target.shape}")

        with torch.no_grad():
            loss, predicted_labels = self.model(data, target)
            predicted_labels = predicted_labels >= 0

        if predicted_labels.item():
            return "Positive"
        else:
            return "Negative"

        # return predicted_labels.item()

    def _test(self, ):
        self.model.eval()

        nn_acc = 0
        loss = 0

        nn_total = 0

        test_permutation_idxes = np.random.permutation(self.y_test.shape[0])
        for i in tqdm(range(0, len(self.y_test), self.batch_size)):
            batched_x = self.x_test[test_permutation_idxes[i: i + self.batch_size]]
            batched_y = self.y_test[test_permutation_idxes[i: i + self.batch_size]]

            data = torch.from_numpy(batched_x).long().cuda()
            target = torch.from_numpy(batched_y).float().cuda()

            with torch.no_grad():
                loss, predicted_labels = self.model(data, target)

            predicted_labels = predicted_labels >= 0
            gts = target >= 0.5
            acc = torch.sum((predicted_labels == gts).float()).item()

            nn_acc += acc
            nn_total += len(batched_y)

        acc = float(nn_acc) / float(nn_total)
        print(f"Test Acc: {acc}")


if __name__ == '__main__':

    vocab_size = 8000
    sequence_len = 150

    # batch_size = 1024
    batch_size = 256
    nn_epochs = 20
    model_type = "lstm"

    model_type = "bilstm"

    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    trainer._train()

# CUDA_VISIBLE_DEVICES=0 python trainer.py
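Trainer is the glue: it rebuilds word_to_id from yelp_dictionary.npy, loads the checkpoint matching model_type, and exposes predict / predict_maxent / predict_svm for a single review string. The LSTM preprocessing in _process_text maps tokens to ids, zeroes out-of-vocabulary ids, and pads or crops to sequence_len; sketched below with a tiny stand-in vocabulary (not the real dictionary):

    import numpy as np

    word_to_id = {"good": 10, "food": 25}     # stand-in for the yelp_dictionary mapping
    vocab_size, sequence_len = 8000, 150

    tokens = ["good", "food", "unknownword"]
    token_ids = np.array([word_to_id.get(t, -1) + 1 for t in tokens])   # OOV -> 0
    token_ids[token_ids > vocab_size] = 0
    if token_ids.shape[0] < sequence_len:     # pad with zeros up to sequence_len
        token_ids = np.concatenate([token_ids, np.zeros(sequence_len - token_ids.shape[0])])
    print(token_ids[:5], token_ids.shape)     # [11. 26.  0.  0.  0.] (150,)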