Ruslan-DS committed
Commit 60a2954
1 Parent(s): 0b9c14e

Add weights files

images/attention_words.jpg ADDED
images/bert_tunnig.jpg ADDED
images/distribution_classes.jpg ADDED
images/funny_dataframe.jpg ADDED
images/lstm_attention.jpg ADDED
images/roc_auc_catboost.jpg ADDED
images/roc_auc_logreg.jpg ADDED
images/umap.jpg ADDED
main.py ADDED
File without changes
models/BertTunning.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ from torch import nn
+
+
+ from logreg_model import bert_for_logreg, tokenizer_bert
+ from preprocess_bert import preprocess_bert
+
+ MAX_LEN = 100
+
+ class BertTunnig(nn.Module):
+
+     def __init__(self, bert_model):
+         super().__init__()
+
+         # Freeze the pretrained BERT encoder; only the head layers below are trained.
+         self.bert = bert_model
+         for weights in self.bert.parameters():
+             weights.requires_grad = False
+
+         self.fc1 = nn.Linear(768, 256)
+         self.drop1 = nn.Dropout(p=0.5)
+         self.fc2 = nn.Linear(256, 32)
+         self.fc_out = nn.Linear(32, 1)
+
+     def forward(self, x, attention_mask):
+
+         # Classify from the [CLS] token of the last hidden state.
+         output = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
+         output = self.fc1(output)
+
+         output_drop = self.drop1(output)
+         output = self.fc2(output_drop)
+
+         output = self.fc_out(output)
+
+         return torch.sigmoid(output)
+
+
+ model_tunning = BertTunnig(bert_model=bert_for_logreg)
+
+ model_tunning.load_state_dict(torch.load('best_weights_berttinnug(2).pt'))
+ model_tunning.eval()  # disable dropout for inference
+
+ def predict_2(text):
+
+     preprocessed_text, attention_mask = preprocess_bert(text, MAX_LEN=MAX_LEN)
+     preprocessed_text, attention_mask = torch.tensor(preprocessed_text).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0)
+
+     with torch.inference_mode():
+
+         predict = model_tunning(preprocessed_text, attention_mask=attention_mask).item()
+
+     return round(predict)
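
For context, predict_2 maps raw review text to a hard 0/1 label by thresholding the sigmoid output at 0.5 via round. A minimal usage sketch; the import path and the label convention are assumptions, not part of this commit:

# Hypothetical call site, e.g. from a Streamlit page:
from models.BertTunning import predict_2

label = predict_2('Отличный товар, всем рекомендую!')  # sample review text (assumed input)
print(label)  # 0 or 1; which class counts as "positive" depends on how the training labels were encoded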
models/LSTM.py ADDED
File without changes
models/LogReg.py ADDED
File without changes
models/datasets/embedding_matrix.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbcae4631c684cea4bef1df946822bdfc66cadddc240ffd39f917f200bb5894a
+ size 6643840
models/datasets/vocab_to_int.json ADDED
The diff for this file is too large to render.
models/preprocess_stage/bert_model.py ADDED
@@ -0,0 +1,22 @@
+ import numpy as np
+ from transformers import BertModel, BertTokenizer
+
+ weights = 'DeepPavlov/rubert-base-cased'
+ tokenizer_bert = BertTokenizer.from_pretrained(weights)
+ bert_for_logreg = BertModel.from_pretrained(weights)
+
+
+ def preprocess_bert(text, MAX_LEN):
+
+     tokenized_text = tokenizer_bert.encode(
+         text=text,
+         add_special_tokens=True,
+         truncation=True,
+         max_length=MAX_LEN
+     )
+
+     # Pad the id sequence to MAX_LEN with 0 (the BERT pad id) and mask real tokens.
+     padded_text = np.array(tokenized_text + [0] * (MAX_LEN - len(tokenized_text)))
+     attention_mask = np.where(padded_text != 0, 1, 0)
+
+     return padded_text, attention_mask
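
preprocess_bert truncates the text to at most MAX_LEN token ids, pads with 0, and derives the attention mask from the non-zero positions. A small shape-check sketch, assuming MAX_LEN=100 as in models/BertTunning.py (the sample text is illustrative):

ids, mask = preprocess_bert('Пример отзыва', MAX_LEN=100)
print(ids.shape, mask.shape)  # (100,) (100,): fixed-length id vector and its 0/1 mask
print(mask.sum())             # number of real (non-padding) tokens, incl. [CLS] and [SEP]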
models/preprocess_stage/preprocess_lstm.py ADDED
File without changes
models/weights/BertTunnigWeights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21a77e82f2fafc9e5cec46b6494f45dc5edb397c13fea55238eaabaf7832cffd
+ size 712320552
models/weights/LSTMBestWeights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:317df198794e02b19bbb9f6cc3201950d773f9719df4391228c125ac101cd323
+ size 3375698
models/weights/LogRegBestWeights.sav ADDED
Binary file (6.94 kB).
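
LogRegBestWeights.sav is presumably the serialized scikit-learn logistic regression behind models/LogReg.py. A load sketch, assuming the file was written with joblib.dump; the .sav extension and the pinned joblib/scikit-learn suggest this, but the commit does not confirm it:

# Assumption: saved via joblib.dump; pickle.load would be the alternative.
import joblib

logreg = joblib.load('models/weights/LogRegBestWeights.sav')
proba = logreg.predict_proba(features)[:, 1]  # 'features' must match the training-time representation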
 
pages/classification_reviews.py ADDED
File without changes
pages/results.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,77 @@
+ altair==5.2.0
+ attrs==23.1.0
+ blinker==1.7.0
+ cachetools==5.3.2
+ certifi==2023.11.17
+ charset-normalizer==3.3.2
+ click==8.1.7
+ filelock==3.13.1
+ fsspec==2023.12.2
+ gitdb==4.0.11
+ GitPython==3.1.40
+ huggingface-hub==0.19.4
+ idna==3.6
+ importlib-metadata==6.11.0
+ Jinja2==3.1.2
+ joblib==1.3.2
+ jsonschema==4.20.0
+ jsonschema-specifications==2023.11.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.2
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ packaging==23.2
+ pandas==2.1.4
+ Pillow==10.1.0
+ protobuf==4.25.1
+ pyarrow==14.0.1
+ pydeck==0.8.1b0
+ Pygments==2.17.2
+ python-dateutil==2.8.2
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ referencing==0.32.0
+ regex==2023.10.3
+ requests==2.31.0
+ rich==13.7.0
+ rpds-py==0.13.2
+ safetensors==0.4.1
+ scikit-learn==1.3.2
+ scipy==1.11.4
+ six==1.16.0
+ smmap==5.0.1
+ st-pages==0.4.5
+ streamlit==1.29.0
+ sympy==1.12
+ tenacity==8.2.3
+ threadpoolctl==3.2.0
+ tokenizers==0.15.0
+ toml==0.10.2
+ toolz==0.12.0
+ torch==2.1.2
+ tornado==6.4
+ tqdm==4.66.1
+ transformers==4.36.1
+ triton==2.1.0
+ typing_extensions==4.9.0
+ tzdata==2023.3
+ tzlocal==5.2
+ urllib3==2.1.0
+ validators==0.22.0
+ watchdog==3.0.0
+ zipp==3.17.0
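
These pins cover the Streamlit front end (streamlit, st-pages, altair, pydeck) and the inference stack (torch, transformers, tokenizers, scikit-learn); in a fresh virtual environment they would typically be installed with pip install -r requirements.txt.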