ruslanruslanruslan committed on
Commit
60cb352
1 Parent(s): 2f9b6cb

files added

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# app.py — landing page of the Streamlit multipage application.
# The individual NLP tools live as separate scripts under pages/.
import streamlit as st

st.title('A multipage application featuring various Natural Language Processing instruments and functions.')
basic_bert_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4deb15105a799cd64d8058d552657b397cda2a9d6b2e34b3b9b63ac897936cf3
3
+ size 265489387
bert_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0c9d5352b8a32df74d754421946fdee6d2d4d8a23598b734dfc950c03067019
3
+ size 265495165
borges.jpg ADDED
borgesian_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:852eff7ff8cb373033d5e4f8e71454a079dd08496dfe4e3db148e65b6d88e6f8
3
+ size 500981765
lstm_embedding_matrix.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40e89492f39ddd85531f55a30b36650b4cdbe86cb624588e568e825211f3c3a5
3
+ size 108256384
lstm_model_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04f6ffb4ac2e1897631488a12d707e848c97a616674185a0eb875aab82cceeac
3
+ size 65423143
lstm_vocab_to_int.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4a58991e3ae061ed499b316bd5b2c805cd9628b5fc1e1244169fa13ce268547
3
+ size 4414229
pages/Borgesian.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import transformers
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Base checkpoint: sberbank-ai Russian GPT-2 (small). The fine-tuned
# "Borgesian" state dict shipped with the app is loaded on top of it.
borgesian = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2', output_attentions = False, output_hidden_states = False)
borgesian.load_state_dict(torch.load('borgesian_weights.pt', map_location=torch.device('cpu')))
tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
# CPU-only inference; eval() disables dropout for deterministic layers.
borgesian.to('cpu')
borgesian.eval()
11
+
12
def generate_response(text, temperature, length, top_p):
    """Continue *text* with the fine-tuned GPT-2 model and render the result.

    Sampling is controlled by *temperature* and *top_p*; *length* caps the
    total number of tokens in the generated sequence.
    """
    encoded = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        sequences = borgesian.generate(
            encoded,
            do_sample=True,
            num_beams=2,
            temperature=float(temperature),
            top_p=float(top_p),
            max_length=length,
        )
    first_decoded = tokenizer.decode(sequences[0])
    st.write(first_decoded)
18
+
19
# --- Page layout -------------------------------------------------------
st.title('Borgesian')
st.image('borges.jpg')
st.write('Write a prompt in Russian, and the GPT-based model will follow up with a Borgesian text.')
st.write('Define the parameters of generation:')

# Generation controls exposed to the user.
temperature = st.slider('Temperature', value=1.5, min_value=1.0, max_value=5.0, step=0.1)
length = st.slider('Length', value=50, min_value=20, max_value=150, step=1)
top_p = st.slider('Top-p value', value=0.9, min_value=0.5, max_value=1.0, step=0.05)

user_input = st.text_area("Enter your text:")
if st.button("Send"):
    # Guard clause: warn on empty input, otherwise generate.
    if not user_input:
        st.warning("Please enter some text.")
    else:
        generate_response(user_input, temperature, length, top_p)
pages/Film reviews classifier.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ import os
4
+ import logging
5
+ import torch
6
+ import json
7
+ import string
8
+ import re
9
+ import string
10
+ import nltk
11
+ import numpy as np
12
+ import torch.nn as nn
13
+ import transformers
14
+ from collections import Counter
15
+ from nltk.corpus import stopwords
16
+ from nltk.stem import WordNetLemmatizer
17
+ # stop_words = set(stopwords.words('english'))
18
+
19
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict) -> torch.Tensor:
    """Clean *input_string*, map its words to vocabulary ids and left-pad
    the id sequence to *seq_len*.

    Words missing from *vocab_to_int* are silently dropped (out-of-vocabulary
    handling). Returns a 1-D integer tensor of length *seq_len*.
    """
    cleaned = data_preprocessing(input_string)
    # Membership test replaces the old per-word try/except KeyError loop
    # (which also bound an unused exception variable).
    token_ids = [vocab_to_int[word] for word in cleaned.split() if word in vocab_to_int]
    result_padded = padding([token_ids], seq_len)[0]
    return torch.tensor(result_padded)
29
+
30
+
31
+
32
def padding(reviews_int: list, seq_len: int):
    """Left-pad (or truncate) each id sequence to exactly *seq_len*.

    Returns an int array of shape ``(len(reviews_int), seq_len)`` where
    short sequences are prefixed with zeros and long ones keep only their
    first *seq_len* ids.
    """
    padded = np.zeros((len(reviews_int), seq_len), dtype=int)
    for row, seq in enumerate(reviews_int):
        trimmed = seq[:seq_len]
        # Writing into the right-hand columns leaves the zero left-padding.
        padded[row, seq_len - len(trimmed):] = trimmed
    return padded
42
+
43
+
44
def data_preprocessing(text: str) -> str:
    """Normalise raw review text for the LSTM pipeline.

    Lowercases, strips HTML-like ``<...>`` tags and punctuation, then
    lemmatises every word with WordNet and re-joins with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lowered = re.sub('<.*?>', '', text.lower())
    without_punct = ''.join(ch for ch in lowered if ch not in string.punctuation)
    lemmas = (lemmatizer.lemmatize(token) for token in without_punct.split())
    return ' '.join(lemmas)
52
+
53
# Word -> integer-id vocabulary built at LSTM training time.
with open('lstm_vocab_to_int.json') as json_file:
    vocab_to_int = json.load(json_file)

# Pre-trained embedding matrix; presumably row i corresponds to the word
# whose id is i in vocab_to_int — TODO confirm against the training code.
with open('lstm_embedding_matrix.npy', 'rb') as f:
    embedding_matrix = np.load(f)

# Embedding layer built from the matrix; from_pretrained freezes the
# weights by default. Shared by LSTMClassifier below at module level.
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
60
+
61
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM sentiment classifier over frozen embeddings.

    Note: the embedding table is the module-level ``embedding_layer``
    defined above, not a constructor argument, so this class cannot be
    instantiated without that global. The head flattens the entire output
    sequence, so inputs must be exactly ``seq_len`` tokens long.
    """

    def __init__(self, embedding_dim: int, seq_len: int, hidden_size: int = 32, dropout: float = 0, num_layers: int = 1) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        # Shared module-level frozen embedding layer (loaded above).
        self.embedding = embedding_layer
        self.dropout = dropout
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
            bidirectional=True,  # doubles the per-step feature size
            dropout=self.dropout,
            num_layers=self.num_layers
        )
        # Head input: hidden_size * seq_len * 2 (2 = forward + backward),
        # i.e. the concatenation of every time step's output.
        self.linear = nn.Sequential(
            nn.Linear(self.hidden_size * self.seq_len * 2, 128),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return a raw logit (no sigmoid — callers apply it themselves)."""
        embeddings = self.embedding(x)
        output, _ = self.lstm(embeddings)
        # Flatten (batch, seq_len, 2*hidden) -> (batch, seq_len*2*hidden).
        output = output.contiguous().view(output.size(0), -1)
        # NOTE(review): squeeze(0) drops the batch dim only when batch == 1,
        # which is how inference below calls it; confirm before batching.
        out = self.linear(output.squeeze(0))
        return out
90
+
91
# DistilBERT backbone + tokenizer. The fine-tuned classifier weights are
# restored further down into the wrapping BertReviews model
# ('bert_weights.pt'), so nothing else needs loading here.
bert_model_class = transformers.DistilBertModel
bert_tokenizer_class = transformers.DistilBertTokenizer
# NOTE: a ~265 MB torch.load of 'basic_bert_weights.pt' used to sit here,
# bound to a variable that was never read anywhere in this file; the dead
# load was removed to cut startup time and memory.
bert_tokenizer = bert_tokenizer_class.from_pretrained('distilbert-base-uncased')
bert_basic_model = bert_model_class.from_pretrained('distilbert-base-uncased')
96
+
97
class BertReviews(nn.Module):
    """DistilBERT with a binary classification head on the first token.

    The backbone is frozen except for the per-layer output LayerNorm
    weights and biases, which stay trainable.
    """

    def __init__(self, model):
        super(BertReviews, self).__init__()
        self.bert = model
        # Freeze the entire backbone first ...
        for param in self.bert.parameters():
            param.requires_grad = False
        # ... then re-enable gradients only on the output LayerNorms of
        # the 6 transformer layers (DistilBERT has exactly 6).
        for i in range(6):
            self.bert.transformer.layer[i].output_layer_norm.weight.requires_grad = True
            self.bert.transformer.layer[i].output_layer_norm.bias.requires_grad = True
        # 768 -> 1: a single raw logit (sigmoid is applied by callers).
        self.fc = nn.Linear(768, 1)

    def forward(self, samples, att_masks):
        """Return ``(backbone outputs, logit)`` for a token-id batch.

        ``embeddings[0]`` is the last hidden state; ``[:, 0, :]`` selects
        the first ([CLS]-position) token of each sequence for the head.
        """
        embeddings = self.bert(samples, attention_mask=att_masks)
        model_out = self.fc(embeddings[0][:, 0, :])

        return embeddings, model_out
114
+
115
# Wrap the DistilBERT backbone and restore the fine-tuned weights on CPU.
bert_model = BertReviews(bert_basic_model)
bert_model.load_state_dict(torch.load('bert_weights.pt', map_location=torch.device('cpu')))
bert_model.to('cpu').eval()

# LSTM classifier — hyperparameters must match the training run for the
# saved state dict to load (seq_len=150 also matches inference below).
model_lstm = LSTMClassifier(embedding_dim=64, hidden_size=64, seq_len = 150, dropout=0.5, num_layers=4)
model_lstm.load_state_dict(torch.load('lstm_model_weights.pt', map_location=torch.device('cpu')))
model_lstm.to('cpu').eval()
122
+
123
+
124
+
125
def predict_sentence_lstm(text: str):
    """Classify *text* with the LSTM model.

    Returns ``(label, seconds)``: label is 0 (negative) or 1 (positive),
    seconds is the wall-clock inference time.
    """
    started = time.time()
    encoded = preprocess_single_string(text, 150, vocab_to_int)
    logits = model_lstm(encoded.unsqueeze(0))
    probability = torch.sigmoid(logits).cpu().detach().numpy()
    label = int(probability.round())
    return label, time.time() - started
132
+
133
def predict_sentence_bert(text: str):
    """Classify *text* with the fine-tuned DistilBERT model.

    Returns ``(label, seconds)``: label is 0 (negative) or 1 (positive),
    seconds is the wall-clock inference time.
    """
    started = time.time()
    # Tokenize (truncated to 200 ids), right-pad with zeros to a fixed
    # (1, 200) batch; the attention mask flags the non-padding positions.
    ids = bert_tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=200)
    padded = np.array([ids + [0] * (200 - len(ids))])
    mask = torch.Tensor(np.where(padded != 0, 1, 0)).to(torch.int64)
    batch = torch.Tensor(padded).to(torch.int64)
    logit = bert_model(batch, mask)[1]
    label = int(torch.sigmoid(logit).cpu().detach().numpy().round())
    return label, time.time() - started
146
+
147
# Human-readable labels for the two models' 0/1 outputs.
reses = {0: 'negative', 1: 'positive'}

def process_text(input_text):
    """Run both classifiers on *input_text* and render their verdicts."""
    res_lstm, time_lstm = predict_sentence_lstm(input_text)
    res_bert, time_bert = predict_sentence_bert(input_text)
    report = [
        'Results:',
        f'LSTM: {reses[res_lstm]}, execution time: {time_lstm:.2f} seconds.',
        f'Upgraded Bert: {reses[res_bert]}, execution time: {time_bert:.2f} seconds.',
    ]
    for line in report:
        st.write(line)
155
+
156
st.title('Film reviews classifier')
st.write('Write a film review in a box below, and the application, powered by two NLP models (LSTM and upgraded Bert), will tell if it is a positive or a negative review.')

user_input = st.text_area("Enter your text:")
if st.button("Send a review for processing"):
    if user_input:
        # process_text renders everything via st.write and returns None,
        # so the old `processed_text = ...` assignment was dropped.
        process_text(user_input)
    else:
        st.warning("Please enter some text before processing.")
pages/Summarizer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import AutoTokenizer, BartForConditionalGeneration

# DistilBART checkpoint fine-tuned for news summarization; model and
# tokenizer must come from the same checkpoint.
summarizer = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer_sum = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
6
+
7
def generate_summary(text, length):
    """Summarize *text* with DistilBART and render the result.

    *length* caps the number of tokens in the generated summary; the
    input itself is truncated to 1024 tokens by the tokenizer.
    """
    encoded = tokenizer_sum([text], max_length=1024, return_tensors="pt")
    ids = summarizer.generate(
        encoded["input_ids"],
        num_beams=2,
        min_length=1,
        max_length=length,
    )
    decoded = tokenizer_sum.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    st.write(decoded[0])
12
+
13
st.title('Summarizer')
# Fixed user-facing typo ("with provide" -> "will provide"); the button
# label previously said "review" — copy-pasted from the classifier page.
st.write('Submit a news article in the field below, and the Bart-based model will provide a summary.')

length = st.slider('Maximum length of summary', value=50, min_value=15, max_value=150, step=1)
user_input = st.text_area("Enter your text:")
if st.button("Send an article for processing"):
    if user_input:
        generate_summary(user_input, length)
    else:
        st.warning("Please enter some text before processing.")
requirements.txt ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.0.1
2
+ attrs==23.1.0
3
+ blinker==1.6.2
4
+ cachetools==5.3.1
5
+ certifi==2023.5.7
6
+ charset-normalizer==3.2.0
7
+ click==8.1.6
8
+ cmake==3.27.0
9
+ contourpy==1.1.0
10
+ cycler==0.11.0
11
+ decorator==5.1.1
12
+ filelock==3.12.2
13
+ fonttools==4.41.0
14
+ fsspec==2023.6.0
15
+ gitdb==4.0.10
16
+ GitPython==3.1.32
17
+ huggingface-hub==0.16.4
18
+ idna==3.4
19
+ importlib-metadata==6.8.0
20
+ Jinja2==3.1.2
21
+ joblib==1.3.1
22
+ jsonschema==4.18.4
23
+ jsonschema-specifications==2023.7.1
24
+ kiwisolver==1.4.4
25
+ lit==16.0.6
26
+ markdown-it-py==3.0.0
27
+ MarkupSafe==2.1.3
28
+ matplotlib==3.7.2
29
+ mdurl==0.1.2
30
+ mpmath==1.3.0
31
+ networkx==3.1
32
+ nltk==3.8.1
33
+ numpy==1.25.1
34
+ nvidia-cublas-cu11==11.10.3.66
35
+ nvidia-cuda-cupti-cu11==11.7.101
36
+ nvidia-cuda-nvrtc-cu11==11.7.99
37
+ nvidia-cuda-runtime-cu11==11.7.99
38
+ nvidia-cudnn-cu11==8.5.0.96
39
+ nvidia-cufft-cu11==10.9.0.58
40
+ nvidia-curand-cu11==10.2.10.91
41
+ nvidia-cusolver-cu11==11.4.0.1
42
+ nvidia-cusparse-cu11==11.7.4.91
43
+ nvidia-nccl-cu11==2.14.3
44
+ nvidia-nvtx-cu11==11.7.91
45
+ packaging==23.1
46
+ pandas==2.0.3
47
+ Pillow==9.5.0
48
+ protobuf==4.23.4
49
+ pyarrow==12.0.1
50
+ pydeck==0.8.1b0
51
+ Pygments==2.15.1
52
+ Pympler==1.0.1
53
+ pyparsing==3.0.9
54
+ python-dateutil==2.8.2
55
+ pytz==2023.3
56
+ pytz-deprecation-shim==0.1.0.post0
57
+ PyYAML==6.0.1
58
+ referencing==0.30.0
59
+ regex==2023.6.3
60
+ requests==2.31.0
61
+ rich==13.4.2
62
+ rpds-py==0.9.2
63
+ safetensors==0.3.1
64
+ six==1.16.0
65
+ smmap==5.0.0
66
+ streamlit==1.24.1
67
+ sympy==1.12
68
+ tenacity==8.2.2
69
+ tokenizers==0.13.3
70
+ toml==0.10.2
71
+ toolz==0.12.0
72
+ torch==2.0.1
73
+ torchutils==0.0.4
74
+ tornado==6.3.2
75
+ tqdm==4.65.0
76
+ transformers==4.31.0
77
+ triton==2.0.0
78
+ typing_extensions==4.7.1
79
+ tzdata==2023.3
80
+ tzlocal==4.3.1
81
+ urllib3==2.0.4
82
+ validators==0.20.0
83
+ watchdog==3.0.0
84
+ zipp==3.16.2