Awlly committed
Commit a15e210
1 Parent(s): 67ee02a
__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (2.32 kB)
 
app_models/__pycache__/bag_of_words_MODEL.cpython-310.pyc ADDED
Binary file (630 Bytes)
 
app_models/__pycache__/gpt_MODEL.cpython-310.pyc ADDED
Binary file (1.08 kB)
 
app_models/__pycache__/lstm_MODEL.cpython-310.pyc ADDED
Binary file (3.49 kB)
 
app_models/__pycache__/rubert_MODEL.cpython-310.pyc ADDED
Binary file (1.43 kB)
 
app_models/__pycache__/toxicity_MODEL.cpython-310.pyc ADDED
Binary file (985 Bytes)
 
app_models/bag_of_words_MODEL.py ADDED
@@ -0,0 +1,18 @@
+ import joblib
+
+ from preprocessing import data_preprocessing
+
+ # Load the trained BoW/TF-IDF vectorizer and classifier
+ vectorizer_path = 'model_data/bow_vectorizer.joblib'
+ model_path = 'model_data/bow_model.joblib'
+ vectorizer = joblib.load(vectorizer_path)
+ model = joblib.load(model_path)
+
+
+ def predict(text):
+     """Preprocess the input text and classify it with the BoW model."""
+     processed_text = data_preprocessing(text)
+     user_input_bow = vectorizer.transform([processed_text])
+     # Make a prediction
+     prediction = model.predict(user_input_bow)
+     return prediction
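
Note: a minimal usage sketch for predict (not part of the commit; the sample review is hypothetical and the label encoding depends on how the BoW model was trained):

    from app_models.bag_of_words_MODEL import predict

    # predict returns a one-element array; index it for the raw class label
    label = predict("Отличный фильм, всем советую!")[0]
    print(label)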
app_models/gpt_MODEL.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+ # Load the fine-tuned model and tokenizer bundled with the repo
+ model_path = 'model_data/finetuned_gpt'
+ tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+ model = GPT2LMHeadModel.from_pretrained(model_path)
+
+ # Move the model to GPU if available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+
+ def generate_text(prompt_text, length, temperature):
+     encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
+     encoded_prompt = encoded_prompt.to(device)
+
+     output_sequences = model.generate(
+         input_ids=encoded_prompt,
+         max_length=length,
+         temperature=temperature,
+         top_k=20,
+         top_p=0.9,
+         repetition_penalty=1.2,
+         do_sample=True,
+         num_return_sequences=1,
+     )
+
+     # Decode the generated sequence
+     generated_sequence = output_sequences[0].tolist()
+     text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+
+     # Strip the prompt from the generated text
+     text = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)):]
+
+     return text.strip()
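
Note: a minimal usage sketch for generate_text (not part of the commit; assumes the fine-tuned rugpt3small checkpoint loads from model_data/finetuned_gpt):

    from app_models.gpt_MODEL import generate_text

    # max_length counts the prompt tokens too; lower temperature samples more conservatively
    continuation = generate_text("Этот фильм", length=80, temperature=0.7)
    print(continuation)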
app_models/lstm_MODEL.py ADDED
@@ -0,0 +1,93 @@
+ import json
+
+ import torch
+ import torch.nn as nn
+ from dataclasses import dataclass
+
+ from preprocessing import preprocess_single_string
+
+ with open('model_data/vocab_kinopoisk_lstm.json', 'r') as file:
+     vocab_to_int = json.load(file)
+
+
+ @dataclass
+ class ConfigRNN:
+     vocab_size: int
+     device: str
+     n_layers: int
+     embedding_dim: int
+     hidden_size: int
+     seq_len: int
+     bidirectional: bool
+
+
+ net_config = ConfigRNN(
+     vocab_size=len(vocab_to_int) + 1,
+     device='cpu',
+     n_layers=3,
+     embedding_dim=64,
+     hidden_size=64,
+     seq_len=100,
+     bidirectional=False,
+ )
+
+
+ class LSTMClassifier(nn.Module):
+     def __init__(self, rnn_conf=net_config) -> None:
+         super().__init__()
+
+         self.embedding_dim = rnn_conf.embedding_dim
+         self.hidden_size = rnn_conf.hidden_size
+         self.bidirectional = rnn_conf.bidirectional
+         self.n_layers = rnn_conf.n_layers
+
+         self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
+         self.lstm = nn.LSTM(
+             input_size=self.embedding_dim,
+             hidden_size=self.hidden_size,
+             bidirectional=self.bidirectional,
+             batch_first=True,
+             num_layers=self.n_layers,
+         )
+         self.bidirect_factor = 2 if self.bidirectional else 1
+         self.clf = nn.Sequential(
+             nn.Linear(self.hidden_size * self.bidirect_factor, 32),
+             nn.Tanh(),
+             nn.Dropout(),
+             nn.Linear(32, 3),
+         )
+
+     def model_description(self):
+         direction = 'bidirect' if self.bidirectional else 'onedirect'
+         return f'lstm_{direction}_{self.n_layers}'
+
+     def forward(self, x: torch.Tensor):
+         embeddings = self.embedding(x)
+         out, _ = self.lstm(embeddings)
+         out = out[:, -1, :]  # keep the last hidden state for every element of the batch
+         out = self.clf(out)  # out is already [batch, hidden]; squeezing would drop the batch dim for batch size 1
+         return out
+
+
+ def load_lstm_model():
+     model = LSTMClassifier()
+     model.load_state_dict(torch.load('model_data/lstm_model.pth', map_location=torch.device('cpu')))
+     model.eval()
+     return model
+
+
+ model = load_lstm_model()
+
+
+ def predict_review(review_text, model=model, net_config=net_config, vocab_to_int=vocab_to_int):
+     sample = preprocess_single_string(review_text, net_config.seq_len, vocab_to_int)
+     model.eval()
+     with torch.no_grad():
+         output = model(sample.unsqueeze(0))
+     _, predicted_class = torch.max(output, dim=1)
+     if predicted_class.item() == 0:
+         return "This is a positive comment! Great that you liked this movie! You can head over to the GPT2 model section and discuss the film with it!"
+     elif predicted_class.item() == 1:
+         return "Most likely... this is a neutral comment... you're a bit boring, aren't you..."
+     else:
+         return "Why so toxic? Show some restraint: if you didn't like the movie, just move on and don't spoil the mood for its makers, they did their best!"
app_models/rubert_MODEL.py ADDED
@@ -0,0 +1,30 @@
+ import joblib
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ # Load the RuBERT encoder and tokenizer
+ rubert_model_name = "cointegrated/rubert-tiny2"
+ tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
+ model = AutoModel.from_pretrained(rubert_model_name)
+
+ # Load the logistic regression classifier bundled with the repo
+ logreg_model_path = "model_data/logreg_model_v2.joblib"
+ logreg_model = joblib.load(logreg_model_path)
+
+
+ def embed_bert_cls(text, model, tokenizer):
+     """Generate a normalized [CLS] embedding for the input text using RuBERT."""
+     inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs)
+     embeddings = outputs.last_hidden_state[:, 0, :]
+     embeddings = torch.nn.functional.normalize(embeddings)
+     return embeddings.cpu().numpy()
+
+
+ def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
+     """Classify text as Good/Neutral/Bad using RuBERT embeddings and logistic regression."""
+     embeddings = embed_bert_cls(text, model, tokenizer)
+     prediction = classifier.predict(embeddings)
+     dict_class = {0: 'Good', 1: 'Neutral', 2: 'Bad'}
+     return dict_class[prediction[0]]
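
Note: a usage sketch for classify_text (not part of the commit; assumes the bundled logistic regression was trained on these normalized [CLS] embeddings):

    from app_models.rubert_MODEL import classify_text

    print(classify_text("Фильм понравился, актеры играют отлично"))  # -> 'Good', 'Neutral', or 'Bad'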
app_models/toxicity_MODEL.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+ if torch.cuda.is_available():
+     model.cuda()
+
+
+ def text2toxicity(text, aggregate=True):
+     """Calculate the toxicity of a text (if aggregate=True) or a vector of toxicity aspects (if aggregate=False)."""
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
+         proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
+     if isinstance(text, str):
+         proba = proba[0]
+     if aggregate:
+         # Combine the per-aspect scores: 1 - P(non-toxic) * (1 - P(dangerous))
+         return 1 - proba.T[0] * (1 - proba.T[-1])
+     return proba
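
Note: a usage sketch for text2toxicity (not part of the commit). With aggregate=True the per-aspect sigmoid probabilities collapse into one score, 1 - P(non-toxic) * (1 - P(dangerous)); with aggregate=False the raw vector is returned:

    from app_models.toxicity_MODEL import text2toxicity

    print(text2toxicity("Привет, как дела?"))                   # single score in [0, 1]
    print(text2toxicity("Привет, как дела?", aggregate=False))  # one probability per aspect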
app_pages/__pycache__/page1_model_comparison.cpython-310.pyc ADDED
Binary file (904 Bytes)
 
app_pages/__pycache__/page2_rubert_toxicity.cpython-310.pyc ADDED
Binary file (794 Bytes)
 
app_pages/__pycache__/page3_gpt_model.cpython-310.pyc ADDED
Binary file (845 Bytes)
 
app_pages/page1_model_comparison.py ADDED
@@ -0,0 +1,21 @@
+ import streamlit as st
+
+ from app_models.bag_of_words_MODEL import predict
+ from app_models.lstm_MODEL import predict_review
+ from app_models.rubert_MODEL import classify_text
+
+ class_prefix = 'This review is likely... '
+
+
+ def run():
+     st.title("Movie Review Classification")
+     st.write("This page compares three models: Bag of Words/TF-IDF, LSTM, and BERT.")
+
+     user_input = st.text_area("Enter a movie review to classify")
+
+     if st.button('Classify with BoW/TF-IDF'):
+         st.write(f'{class_prefix}{predict(user_input)[0]}')
+     if st.button('Classify with LSTM'):
+         st.write(f'{class_prefix}{predict_review(user_input)}')
+     if st.button('Classify with ruBERT'):
+         st.write(f'{class_prefix}{classify_text(user_input)}')
app_pages/page2_rubert_toxicity.py ADDED
@@ -0,0 +1,20 @@
+ import streamlit as st
+
+ from app_models.toxicity_MODEL import text2toxicity
+
+
+ def run():
+     st.title('Toxicity Detection')
+     st.write('This tool scores text for toxicity using RuBERT.')
+
+     user_input = st.text_area("Enter text to classify", "Type your text here...")
+
+     if st.button('Classify'):
+         toxicity_score = text2toxicity(user_input)
+         st.write('Toxicity score:', toxicity_score)
+
+         # Interpret the score for the user
+         if toxicity_score > 0.5:
+             st.write("This text is likely to be considered toxic.")
+         else:
+             st.write("This text is likely to be considered non-toxic.")
app_pages/page3_gpt_model.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+ from app_models.gpt_MODEL import generate_text
+
+
+ def run():
+     st.title('GPT Text Generation')
+     prompt_text = st.text_area("Input Text", "Type here...")
+     length = st.slider("Length of Generated Text", min_value=50, max_value=500, value=200)
+     temperature = st.slider("Temperature", min_value=0.1, max_value=1.0, value=0.7, step=0.1)
+
+     if st.button('Generate Text'):
+         with st.spinner('Generating...'):
+             generated_text = generate_text(prompt_text, length, temperature)
+             st.text_area("Generated Text", generated_text, height=250)
main_app.py ADDED
@@ -0,0 +1,13 @@
+ import streamlit as st
+
+ from app_pages import page1_model_comparison, page2_rubert_toxicity, page3_gpt_model
+
+ st.sidebar.title('Navigation')
+ selection = st.sidebar.radio("Go to", ["Model Comparison", "RuBERT Toxicity Detection", "GPT Model"])
+
+ if selection == "Model Comparison":
+     page1_model_comparison.run()
+ elif selection == "RuBERT Toxicity Detection":
+     page2_rubert_toxicity.run()
+ elif selection == "GPT Model":
+     page3_gpt_model.run()
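
Note (not part of the commit): with this layout the app is a standard Streamlit entry point and would be started from the repo root with:

    streamlit run main_app.py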
model_data/bow_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5b6a472e5fd4a44099fdde129a19e7fdf4f7c078d88b9fb53bd0ed4508a46ac
+ size 3942479
model_data/bow_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d760947efd85a7b54b7f17c20215386cda613e8329a1c55a76cc8e4707faae19
+ size 4126902
model_data/finetuned_gpt/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 1,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 2048,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 2048,
+   "pad_token_id": 0,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2",
+   "use_cache": true,
+   "vocab_size": 50264
+ }
model_data/finetuned_gpt/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.2"
+ }
model_data/finetuned_gpt/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model_data/finetuned_gpt/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cf4a373f976b99cfdc892f83394728c1d04a62d45b0be7e923fcb9b4128d6ba
+ size 500941440
model_data/finetuned_gpt/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
model_data/finetuned_gpt/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 2048,
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "tokenizer_class": "GPT2Tokenizer",
+   "truncation_side": "left",
+   "trust_remote_code": false,
+   "unk_token": "<unk>"
+ }
model_data/finetuned_gpt/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model_data/logreg_model_v2.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92d11a87982a5d6f81eb49df6ceb5640aedefbf692df41f1c03b201c8bd7f032
+ size 8383
model_data/lstm_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba752f56cf91b275b17d9e7f774661db66297e074d2cfedf865645dce1045b43
+ size 4496930
model_data/vocab_kinopoisk_lstm.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing.py ADDED
@@ -0,0 +1,64 @@
+ import re
+ import unicodedata
+
+ import nltk
+ import numpy as np
+ import torch
+ from nltk.corpus import stopwords
+
+ # stopwords.words() takes a list of languages; a second positional
+ # argument would be misread as ignore_lines_startswith
+ try:
+     stop_words = set(stopwords.words(['russian', 'english']))
+ except LookupError:
+     nltk.download('stopwords', quiet=True)
+     stop_words = set(stopwords.words(['russian', 'english']))
+
+
+ def data_preprocessing(text: str) -> str:
+     """Lowercase, then strip HTML tags, punctuation, stop words, and digits."""
+     text = text.lower()
+     text = text.replace('-', ' ').replace('\n', ' ')
+     text = re.sub('<.*?>', '', text)
+     # Keep letters, numbers, separators, and apostrophes
+     text = ''.join([c for c in text if unicodedata.category(c).startswith(('L', 'N', 'Z')) or c == "'"])
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = ' '.join([word for word in text.split() if not word.isdigit()])
+     return text
+
+
+ def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
+     """Keep only (word, count) pairs that occur more than n times."""
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+
+ def padding(review_int: list, seq_len: int) -> np.ndarray:
+     """Left-pad (or truncate) each encoded review to exactly seq_len tokens."""
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+     return features
+
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False,
+ ) -> torch.Tensor:
+     """Clean a string, map words to vocab indices, and pad to seq_len."""
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+     result_padded = padding([result_list], seq_len)[0]
+     return torch.tensor(result_padded)
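
Note: a quick sketch of the cleaning pipeline (not part of the commit; the exact output depends on NLTK's stop word lists):

    from preprocessing import data_preprocessing

    # HTML tags, punctuation, stop words, and bare digits are stripped
    print(data_preprocessing("Это <b>отличный</b> фильм 2024 года!"))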
requirements.txt ADDED
@@ -0,0 +1,115 @@
+ accelerate==0.26.1
+ altair==5.2.0
+ asttokens==2.4.1
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.2
+ certifi==2023.11.17
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.1
+ contourpy==1.2.0
+ cycler==0.12.1
+ debugpy==1.8.0
+ decorator==5.1.1
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ filelock==3.13.1
+ fonttools==4.47.2
+ fsspec==2023.12.2
+ gensim==4.3.2
+ gitdb==4.0.11
+ GitPython==3.1.41
+ huggingface-hub==0.20.3
+ idna==3.6
+ imbalanced-learn==0.12.0
+ imblearn==0.0
+ importlib-metadata==7.0.1
+ ipykernel==6.29.0
+ ipython==8.21.0
+ jedi==0.19.1
+ Jinja2==3.1.3
+ joblib==1.3.2
+ jsonlines==4.0.0
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ jupyter_client==8.6.0
+ jupyter_core==5.7.1
+ kiwisolver==1.4.5
+ lightning-utilities==0.10.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.4
+ matplotlib==3.8.2
+ matplotlib-inline==0.1.6
+ mdurl==0.1.2
+ mpmath==1.3.0
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.3
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.19.3
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ packaging==23.2
+ pandas==2.2.0
+ parso==0.8.3
+ pexpect==4.9.0
+ pillow==10.2.0
+ platformdirs==4.2.0
+ prompt-toolkit==3.0.43
+ protobuf==4.25.2
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow==15.0.0
+ pydeck==0.8.1b0
+ Pygments==2.17.2
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ pytz==2023.4
+ PyYAML==6.0.1
+ pyzmq==25.1.2
+ referencing==0.33.0
+ regex==2023.12.25
+ requests==2.31.0
+ rich==13.7.0
+ rpds-py==0.17.1
+ safetensors==0.4.2
+ scikit-learn==1.4.0
+ scipy==1.12.0
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.1
+ stack-data==0.6.3
+ streamlit==1.30.0
+ sympy==1.12
+ tenacity==8.2.3
+ threadpoolctl==3.2.0
+ tokenizers==0.15.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.2.0
+ torchmetrics==1.3.0.post0
+ torchutils==0.0.4
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.14.1
+ transformers==4.37.2
+ triton==2.2.0
+ typing_extensions==4.9.0
+ tzdata==2023.4
+ tzlocal==5.2
+ urllib3==2.2.0
+ validators==0.22.0
+ watchdog==3.0.0
+ wcwidth==0.2.13
+ zipp==3.17.0