first
- __pycache__/preprocessing.cpython-310.pyc +0 -0
- app_models/__pycache__/bag_of_words_MODEL.cpython-310.pyc +0 -0
- app_models/__pycache__/gpt_MODEL.cpython-310.pyc +0 -0
- app_models/__pycache__/lstm_MODEL.cpython-310.pyc +0 -0
- app_models/__pycache__/rubert_MODEL.cpython-310.pyc +0 -0
- app_models/__pycache__/toxicity_MODEL.cpython-310.pyc +0 -0
- app_models/bag_of_words_MODEL.py +20 -0
- app_models/gpt_MODEL.py +37 -0
- app_models/lstm_MODEL.py +90 -0
- app_models/rubert_MODEL.py +33 -0
- app_models/toxicity_MODEL.py +20 -0
- app_pages/__pycache__/page1_model_comparison.cpython-310.pyc +0 -0
- app_pages/__pycache__/page2_rubert_toxicity.cpython-310.pyc +0 -0
- app_pages/__pycache__/page3_gpt_model.cpython-310.pyc +0 -0
- app_pages/page1_model_comparison.py +21 -0
- app_pages/page2_rubert_toxicity.py +20 -0
- app_pages/page3_gpt_model.py +14 -0
- main_app.py +14 -0
- model_data/bow_model.joblib +3 -0
- model_data/bow_vectorizer.joblib +3 -0
- model_data/finetuned_gpt/config.json +41 -0
- model_data/finetuned_gpt/generation_config.json +7 -0
- model_data/finetuned_gpt/merges.txt +0 -0
- model_data/finetuned_gpt/model.safetensors +3 -0
- model_data/finetuned_gpt/special_tokens_map.json +37 -0
- model_data/finetuned_gpt/tokenizer_config.json +58 -0
- model_data/finetuned_gpt/vocab.json +0 -0
- model_data/logreg_model_v2.joblib +3 -0
- model_data/lstm_model.pth +3 -0
- model_data/vocab_kinopoisk_lstm.json +0 -0
- preprocessing.py +57 -0
- requirements.txt +115 -0
__pycache__/preprocessing.cpython-310.pyc
ADDED
Binary file (2.32 kB)
app_models/__pycache__/bag_of_words_MODEL.cpython-310.pyc
ADDED
Binary file (630 Bytes)
app_models/__pycache__/gpt_MODEL.cpython-310.pyc
ADDED
Binary file (1.08 kB)
app_models/__pycache__/lstm_MODEL.cpython-310.pyc
ADDED
Binary file (3.49 kB)
app_models/__pycache__/rubert_MODEL.cpython-310.pyc
ADDED
Binary file (1.43 kB)
app_models/__pycache__/toxicity_MODEL.cpython-310.pyc
ADDED
Binary file (985 Bytes)
app_models/bag_of_words_MODEL.py
ADDED
@@ -0,0 +1,20 @@
import joblib
from preprocessing import data_preprocessing

# Load the trained BoW model and vectorizer
vectorizer_path = 'model_data/bow_vectorizer.joblib'
model_path = 'model_data/bow_model.joblib'
vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)


def predict(input_text):
    # Clean the text and vectorize it with the fitted BoW vectorizer
    processed_text = data_preprocessing(input_text)
    user_input_bow = vectorizer.transform([processed_text])
    # Make a prediction
    prediction = model.predict(user_input_bow)
    return prediction
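
A minimal usage sketch (not part of the commit) showing how predict can be called outside Streamlit; the sample review is invented, and the label-to-class mapping is an assumption since the commit does not document it:

# Hypothetical usage sketch, assuming the model files from model_data/ are present.
from app_models.bag_of_words_MODEL import predict

sample_review = "Отличный фильм, смотрел на одном дыхании!"  # invented example input
label = predict(sample_review)  # array-like with one class label
print(label[0])                 # class index as stored in the trained model (mapping assumed, not documented here)
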
app_models/gpt_MODEL.py
ADDED
@@ -0,0 +1,37 @@
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the model and tokenizer from the checkpoint shipped in this commit
# (the original code pointed at a machine-local path)
model_path = 'model_data/finetuned_gpt'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_text(prompt_text, length, temperature):
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=length,
        temperature=temperature,
        top_k=20,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1,
    )

    # Decode the generated text
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

    # Remove the prompt from the generated text
    text = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)):]

    return text.strip()
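
A short usage sketch (not part of the commit) for generate_text; the prompt and parameter values below are illustrative only:

# Hypothetical usage sketch, assuming model_data/finetuned_gpt is available locally.
from app_models.gpt_MODEL import generate_text

continuation = generate_text(
    prompt_text="Этот фильм напомнил мне",  # invented prompt
    length=120,                             # passed through as max_length to model.generate
    temperature=0.8,                        # higher values produce more varied text
)
print(continuation)
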
app_models/lstm_MODEL.py
ADDED
@@ -0,0 +1,90 @@
import json
import torch
import torch.nn as nn
from dataclasses import dataclass
from preprocessing import preprocess_single_string

with open('model_data/vocab_kinopoisk_lstm.json', 'r') as file:
    vocab_to_int = json.load(file)

@dataclass
class ConfigRNN:
    vocab_size: int
    device: str
    n_layers: int
    embedding_dim: int
    hidden_size: int
    seq_len: int
    bidirectional: bool

net_config = ConfigRNN(
    vocab_size=len(vocab_to_int) + 1,
    device='cpu',
    n_layers=3,
    embedding_dim=64,
    hidden_size=64,
    seq_len=100,
    bidirectional=False,
)


class LSTMClassifier(nn.Module):
    def __init__(self, rnn_conf=net_config) -> None:
        super().__init__()

        self.embedding_dim = rnn_conf.embedding_dim
        self.hidden_size = rnn_conf.hidden_size
        self.bidirectional = rnn_conf.bidirectional
        self.n_layers = rnn_conf.n_layers

        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True,
            num_layers=self.n_layers,
        )
        self.bidirect_factor = 2 if self.bidirectional else 1
        self.clf = nn.Sequential(
            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(32, 3),
        )

    def model_description(self):
        direction = 'bidirect' if self.bidirectional else 'onedirect'
        return f'lstm_{direction}_{self.n_layers}'

    def forward(self, x: torch.Tensor):
        embeddings = self.embedding(x)
        out, _ = self.lstm(embeddings)
        out = out[:, -1, :]  # [whole batch, last hidden state h_n, every element of h_n]
        out = self.clf(out.squeeze())
        return out


def load_lstm_model():
    model = LSTMClassifier()
    model.load_state_dict(torch.load('model_data/lstm_model.pth'))
    model.eval()
    return model

model = load_lstm_model()


def predict_review(review_text, model=model, net_config=net_config, vocab_to_int=vocab_to_int):
    sample = preprocess_single_string(review_text, net_config.seq_len, vocab_to_int)
    model.eval()
    with torch.no_grad():
        output = model(sample.unsqueeze(0)).to(net_config.device)
    if output.dim() == 1:
        output = output.unsqueeze(0)  # restore the batch dimension dropped by squeeze()
    _, predicted_class = torch.max(output, dim=1)
    if predicted_class.item() == 0:
        return ("This is a positive review! It's great that you liked the film! "
                "You can head over to the GPT-2 section and discuss the film with the model!")
    elif predicted_class.item() == 1:
        return "Most likely... this is a neutral review... how boring of you..."
    else:
        return ("Why so toxic? Show some restraint: if you didn't like the film, just move on "
                "and don't spoil the authors' mood, they did their best!")
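
For reference, a sketch (not part of the commit) of calling predict_review directly; the review string is invented:

# Hypothetical usage sketch, assuming model_data/lstm_model.pth and the vocabulary file exist.
from app_models.lstm_MODEL import predict_review

print(predict_review("Скучный и предсказуемый сюжет..."))  # prints one of the three canned responses
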
app_models/rubert_MODEL.py
ADDED
@@ -0,0 +1,33 @@
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
import joblib


# Load RuBERT model and tokenizer
rubert_model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Load the logistic regression classifier shipped in this commit
# (the original code pointed at a machine-local path)
logreg_model_path = "model_data/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)

def embed_bert_cls(text, model, tokenizer):
    """Generate embeddings for input text using the RuBERT model."""
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()

def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Classify a review as Good, Neutral or Bad using RuBERT embeddings and logistic regression."""
    embeddings = embed_bert_cls(text, model, tokenizer)
    prediction = classifier.predict(embeddings)
    dict_class = {0: 'Good', 1: 'Neutral', 2: 'Bad'}

    return dict_class[prediction[0]]
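
A sketch (not part of the commit) of classify_text in isolation; the input string is invented:

# Hypothetical usage sketch, assuming model_data/logreg_model_v2.joblib is available.
from app_models.rubert_MODEL import classify_text

print(classify_text("Прекрасная игра актёров!"))  # -> 'Good', 'Neutral' or 'Bad'
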
app_models/toxicity_MODEL.py
ADDED
@@ -0,0 +1,20 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

def text2toxicity(text, aggregate=True):
    """Calculate the toxicity of a text (if aggregate=True) or a vector of toxicity aspects (if aggregate=False)."""
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if aggregate:
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba
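
A sketch (not part of the commit) of the two output modes of text2toxicity; the input string is illustrative:

# Hypothetical usage sketch; the checkpoint is downloaded from the Hugging Face Hub on first use.
from app_models.toxicity_MODEL import text2toxicity

print(text2toxicity("Ты ужасен!"))                   # aggregated toxicity score in [0, 1]
print(text2toxicity("Ты ужасен!", aggregate=False))  # per-aspect probability vector
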
app_pages/__pycache__/page1_model_comparison.cpython-310.pyc
ADDED
Binary file (904 Bytes)
app_pages/__pycache__/page2_rubert_toxicity.cpython-310.pyc
ADDED
Binary file (794 Bytes)
app_pages/__pycache__/page3_gpt_model.cpython-310.pyc
ADDED
Binary file (845 Bytes)
app_pages/page1_model_comparison.py
ADDED
@@ -0,0 +1,21 @@
import streamlit as st
from app_models.rubert_MODEL import classify_text
from app_models.bag_of_words_MODEL import predict
from app_models.lstm_MODEL import predict_review

class_prefix = 'This review is likely... '

def run():
    st.title("Movie Review Classification")
    st.write("This page compares three models: Bag of Words/TF-IDF, LSTM, and BERT.")

    # User input
    user_input = st.text_area("Enter a movie review")

    # Buttons for model selection
    if st.button('Classify with BoW/TF-IDF'):
        st.write(f'{class_prefix}{predict(user_input)}')
    if st.button('Classify with LSTM'):
        st.write(f'{class_prefix}{predict_review(user_input)}')
    if st.button('Classify with ruBERT'):
        st.write(f'{class_prefix}{classify_text(user_input)}')
app_pages/page2_rubert_toxicity.py
ADDED
@@ -0,0 +1,20 @@
import streamlit as st

from app_models.toxicity_MODEL import text2toxicity


def run():
    st.title('Toxicity Detection')
    st.write('This tool classifies text as toxic or non-toxic using RuBERT.')

    user_input = st.text_area("Enter text to classify", "Type your text here...")

    if st.button('Classify'):
        toxicity_score = text2toxicity(user_input)
        st.write('Toxicity score:', toxicity_score)

        # Interpret the score for the user
        if toxicity_score > 0.5:
            st.write("This text is likely to be considered toxic.")
        else:
            st.write("This text is likely to be considered non-toxic.")
app_pages/page3_gpt_model.py
ADDED
@@ -0,0 +1,14 @@
import streamlit as st
from app_models.gpt_MODEL import generate_text


def run():
    st.title('GPT Text Generation')
    prompt_text = st.text_area("Input Text", "Type here...")
    length = st.slider("Length of Generated Text", min_value=50, max_value=500, value=200)
    temperature = st.slider("Temperature", min_value=0.1, max_value=1.0, value=0.7, step=0.1)

    if st.button('Generate Text'):
        with st.spinner('Generating...'):
            generated_text = generate_text(prompt_text, length, temperature)
        st.text_area("Generated Text", generated_text, height=250)
main_app.py
ADDED
@@ -0,0 +1,14 @@
import streamlit as st

from app_pages import page1_model_comparison, page2_rubert_toxicity, page3_gpt_model

st.sidebar.title('Navigation')
selection = st.sidebar.radio("Go to", ["Model Comparison", "RuBERT Toxicity Detection", "GPT Model"])

if selection == "Model Comparison":
    page1_model_comparison.run()
elif selection == "RuBERT Toxicity Detection":
    page2_rubert_toxicity.run()
elif selection == "GPT Model":
    page3_gpt_model.run()
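
With the dependencies from requirements.txt installed, this entry point is launched the standard Streamlit way: streamlit run main_app.py.
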
model_data/bow_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5b6a472e5fd4a44099fdde129a19e7fdf4f7c078d88b9fb53bd0ed4508a46ac
size 3942479
model_data/bow_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d760947efd85a7b54b7f17c20215386cda613e8329a1c55a76cc8e4707faae19
size 4126902
model_data/finetuned_gpt/config.json
ADDED
@@ -0,0 +1,41 @@
{
  "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 1,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "use_cache": true,
  "vocab_size": 50264
}
model_data/finetuned_gpt/generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.37.2"
}
model_data/finetuned_gpt/merges.txt
ADDED
The diff for this file is too large to render.
model_data/finetuned_gpt/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9cf4a373f976b99cfdc892f83394728c1d04a62d45b0be7e923fcb9b4128d6ba
size 500941440
model_data/finetuned_gpt/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
model_data/finetuned_gpt/tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
{
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "</s>",
  "errors": "replace",
  "mask_token": "<mask>",
  "model_max_length": 2048,
  "pad_token": "<pad>",
  "padding_side": "left",
  "tokenizer_class": "GPT2Tokenizer",
  "truncation_side": "left",
  "trust_remote_code": false,
  "unk_token": "<unk>"
}
model_data/finetuned_gpt/vocab.json
ADDED
The diff for this file is too large to render.
model_data/logreg_model_v2.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92d11a87982a5d6f81eb49df6ceb5640aedefbf692df41f1c03b201c8bd7f032
size 8383
model_data/lstm_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba752f56cf91b275b17d9e7f774661db66297e074d2cfedf865645dce1045b43
size 4496930
model_data/vocab_kinopoisk_lstm.json
ADDED
The diff for this file is too large to render.
preprocessing.py
ADDED
@@ -0,0 +1,57 @@
import re
import numpy as np
import torch
import unicodedata

from nltk.corpus import stopwords
# Requires the NLTK stopwords corpus: nltk.download('stopwords').
# Note: stopwords.words('russian', 'english') silently misused the second
# positional argument; a list of languages is the correct call.
stop_words = set(stopwords.words(['russian', 'english']))

def data_preprocessing(text: str) -> str:
    """Lowercase, strip HTML tags, punctuation, stop words and digits."""
    text = text.lower()
    text = text.replace('-', ' ').replace('\n', ' ')

    text = re.sub('<.*?>', '', text)
    text = ''.join([c for c in text if unicodedata.category(c).startswith(('L', 'N', 'Z')) or c == "'"])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([word for word in text.split() if not word.isdigit()])
    return text


def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    return list(filter(lambda x: x[1] > n, sorted_words))

def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) each encoded review to exactly seq_len tokens."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)

    return features

def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False,
) -> torch.Tensor:
    """Clean a raw string, map its words to vocabulary indices and pad to seq_len."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
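
A sketch (not part of the commit) of the preprocessing pipeline end to end; the toy vocabulary and input text are invented for illustration:

# Hypothetical usage sketch, assuming the NLTK stopwords corpus is downloaded.
from preprocessing import data_preprocessing, preprocess_single_string

toy_vocab = {'фильм': 1, 'отличный': 2}              # invented vocabulary
print(data_preprocessing("Отличный <b>фильм</b>!"))  # -> 'отличный фильм'
tensor = preprocess_single_string("Отличный фильм!", seq_len=10, vocab_to_int=toy_vocab)
print(tensor)  # left-padded tensor of length 10, e.g. [0, 0, ..., 2, 1]
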
requirements.txt
ADDED
@@ -0,0 +1,115 @@
accelerate==0.26.1
altair==5.2.0
asttokens==2.4.1
attrs==23.2.0
blinker==1.7.0
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.1
contourpy==1.2.0
cycler==0.12.1
debugpy==1.8.0
decorator==5.1.1
exceptiongroup==1.2.0
executing==2.0.1
filelock==3.13.1
fonttools==4.47.2
fsspec==2023.12.2
gensim==4.3.2
gitdb==4.0.11
GitPython==3.1.41
huggingface-hub==0.20.3
idna==3.6
imbalanced-learn==0.12.0
imblearn==0.0
importlib-metadata==7.0.1
ipykernel==6.29.0
ipython==8.21.0
jedi==0.19.1
Jinja2==3.1.3
joblib==1.3.2
jsonlines==4.0.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter_client==8.6.0
jupyter_core==5.7.1
kiwisolver==1.4.5
lightning-utilities==0.10.1
markdown-it-py==3.0.0
MarkupSafe==2.1.4
matplotlib==3.8.2
matplotlib-inline==0.1.6
mdurl==0.1.2
mpmath==1.3.0
nest-asyncio==1.6.0
networkx==3.2.1
nltk==3.8.1
numpy==1.26.3
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu12==12.1.105
packaging==23.2
pandas==2.2.0
parso==0.8.3
pexpect==4.9.0
pillow==10.2.0
platformdirs==4.2.0
prompt-toolkit==3.0.43
protobuf==4.25.2
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==15.0.0
pydeck==0.8.1b0
Pygments==2.17.2
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2023.4
PyYAML==6.0.1
pyzmq==25.1.2
referencing==0.33.0
regex==2023.12.25
requests==2.31.0
rich==13.7.0
rpds-py==0.17.1
safetensors==0.4.2
scikit-learn==1.4.0
scipy==1.12.0
six==1.16.0
smart-open==6.4.0
smmap==5.0.1
stack-data==0.6.3
streamlit==1.30.0
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.2.0
tokenizers==0.15.1
toml==0.10.2
toolz==0.12.1
torch==2.2.0
torchmetrics==1.3.0.post0
torchutils==0.0.4
tornado==6.4
tqdm==4.66.1
traitlets==5.14.1
transformers==4.37.2
triton==2.2.0
typing_extensions==4.9.0
tzdata==2023.4
tzlocal==5.2
urllib3==2.2.0
validators==0.22.0
watchdog==3.0.0
wcwidth==0.2.13
zipp==3.17.0