Maslov-Artem committed on
Commit cb2adb5
1 Parent(s): 7983c1c

Add 3 classifiers

17/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 1,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 2048,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 2048,
+   "pad_token_id": 0,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "use_cache": true,
+   "vocab_size": 50264
+ }
17/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.38.2"
+ }
17/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a99f27f7efc5a609d3bb2f30d15980d3384ecd47f4b0806c251523071a7648a
+ size 500941440
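
The three files above are the fine-tuned ruGPT-3 checkpoint that pages/text_generator.py reads from the 17/ directory. A minimal loading sketch, mirroring the code this commit adds to pages/text_generator.py (the tokenizer is taken from the base sberbank-ai checkpoint, since no tokenizer files are added under 17/):

    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    # Tokenizer files are not included in 17/, so reuse the base model's tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

    # config.json, generation_config.json and model.safetensors are read from 17/
    model = GPT2LMHeadModel.from_pretrained("17/")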
model/__init__.py ADDED
File without changes
model/best_bert_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0e1a89d2cb79075e1a4de471ef11654117952234bb65dee721e2909099fa4d4
+ size 117120027
model/funcs.py ADDED
@@ -0,0 +1,220 @@
+ import os
+
+ import matplotlib.pyplot as plt
+ import torch
+ import torch.nn as nn
+ from sklearn.metrics import f1_score
+ from torch.utils.data import Dataset
+
+
+ def create_model_and_tokenizer(model_class, tokenizer_class, pretrained_weights):
+     # Create the tokenizer and model objects
+     tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+     model = model_class.from_pretrained(pretrained_weights)
+     return model, tokenizer
+
+
+ def train_model(
+     DEVICE, epochs, model, train_loader, valid_loader, optimizer, criterion
+ ):
+     # Create a folder for saving weights if it does not exist yet
+     if not os.path.exists("weights"):
+         os.makedirs("weights")
+
+     # Initialize lists for storing loss and accuracy values
+     train_losses = []
+     train_accuracies = []
+     val_losses = []
+     val_accuracies = []
+     val_f1_scores = []
+
+     best_val_loss = float("inf")
+
+     for epoch in range(epochs):
+         model.train()
+         train_loss = 0.0
+         total = 0
+         correct = 0
+         for batch in train_loader:
+             optimizer.zero_grad()
+             input_ids, attention_mask, labels = batch
+             input_ids = input_ids.to(DEVICE)
+             attention_mask = attention_mask.to(DEVICE)
+             labels = labels.to(DEVICE)
+             outputs = model(input_ids, attention_mask=attention_mask)
+             loss = criterion(outputs, labels.float().unsqueeze(1))
+             loss.backward()
+             optimizer.step()
+             train_loss += loss.item()
+             preds = torch.round(torch.sigmoid(outputs))
+             total += labels.size(0)
+             correct += (preds == labels.unsqueeze(1)).sum().item()
+
+         accuracy = correct / total
+         avg_train_loss = train_loss / len(train_loader)
+         train_losses.append(avg_train_loss)
+         train_accuracies.append(accuracy)
+
+         model.eval()
+         val_loss = 0.0
+         total_preds = []
+         total_labels = []
+         with torch.no_grad():
+             total = 0
+             correct = 0
+             for batch in valid_loader:
+                 input_ids, attention_mask, labels = batch
+                 input_ids = input_ids.to(DEVICE)
+                 attention_mask = attention_mask.to(DEVICE)
+                 labels = labels.to(DEVICE)
+                 outputs = model(input_ids, attention_mask=attention_mask)
+                 loss = criterion(outputs, labels.float().unsqueeze(1))
+                 val_loss += loss.item()
+                 preds = torch.round(torch.sigmoid(outputs))
+                 total += labels.size(0)
+                 correct += (preds == labels.unsqueeze(1)).sum().item()
+                 total_preds.extend(preds.detach().cpu().numpy())
+                 total_labels.extend(labels.detach().cpu().numpy())
+
+         accuracy = correct / total
+         f1 = f1_score(total_labels, total_preds)
+         avg_val_loss = val_loss / len(valid_loader)
+         val_losses.append(avg_val_loss)
+         val_accuracies.append(accuracy)
+         val_f1_scores.append(f1)
+
+         # If this is the best model so far, save the weights
+         if avg_val_loss < best_val_loss:
+             best_val_loss = avg_val_loss
+             torch.save(model.state_dict(), "weights/best_bert_weights.pth")
+
+         print(f"Epoch {epoch+1}")
+         print(
+             f"Training Loss: {train_losses[-1]:.4f}. Validation Loss: {val_losses[-1]:.4f}"
+         )
+         print(
+             f"Training Accuracy : {train_accuracies[-1]:.4f}. Validation Accuracy : {val_accuracies[-1]:.4f}"
+         )
+         print(25 * "==")
+
+     return train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
+
+
+ def predict_sentiment(text, model, tokenizer, DEVICE):
+     # The model must be in evaluation mode
+     model.eval()
+
+     # Tokenize the text and convert it to tensors
+     encoding = tokenizer.encode_plus(
+         text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
+     )
+     input_ids = encoding["input_ids"].to(DEVICE)
+     attention_mask = encoding["attention_mask"].to(DEVICE)
+
+     # Run the text through the model
+     with torch.no_grad():
+         output = model(input_ids, attention_mask=attention_mask)
+
+     # Convert the model output into a probability with a sigmoid
+     probability = torch.sigmoid(output).item()
+
+     # Decision threshold
+     threshold = 0.5
+
+     # Return the predicted class: 1 for a positive review, 0 for a negative one
+     if probability >= threshold:
+         return 1
+         # return f"This is a positive review with probability {probability*100:.2f}%"
+     else:
+         return 0
+         # return f"This is a negative review with probability {(1-probability)*100:.2f}%"
+
+
+ def load_model(model_class, pretrained_weights, weights_path):
+     # Create a classifier instance
+     model = ruBERTClassifier(model_class, pretrained_weights)
+
+     # Load the trained weights
+     model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+
+     return model
+
+
+ def plot_metrics(
+     train_losses, train_accuracies, val_losses, val_accuracies, val_f1_scores
+ ):
+     epochs = range(1, len(train_losses) + 1)
+
+     fig, axs = plt.subplots(1, 2, figsize=(15, 5))
+
+     # First subplot: losses
+     axs[0].plot(epochs, train_losses, "r--", label="Training Loss")
+     axs[0].plot(epochs, val_losses, "b--", linewidth=2, label="Validation Loss")
+     axs[0].set_title("Training and Validation Loss")
+     axs[0].set_xlabel("Epochs")
+     axs[0].set_ylabel("Loss")
+     axs[0].legend()
+
+     # Second subplot: accuracy and F1 score
+     axs[1].plot(epochs, train_accuracies, "r-", linewidth=2, label="Training Accuracy")
+     axs[1].plot(epochs, val_accuracies, "b-", linewidth=2, label="Validation Accuracy")
+     axs[1].plot(epochs, val_f1_scores, "g-", linewidth=2, label="Validation F1 Score")
+     axs[1].set_title("Training and Validation Accuracy and F1 Score")
+     axs[1].set_xlabel("Epochs")
+     axs[1].set_ylabel("Metric Value")
+     axs[1].legend()
+
+     plt.tight_layout()
+     plt.savefig("metrics_plot.png")  # Save the figure to a file
+     plt.show()
+
+
+ class TextClassificationDataset(Dataset):
+     def __init__(self, texts, labels, tokenizer):
+         self.texts = texts
+         self.labels = labels
+         self.tokenizer = tokenizer
+
+     def __len__(self):
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         text = self.texts[idx]
+         label = self.labels[idx]
+         encoding = self.tokenizer.encode_plus(
+             text,
+             padding="max_length",
+             truncation=True,
+             max_length=512,
+             return_tensors="pt",
+         )
+         return (
+             encoding["input_ids"].squeeze(),
+             encoding["attention_mask"].squeeze(),
+             torch.tensor(label),
+         )
+
+
+ class ruBERTClassifier(nn.Module):
+     def __init__(self, model_class, pretrained_weights):
+         super().__init__()
+         self.bert = model_class.from_pretrained(pretrained_weights)
+         # Freeze all parameters
+         for param in self.bert.parameters():
+             param.requires_grad = False
+
+         # Unfreeze the BertPooler layer
+         for param in self.bert.pooler.parameters():
+             param.requires_grad = True
+
+         self.linear = nn.Sequential(
+             nn.Linear(312, 256),
+             nn.ReLU(),
+             nn.Dropout(),
+             nn.Linear(256, 1),
+         )
+
+     def forward(self, x, attention_mask):
+         bert_out = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
+         out = self.linear(bert_out)
+         return out
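
For reference, a minimal sketch of how these helpers are combined for inference, following the wiring that pages/review_predictor.py sets up later in this commit (the checkpoint name and weights path are the ones used there; the sample review string is only an illustration):

    import transformers

    from model.funcs import load_model, predict_sentiment

    # ruBERTClassifier wraps a frozen cointegrated/rubert-tiny2 encoder (hidden size 312)
    # plus the small trainable MLP head whose weights live in best_bert_weights.pth
    model = load_model(
        transformers.AutoModel, "cointegrated/rubert-tiny2", "model/best_bert_weights.pth"
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

    # Returns 1 for a positive review, 0 for a negative one
    label = predict_sentiment("Отличный товар, рекомендую!", model, tokenizer, "cpu")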
model/int_vocab.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c109e13d36a06af12b0a0b65fe09cf5af212a12d95ad715b272d3e0a757ca9c7
+ size 13374732
model/model.py ADDED
@@ -0,0 +1,113 @@
+
+ from typing import Tuple
+ import torch
+ import torch.nn as nn
+
+ HIDDEN_SIZE = 32
+ VOCAB_SIZE = 196906
+ EMBEDDING_DIM = 64  # embedding_dim
+ SEQ_LEN = 100
+ BATCH_SIZE = 64
+
+
+ class BahdanauAttention(nn.Module):
+     def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
+
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.W_q = nn.Linear(hidden_size, hidden_size)
+         self.W_k = nn.Linear(hidden_size, hidden_size)
+         self.W_v = nn.Linear(hidden_size, 1)
+
+         self.tanh = nn.Tanh()
+
+     def forward(
+         self,
+         lstm_outputs: torch.Tensor,  # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+         final_hidden: torch.Tensor,  # BATCH_SIZE x HIDDEN_SIZE
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+         """Bahdanau Attention module
+
+         Args:
+             keys (torch.Tensor): lstm hidden states (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+             query (torch.Tensor): lstm final hidden state (BATCH_SIZE, HIDDEN_SIZE)
+
+         Returns:
+             Tuple[torch.Tensor]:
+                 context_matrix (BATCH_SIZE, HIDDEN_SIZE)
+                 attention scores (BATCH_SIZE, SEQ_LEN)
+         """
+         # input:
+         # keys – lstm hidden states (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+         # query - lstm final hidden state (BATCH_SIZE, HIDDEN_SIZE)
+
+         keys = self.W_k(lstm_outputs)
+         # print(f'After linear keys: {keys.shape}')
+
+         query = self.W_q(final_hidden)
+         # print(f"After linear query: {query.shape}")
+
+         # print(f"query.unsqueeze(1) {query.unsqueeze(1).shape}")
+
+         sum = query.unsqueeze(1) + keys
+         # print(f"After sum: {sum.shape}")
+
+         tanhed = self.tanh(sum)
+         # print(f"After tanhed: {tanhed.shape}")
+
+         vector = self.W_v(tanhed).squeeze(-1)
+         # print(f"After linear vector: {vector.shape}")
+
+         att_weights = torch.softmax(vector, -1)
+         # print(f"After softmax att_weights: {att_weights.shape}")
+
+         context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze()
+         # print(f"After bmm context: {context.shape}")
+
+         return context, att_weights
+
+         # att_weights = self.linear(lstm_outputs)
+         # # print(f'After linear: {att_weights.shape, final_hidden.unsqueeze(2).shape}')
+
+         # att_weights = self.linear(lstm_outputs)
+         # # print(f'After linear: {att_weights.shape, final_hidden.unsqueeze(2).shape}')
+         # att_weights = torch.bmm(att_weights, final_hidden.unsqueeze(2))
+         # # print(f'After bmm: {att_weights.shape}')
+         # att_weights = F.softmax(att_weights.squeeze(2), dim=1)
+         # # print(f'After softmax: {att_weights.shape}')
+         # cntxt = torch.bmm(lstm_outputs.transpose(1, 2), att_weights.unsqueeze(2))
+         # # print(f'Context: {cntxt.shape}')
+         # concatted = torch.cat((cntxt, final_hidden.unsqueeze(2)), dim=1)
+         # # print(f'Concatted: {concatted.shape}')
+         # att_hidden = self.tanh(self.align(concatted.squeeze(-1)))
+         # # print(f'Att Hidden: {att_hidden.shape}')
+         # return att_hidden, att_weights
+
+ # Test on random numbers
+ BahdanauAttention()(torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE), torch.randn(BATCH_SIZE, HIDDEN_SIZE))[1].shape
+
+
+ class LSTMConcatAttentionEmbed(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+
+         self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
+         # self.embedding = embedding_layer
+         self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+         self.attn = BahdanauAttention(HIDDEN_SIZE)
+         self.clf = nn.Sequential(
+             nn.Linear(HIDDEN_SIZE, 128),
+             nn.Dropout(),
+             nn.Tanh(),
+             nn.Linear(128, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         outputs, (h_n, _) = self.lstm(embeddings)
+         att_hidden, att_weights = self.attn(outputs, h_n.squeeze(0))
+         out = self.clf(att_hidden)
+         return out, att_weights
+
+
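
A minimal inference sketch for the LSTM classifier above, following plot_and_predict in pages/review_predictor.py (SEQ_LEN=25 and the 0.75 threshold are the values used there; the review text is only an illustration):

    import json

    import torch

    from model.model import LSTMConcatAttentionEmbed
    from preprocessing.rnn_preprocessing import preprocess_single_string

    model = LSTMConcatAttentionEmbed()
    model.load_state_dict(torch.load("model/model_weights.pt", map_location="cpu"))
    model.eval()

    with open("model/vocab.json") as f:
        vocab_to_int = json.load(f)

    # Encode the review into a fixed-length sequence of token ids and classify it
    inp = preprocess_single_string("отличный товар", 25, vocab_to_int)
    with torch.inference_mode():
        logit, att_weights = model(inp.long().unsqueeze(0))
    label = 1 if logit.sigmoid().item() > 0.75 else 0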
model/model_weights.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de960bfb6327e0509297628c3cec5bc456e6dc681b29aca9bead6330e941d44e
+ size 50489371
model/vocab.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e07ef2cc3bbaac41bb510936e8b958c808ffdfcf2e60b39bb7c9d330a6fe67f8
+ size 12980920
pages/review_predictor.py CHANGED
@@ -1,8 +1,17 @@
+ import json
  import pickle
 
+ import pandas as pd
  import streamlit as st
+ import torch
+ import torch.nn as nn
+ import transformers
 
- from preprocessing import data_preprocessing
+ from model.funcs import (create_model_and_tokenizer, load_model,
+                          predict_sentiment)
+ from model.model import LSTMConcatAttentionEmbed
+ from preprocessing.preprocessing import data_preprocessing
+ from preprocessing.rnn_preprocessing import preprocess_single_string
 
  # Load preprocessing steps
  with open("vectorizer.pkl", "rb") as f:
@@ -12,29 +21,63 @@ with open("vectorizer.pkl", "rb") as f:
  with open("logreg_model.pkl", "rb") as f:
      logreg_predictor = pickle.load(f)
 
+ model_concat_embed = LSTMConcatAttentionEmbed()
+ model_concat_embed.load_state_dict(torch.load("model/model_weights.pt"))
 
- # Define function for preprocessing input text
- @st.cache
- def preprocess_text(text):
+ with open("model/vocab.json", "r") as f:
+     vocab_to_int = json.load(f)
+
+ with open("model/int_vocab.json", "r") as f:
+     int_to_vocab = json.load(f)
+
+ model_class = transformers.AutoModel
+ tokenizer_class = transformers.AutoTokenizer
+ pretrained_weights = "cointegrated/rubert-tiny2"
+ weights_path = "model/best_bert_weights.pth"
+ model = load_model(model_class, pretrained_weights, weights_path)
+ tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+
+
+ def plot_and_predict(review: str, SEQ_LEN: int, model: nn.Module):
+     inp = preprocess_single_string(review, SEQ_LEN, vocab_to_int)
+     model.eval()
+     with torch.inference_mode():
+         pred, _ = model(inp.long().unsqueeze(0))
+     pred = pred.sigmoid().item()
+     return 1 if pred > 0.75 else 0
+
+
+ def preprocess_text_logreg(text):
      # Apply preprocessing steps (cleaning, tokenization, vectorization)
      clean_text = data_preprocessing(
          text
      )  # Assuming data_preprocessing is your preprocessing function
      print("Clean text ", clean_text)
-     vectorized_text = vectorizer.transform([" ".join(clean_text)])
+     vectorized_text = logreg_vectorizer.transform([" ".join(clean_text)])
      return vectorized_text
 
 
  # Define function for making predictions
- @st.cache
- def predict_sentiment(text):
+ def predict_sentiment_logreg(text):
      # Preprocess input text
-     processed_text = preprocess_text(text)
+     processed_text = preprocess_text_logreg(text)
      # Make prediction
      prediction = logreg_predictor.predict(processed_text)
      return prediction
 
 
+ metrics = {
+     "Models": ["Logistic Regression", "LSTM + attention", "ruBERTtiny2"],
+     "f1-macro score": [0.94376, 1, 0.94070],
+ }
+
+
+ col1, col2 = st.columns([1, 3])
+ df = pd.DataFrame(metrics)
+ df.set_index("Models", inplace=True)
+ df.index.name = "Model"
+
+
  st.sidebar.title("Model Selection")
  model_type = st.sidebar.radio("Select Model Type", ["Classic ML", "LSTM", "BERT"])
  st.title("Review Prediction")
@@ -44,11 +87,14 @@ st.title("Sentiment Analysis with Logistic Regression")
  text_input = st.text_input("Enter your review:")
  if st.button("Predict"):
      if model_type == "Classic ML":
-         prediction = predict_sentiment(text_input)
+         prediction = predict_sentiment_logreg(text_input)
      elif model_type == "LSTM":
-         prediction = 1
+         prediction = plot_and_predict(
+             review=text_input, SEQ_LEN=25, model=model_concat_embed
+         )
      elif model_type == "BERT":
-         prediction = 1
+         prediction = predict_sentiment(text_input, model, tokenizer, "cpu")
+         st.write(prediction)
 
      if prediction == 1:
          st.write("prediction")
@@ -56,3 +102,5 @@ if st.button("Predict"):
      elif prediction == 0:
          st.write("prediction")
          st.write("Отзыв отрицательный")
+
+ st.write(df)
pages/text_generator.py CHANGED
@@ -2,12 +2,17 @@ import streamlit as st
  import torch
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
 
- model_path = "finetuned_model/"
- model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
- tokenizer = GPT2Tokenizer.from_pretrained(model_name)
- model = GPT2LMHeadModel.from_pretrained(model_path)
 
+ @st.cache_data
+ def load_model():
+     model_path = "17/"
+     model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
+     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+     model = GPT2LMHeadModel.from_pretrained(model_path)
+     return tokenizer, model
 
+
+ tokenizer, model = load_model()
  promt = st.text_input("Ask a question")
  generate = st.button("Generate")
  if generate:
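
The hunk ends at the button handler; whatever runs under `if generate:` is not touched by this commit (no further hunks are shown). Purely for illustration, a continuation with this tokenizer/model pair could look like the following sketch (the sampling parameters are hypothetical, not taken from the file):

    # Hypothetical sketch: encode the prompt, sample a continuation, show it
    input_ids = tokenizer.encode(promt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=100,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
    st.write(tokenizer.decode(output_ids[0], skip_special_tokens=True))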
preprocessing/__init__.py ADDED
File without changes
preprocessing/preprocessing.py ADDED
@@ -0,0 +1,30 @@
+ import re
+ import string
+
+ import nltk
+ import pymorphy2
+ from nltk.tokenize import word_tokenize
+
+ nltk.download("punkt")
+
+
+ def clean_text(text: str) -> str:
+     text = text.lower()
+     text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
+     text = re.sub(r"\d+\w*", "", text)
+     text = re.sub(r"\[.*?\]", "", text)
+     text = text.translate(str.maketrans("", "", string.punctuation))
+     return text
+
+
+ def lemmize_and_tokenize_text(text: str) -> list[str]:
+     morph = pymorphy2.MorphAnalyzer()
+     tokens = word_tokenize(text)
+     lemmas = [morph.parse(token)[0].normal_form for token in tokens]
+     return lemmas
+
+
+ def data_preprocessing(text: str) -> list[str]:
+     cleaned_text = clean_text(text)
+     lemmized_text = lemmize_and_tokenize_text(cleaned_text)
+     return lemmized_text
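
These helpers feed the classic ML path in pages/review_predictor.py: the lemma list is joined back into a single string, vectorized with the pickled vectorizer, and passed to the logistic regression. A minimal sketch (vectorizer.pkl and logreg_model.pkl are the artifacts that page loads; they are not part of this commit, and the sample review is only an illustration):

    import pickle

    from preprocessing.preprocessing import data_preprocessing

    with open("vectorizer.pkl", "rb") as f:
        logreg_vectorizer = pickle.load(f)
    with open("logreg_model.pkl", "rb") as f:
        logreg_predictor = pickle.load(f)

    # Clean and lemmatize, then vectorize the joined lemmas exactly as the app does
    lemmas = data_preprocessing("Товар пришёл быстро, качество отличное")
    features = logreg_vectorizer.transform([" ".join(lemmas)])
    prediction = logreg_predictor.predict(features)  # 1 = positive, 0 = negative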
preprocessing/rnn_preprocessing.py ADDED
@@ -0,0 +1,81 @@
+ import re
+ import string
+ import numpy as np
+ import torch
+ import nltk
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('english'))
+
+ def data_preprocessing(text: str) -> str:
+     """Preprocess a string: lowercase, remove html tags, punctuation,
+     stopwords and digits
+
+     Args:
+         text (str): input string for preprocessing
+
+     Returns:
+         str: preprocessed string
+     """
+
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # Remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+ def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+ def padding(review_int: list, seq_len: int) -> np.ndarray:
+     """Make left-sided padding for input list of tokens
+
+     Args:
+         review_int (list): input list of tokens
+         seq_len (int): max sequence length; if len(review_int[i]) > seq_len it will be trimmed, else it will be padded with zeros
+
+     Returns:
+         np.ndarray: padded sequences
+     """
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+
+     return features
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False
+ ) -> torch.Tensor:
+     """Run all preprocessing steps on a single string
+
+     Args:
+         input_string (str): input string for preprocessing
+         seq_len (int): max sequence length; if the encoded string is longer it will be trimmed, else it will be padded with zeros
+         vocab_to_int (dict): word corpus {'word': int index}
+
+     Returns:
+         torch.Tensor: padded sequence of token indices
+     """
+
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+             pass
+     result_padded = padding([result_list], seq_len)[0]
+
+     return torch.tensor(result_padded)
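
A small worked example of the left-sided padding above: with seq_len=5, a 3-token review [7, 42, 3] becomes [0, 0, 7, 42, 3], while a 7-token review is trimmed to its first 5 tokens.

    from preprocessing.rnn_preprocessing import padding

    padded = padding([[7, 42, 3], [1, 2, 3, 4, 5, 6, 7]], seq_len=5)
    # padded[0] -> [ 0  0  7 42  3]  (zeros prepended on the left)
    # padded[1] -> [ 1  2  3  4  5]  (trimmed to the first seq_len tokens)
    print(padded)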