Tymec committed

Commit 2c1f9dd
1 Parent(s): e50b20c

Tokenization rework

Files changed (4):
  1. app/cli.py +97 -26
  2. app/data.py +115 -6
  3. app/gui.py +4 -1
  4. app/model.py +62 -87
app/cli.py CHANGED
@@ -55,6 +55,8 @@ def predict(model_path: Path, text: list[str]) -> None:
 
     import joblib
 
+    from app.model import infer_model
+
     text = " ".join(text).strip()
     if not sys.stdin.isatty():
         piped_text = sys.stdin.read().strip()
@@ -69,7 +71,8 @@ def predict(model_path: Path, text: list[str]) -> None:
     click.echo(DONE_STR)
 
     click.echo("Performing sentiment analysis... ", nl=False)
-    prediction = model.predict([text])[0]
+    prediction = infer_model(model, [text])[0]
+    # prediction = model.predict([text])[0]
     if prediction == 0:
         sentiment = click.style("NEGATIVE", fg="red")
     elif prediction == 1:
@@ -82,9 +85,9 @@ def predict(model_path: Path, text: list[str]) -> None:
 @cli.command()
 @click.option(
     "--dataset",
-    required=True,
-    help="Dataset to train the model on",
-    type=click.Choice(["sentiment140", "amazonreviews", "imdb50k"]),
+    default="test",
+    help="Dataset to evaluate the model on",
+    type=click.Choice(["test", "sentiment140", "amazonreviews", "imdb50k"]),
 )
 @click.option(
     "--model",
@@ -100,27 +103,65 @@ def predict(model_path: Path, text: list[str]) -> None:
     show_default=True,
     type=click.IntRange(1, 50),
 )
+@click.option(
+    "--batch-size",
+    default=512,
+    help="Size of the batches used in tokenization",
+    show_default=True,
+)
+@click.option(
+    "--processes",
+    default=8,
+    help="Number of parallel jobs during tokenization",
+    show_default=True,
+)
+@click.option(
+    "--verbose",
+    is_flag=True,
+    help="Show verbose output",
+)
 def evaluate(
-    dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
+    dataset: Literal["test", "sentiment140", "amazonreviews", "imdb50k"],
     model_path: Path,
     cv: int,
+    batch_size: int,
+    processes: int,
+    verbose: bool,
 ) -> None:
-    """Evaluate the model on the test dataset"""
+    """Evaluate the model on the specified dataset"""
     import joblib
 
-    from app.data import load_data
+    from app.constants import CACHE_DIR
+    from app.data import load_data, tokenize
     from app.model import evaluate_model
 
-    click.echo("Loading dataset... ", nl=False)
-    text_data, label_data = load_data(dataset)
-    click.echo(DONE_STR)
+    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    use_cached_data = False
+    if cached_data_path.exists():
+        use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
+
+    if use_cached_data:
+        click.echo("Loading cached data... ", nl=False)
+        token_data, label_data = joblib.load(cached_data_path)
+        click.echo(DONE_STR)
+    else:
+        click.echo("Loading dataset... ", nl=False)
+        text_data, label_data = load_data(dataset)
+        click.echo(DONE_STR)
+
+        click.echo("Tokenizing data... ", nl=False)
+        token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
+        joblib.dump((token_data, label_data), cached_data_path, compress=3)
+        click.echo(DONE_STR)
+
+        del text_data
 
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
 
     click.echo("Evaluating model... ", nl=False)
-    acc_mean, acc_std = evaluate_model(model, text_data, label_data, folds=cv)
+    acc_mean, acc_std = evaluate_model(model, token_data, label_data, folds=cv, verbose=verbose)
     click.secho(f"{acc_mean:.2%} ± {acc_std:.2%}", fg="blue")
 
 
@@ -145,6 +186,18 @@ def evaluate(
     show_default=True,
     type=click.IntRange(1, 50),
 )
+@click.option(
+    "--batch-size",
+    default=512,
+    help="Size of the batches used in tokenization",
+    show_default=True,
+)
+@click.option(
+    "--processes",
+    default=8,
+    help="Number of parallel jobs during tokenization",
+    show_default=True,
+)
 @click.option(
     "--seed",
     default=42,
@@ -157,45 +210,63 @@ def evaluate(
     is_flag=True,
     help="Overwrite the model file if it already exists",
 )
+@click.option(
+    "--verbose",
+    is_flag=True,
+    help="Show verbose output",
+)
 def train(
     dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
     max_features: int,
     cv: int,
+    batch_size: int,
+    processes: int,
     seed: int,
     force: bool,
+    verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
     import joblib
 
-    from app.constants import MODELS_DIR
-    from app.data import load_data
-    from app.model import create_model, evaluate_model, train_model
+    from app.constants import CACHE_DIR, MODELS_DIR
+    from app.data import load_data, tokenize
+    from app.model import create_model, train_model
 
     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
     if model_path.exists() and not force:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    click.echo("Loading dataset... ", nl=False)
-    text_data, label_data = load_data(dataset)
-    click.echo(DONE_STR)
+    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    use_cached_data = False
+    if cached_data_path.exists():
+        use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
 
-    click.echo("Creating model... ", nl=False)
-    model = create_model(max_features, seed=None if seed == -1 else seed, verbose=True)
-    click.echo(DONE_STR)
+    if use_cached_data:
+        click.echo("Loading cached data... ", nl=False)
+        token_data, label_data = joblib.load(cached_data_path)
+        click.echo(DONE_STR)
+    else:
+        click.echo("Loading dataset... ", nl=False)
+        text_data, label_data = load_data(dataset)
+        click.echo(DONE_STR)
+
+        click.echo("Tokenizing data... ", nl=False)
+        token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
+        joblib.dump((token_data, label_data), cached_data_path, compress=3)
+        click.echo(DONE_STR)
+
+        del text_data
 
     click.echo("Training model... ")
-    accuracy = train_model(model, text_data, label_data)
+    model = create_model(max_features, seed=None if seed == -1 else seed, verbose=verbose)
+    trained_model, accuracy = train_model(model, token_data, label_data, folds=cv, seed=seed, verbose=verbose)
     click.echo("Model accuracy: ", nl=False)
     click.secho(f"{accuracy:.2%}", fg="blue")
 
     click.echo("Model saved to: ", nl=False)
-    joblib.dump(model, model_path)
+    joblib.dump(trained_model, model_path, compress=3)
     click.secho(str(model_path), fg="blue")
 
-    click.echo("Evaluating model... ", nl=False)
-    acc_mean, acc_std = evaluate_model(model, text_data, label_data, folds=cv)
-    click.secho(f"{acc_mean:.2%} ± {acc_std:.2%}", fg="blue")
-
 
 def cli_wrapper() -> None:
     cli(max_content_width=120)
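
Note: both train and evaluate now tokenize up front and cache the result under CACHE_DIR, so repeated runs on the same dataset can skip spaCy. A minimal sketch of that caching pattern outside the CLI, assuming the load_data/tokenize API shown above (the dataset name and parameter values are illustrative):

    import joblib

    from app.constants import CACHE_DIR
    from app.data import load_data, tokenize

    dataset = "imdb50k"  # illustrative choice
    cached = CACHE_DIR / f"{dataset}_tokenized.pkl"

    if cached.exists():
        # Reuse the tokenized corpus from a previous run
        token_data, label_data = joblib.load(cached)
    else:
        text_data, label_data = load_data(dataset)
        token_data = tokenize(text_data, batch_size=512, n_jobs=8, show_progress=True)
        joblib.dump((token_data, label_data), cached, compress=3)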
app/data.py CHANGED
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 import bz2
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
 import pandas as pd
+import spacy
+from tqdm import tqdm
 
 from app.constants import (
     AMAZONREVIEWS_PATH,
@@ -12,9 +14,76 @@ from app.constants import (
     IMDB50K_URL,
     SENTIMENT140_PATH,
     SENTIMENT140_URL,
+    TEST_DATASET_PATH,
+    TEST_DATASET_URL,
 )
 
-__all__ = ["load_data"]
+if TYPE_CHECKING:
+    from spacy.tokens import Doc
+
+__all__ = ["load_data", "tokenize"]
+
+
+try:
+    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
+except OSError:
+    print("Downloading spaCy model...")
+
+    from spacy.cli import download as spacy_download
+
+    spacy_download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
+
+
+def _lemmatize(doc: Doc, threshold: int = 2) -> list[str]:
+    """Lemmatize the provided text using spaCy.
+
+    Args:
+        doc: spaCy document
+        threshold: Minimum character length of tokens
+
+    Returns:
+        Lemmatized text
+    """
+    return [
+        token.lemma_.lower().strip()
+        for token in doc
+        if not token.is_stop
+        and not token.is_punct
+        and not token.like_email
+        and not token.like_url
+        and not token.like_num
+        and not (len(token.lemma_) < threshold)
+    ]
+
+
+def tokenize(
+    text_data: list[str],
+    batch_size: int = 512,
+    n_jobs: int = 4,
+    character_threshold: int = 2,
+    show_progress: bool = True,
+) -> list[list[str]]:
+    """Tokenize the provided text using spaCy.
+
+    Args:
+        text_data: Text data to tokenize
+        batch_size: Batch size for tokenization
+        n_jobs: Number of parallel jobs
+        character_threshold: Minimum character length of tokens
+        show_progress: Whether to show a progress bar
+
+    Returns:
+        Tokenized text data
+    """
+    return [
+        _lemmatize(doc, character_threshold)
+        for doc in tqdm(
+            nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
+            total=len(text_data),
+            disable=not show_progress,
+        )
+    ]
 
 
 def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
@@ -104,9 +173,6 @@ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
     # Split the data into labels and text
     labels, texts = zip(*(line.split(" ", 1) for line in dataset))  # NOTE: Occasionally OOM
 
-    # Free up memory
-    del dataset
-
     # Map sentiment values
     sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
 
@@ -147,7 +213,48 @@ def load_imdb50k() -> tuple[list[str], list[int]]:
     return data["review"].tolist(), data["sentiment"].tolist()
 
 
-def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k"]) -> tuple[list[str], list[int]]:
+def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
+    """Load the test dataset and make it suitable for use.
+
+    Args:
+        include_neutral: Whether to include neutral sentiment
+
+    Returns:
+        Text and label data
+
+    Raises:
+        FileNotFoundError: If the dataset is not found
+    """
+    # Check if the dataset exists
+    if not TEST_DATASET_PATH.exists():
+        msg = (
+            f"Test dataset not found at: '{TEST_DATASET_PATH}'\n"
+            "Please download the dataset from:\n"
+            f"{TEST_DATASET_URL}"
+        )
+        raise FileNotFoundError(msg)
+
+    # Load the dataset
+    data = pd.read_csv(TEST_DATASET_PATH)
+
+    # Ignore rows with neutral sentiment
+    if not include_neutral:
+        data = data[data["label"] != 1]
+
+    # Map sentiment values
+    data["label"] = data["label"].map(
+        {
+            0: 0,  # Negative
+            1: 1,  # Neutral
+            2: 2,  # Positive
+        },
+    )
+
+    # Return as lists
+    return data["text"].tolist(), data["label"].tolist()
+
+
+def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test"]) -> tuple[list[str], list[int]]:
     """Load and preprocess the specified dataset.
 
     Args:
@@ -166,6 +273,8 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k"]) -> tuple[list[str], list[int]]:
             return load_amazonreviews(merge=True)
         case "imdb50k":
            return load_imdb50k()
+        case "test":
+            return load_test(include_neutral=False)
         case _:
             msg = f"Unknown dataset: {dataset}"
             raise ValueError(msg)
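
For reference, the new tokenize helper returns one token list per input document. A minimal usage sketch, assuming the API above (the example strings and the commented output are illustrative):

    from app.data import tokenize

    docs = ["I loved this movie!", "Terrible service, would not recommend."]
    tokens = tokenize(docs, batch_size=2, n_jobs=1, show_progress=False)
    # e.g. [["love", "movie"], ["terrible", "service", "recommend"]]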
app/gui.py CHANGED
@@ -7,6 +7,8 @@ from typing import TYPE_CHECKING
 import gradio as gr
 import joblib
 
+from app.model import infer_model
+
 if TYPE_CHECKING:
     from sklearn.base import BaseEstimator
 
@@ -31,7 +33,7 @@ def load_model() -> BaseEstimator:
 def sentiment_analysis(text: str) -> str:
     """Perform sentiment analysis on the provided text."""
     model = load_model()
-    prediction = model.predict([text])[0]
+    prediction = infer_model(model, [text])[0]
 
     if prediction == 0:
         return NEGATIVE_LABEL
@@ -52,6 +54,7 @@ demo = gr.Interface(
         ["The movie we watched was boring."],
         ["This website is amazing!"],
     ],
+    allow_flagging=False,
 )
 
 
app/model.py CHANGED
@@ -1,85 +1,25 @@
 from __future__ import annotations
 
-import warnings
+import os
+from typing import TYPE_CHECKING
 
 import numpy as np
-import spacy
 from joblib import Memory
-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
-from tqdm import tqdm
 
 from app.constants import CACHE_DIR
+from app.data import tokenize
 
-__all__ = ["create_model", "train_model", "evaluate_model"]
-
-try:
-    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
-except OSError:
-    print("Downloading spaCy model...")
-
-    from spacy.cli import download as spacy_download
-
-    spacy_download("en_core_web_sm")
-    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "ner"])
-
-
-class TextTokenizer(BaseEstimator, TransformerMixin):
-    def __init__(
-        self,
-        *,
-        character_threshold: int = 2,
-        batch_size: int = 1024,
-        n_jobs: int = 8,
-        progress: bool = True,
-    ) -> None:
-        self.character_threshold = character_threshold
-        self.batch_size = batch_size
-        self.n_jobs = n_jobs
-        self.progress = progress
-
-    def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextTokenizer:
-        return self
-
-    def transform(self, data: list[str]) -> list[list[str]]:
-        tokenized = []
-        for doc in tqdm(
-            nlp.pipe(data, batch_size=self.batch_size, n_process=self.n_jobs),
-            total=len(data),
-            disable=not self.progress,
-        ):
-            tokens = []
-            for token in doc:
-                # Ignore stop words and punctuation
-                if token.is_stop or token.is_punct:
-                    continue
-                # Ignore emails, URLs and numbers
-                if token.like_email or token.like_email or token.like_num:
-                    continue
-
-                # Lemmatize and lowercase
-                tok = token.lemma_.lower().strip()
-
-                # Format hashtags
-                if tok.startswith("#"):
-                    tok = tok[1:]
-
-                # Ignore short and non-alphanumeric tokens
-                if len(tok) < self.character_threshold or not tok.isalnum():
-                    continue
-
-                # TODO: Emoticons and emojis
-                # TODO: Spelling correction
-
-                tokens.append(tok)
-            tokenized.append(tokens)
-        return tokenized
-
-
-def identity(x: list[str]) -> list[str]:
+if TYPE_CHECKING:
+    from sklearn.base import BaseEstimator
+
+__all__ = ["create_model", "train_model", "evaluate_model", "infer_model"]
+
+
+def _identity(x: list[str]) -> list[str]:
     """Identity function for use in TfidfVectorizer.
 
     Args:
@@ -101,22 +41,21 @@ def create_model(
     Args:
         max_features: Maximum number of features
         seed: Random seed (None for random seed)
-        verbose: Whether to log progress during training
+        verbose: Whether to output additional information
 
     Returns:
         Untrained model
     """
     return Pipeline(
         [
-            ("tokenizer", TextTokenizer(progress=True)),
             (
                 "vectorizer",
                 TfidfVectorizer(
                     max_features=max_features,
                     ngram_range=(1, 2),
                     # disable text processing
-                    tokenizer=identity,
-                    preprocessor=identity,
+                    tokenizer=_identity,
+                    preprocessor=_identity,
                     lowercase=False,
                     token_pattern=None,
                 ),
@@ -130,23 +69,27 @@ def create_model(
 
 def train_model(
     model: BaseEstimator,
-    text_data: list[str],
+    token_data: list[str],
     label_data: list[int],
+    folds: int = 5,
     seed: int = 42,
+    verbose: bool = False,
 ) -> tuple[BaseEstimator, float]:
     """Train the sentiment analysis model.
 
     Args:
         model: Untrained model
-        text_data: Text data
+        token_data: Tokenized text data
         label_data: Label data
+        folds: Number of cross-validation folds
         seed: Random seed (None for random seed)
+        verbose: Whether to output additional information
 
     Returns:
         Trained model and accuracy
     """
     text_train, text_test, label_train, label_test = train_test_split(
-        text_data,
+        token_data,
         label_data,
         test_size=0.2,
         random_state=seed,
@@ -154,50 +97,82 @@ def train_model(
 
     param_distributions = {
         "classifier__C": np.logspace(-4, 4, 20),
-        "classifier__penalty": ["l1", "l2"],
+        "classifier__solver": ["liblinear", "saga"],
     }
 
     search = RandomizedSearchCV(
         model,
         param_distributions,
         n_iter=10,
-        cv=5,
+        cv=folds,
         scoring="accuracy",
         random_state=seed,
         n_jobs=-1,
+        verbose=verbose,
     )
 
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        # model.fit(text_train, label_train)
-        search.fit(text_train, label_train)
+    os.environ["PYTHONWARNINGS"] = "ignore"
+    search.fit(text_train, label_train)
+    del os.environ["PYTHONWARNINGS"]
 
     best_model = search.best_estimator_
     return best_model, best_model.score(text_test, label_test)
 
 
 def evaluate_model(
-    model: Pipeline,
-    text_data: list[str],
+    model: BaseEstimator,
+    token_data: list[str],
     label_data: list[int],
     folds: int = 5,
+    verbose: bool = False,
 ) -> tuple[float, float]:
     """Evaluate the model using cross-validation.
 
     Args:
         model: Trained model
-        text_data: Text data
+        token_data: Tokenized text data
         label_data: Label data
         folds: Number of cross-validation folds
+        verbose: Whether to output additional information
 
     Returns:
         Mean accuracy and standard deviation
     """
+    os.environ["PYTHONWARNINGS"] = "ignore"
     scores = cross_val_score(
         model,
-        text_data,
+        token_data,
         label_data,
         cv=folds,
         scoring="accuracy",
+        n_jobs=-1,
+        verbose=verbose,
     )
+    del os.environ["PYTHONWARNINGS"]
     return scores.mean(), scores.std()
+
+
+def infer_model(
+    model: BaseEstimator,
+    text_data: list[str],
+    batch_size: int = 32,
+    n_jobs: int = 4,
+) -> list[int]:
+    """Predict the sentiment of the provided text documents.
+
+    Args:
+        model: Trained model
+        text_data: Text data
+        batch_size: Batch size for tokenization
+        n_jobs: Number of parallel jobs
+
+    Returns:
+        Predicted sentiments
+    """
+    tokens = tokenize(
+        text_data,
+        batch_size=batch_size,
+        n_jobs=n_jobs,
+        show_progress=False,
+    )
+    return model.predict(tokens)
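
Taken together, the reworked API expects pre-tokenized input for train_model and evaluate_model, while infer_model tokenizes raw text itself. A hedged end-to-end sketch, assuming the functions shown in this commit (dataset name and parameter values are illustrative):

    from app.data import load_data, tokenize
    from app.model import create_model, evaluate_model, infer_model, train_model

    text_data, label_data = load_data("imdb50k")
    token_data = tokenize(text_data, batch_size=512, n_jobs=4)

    model = create_model(20000, seed=42, verbose=False)
    trained_model, accuracy = train_model(model, token_data, label_data, folds=5, seed=42)

    acc_mean, acc_std = evaluate_model(trained_model, token_data, label_data, folds=5)
    prediction = infer_model(trained_model, ["A surprisingly good film."])[0]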