Tymec committed on
Commit
85ac990
1 Parent(s): 667fe9d

Completely change the structure of the project

.vscode/settings.json CHANGED
@@ -23,5 +23,6 @@
    "**/__pycache__": true,
    "**/.ruff_cache": true,
    "**/.venv": true,
+   "**/.cache": true,
  }
}
README.md CHANGED
@@ -7,6 +7,10 @@ Sentiment Analysis
  3. Run `just install` to install the dependencies
  4. Run `just run --help` to see the available commands

+ ### Datasets
+ - [Sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140)
+ - [IMDb](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
+ - [Amazon Reviews](https://www.kaggle.com/datasets/bittlingmayer/amazonreviews)

  ### TODO
  - [ ] CLI using `click` (commands: predict, train, evaluate) with settings set via flags or environment variables
app/__main__.py ADDED
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from app.cli import cli_wrapper as cli
+
+ if __name__ == "__main__":
+     cli()
app/cli.py ADDED
@@ -0,0 +1,144 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Literal
+
+ import click
+
+ __all__ = ["cli_wrapper"]
+
+ ERROR_STR = click.style("ERROR", fg="red")
+ DONE_STR = click.style("DONE", fg="green")
+ POSITIVE_STR = click.style("POSITIVE", fg="green")
+ NEUTRAL_STR = click.style("NEUTRAL", fg="yellow")
+ NEGATIVE_STR = click.style("NEGATIVE", fg="red")
+
+
+ @click.group()
+ def cli() -> None: ...
+
+
+ @cli.command()
+ @click.option(
+     "--model",
+     "model_path",
+     required=True,
+     help="Path to the trained model",
+     type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
+ )
+ @click.option(
+     "--share/--no-share",
+     default=False,
+     help="Whether to create a shareable link",
+ )
+ def gui(model_path: Path, share: bool) -> None:
+     """Launch the Gradio GUI"""
+     from app.gui import launch_gui
+
+     launch_gui(model_path, share)
+
+
+ @cli.command()
+ @click.option(
+     "--model",
+     "model_path",
+     required=True,
+     help="Path to the trained model",
+     type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
+ )
+ @click.argument("text", nargs=-1)
+ def predict(model_path: Path, text: list[str]) -> None:
+     """Perform sentiment analysis on the provided text.
+
+     Note: Piped input takes precedence over the text argument
+     """
+     import sys
+
+     import joblib
+
+     text = " ".join(text).strip()
+     if not sys.stdin.isatty():
+         piped_text = sys.stdin.read().strip()
+         text = piped_text or text
+
+     if not text:
+         click.echo(f"{ERROR_STR}: No text provided")
+         return
+
+     click.echo("Loading model... ", nl=False)
+     model = joblib.load(model_path)
+     click.echo(DONE_STR)
+
+     click.echo("Performing sentiment analysis... ", nl=False)
+     prediction = model.predict([text])[0]
+     if prediction == 0:
+         sentiment = NEGATIVE_STR
+     elif prediction == 1:
+         sentiment = POSITIVE_STR
+     else:
+         sentiment = NEUTRAL_STR
+     click.echo(sentiment)
+
+
+ @cli.command()
+ @click.option(
+     "--dataset",
+     required=True,
+     help="Dataset to train the model on",
+     type=click.Choice(["sentiment140", "amazonreviews", "imdb50k"]),
+ )
+ @click.option(
+     "--max-features",
+     default=20000,
+     help="Maximum number of features",
+     show_default=True,
+     type=click.IntRange(1, None),
+ )
+ @click.option(
+     "--seed",
+     default=42,
+     help="Random seed (-1 for random seed)",
+     show_default=True,
+     type=click.IntRange(-1, None),
+ )
+ def train(
+     dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
+     max_features: int,
+     seed: int,
+ ) -> None:
+     """Train the model on the provided dataset"""
+     import joblib
+
+     from app.constants import MODELS_DIR
+     from app.model import create_model, load_data, train_model
+
+     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
+     if model_path.exists():
+         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
+
+     click.echo("Preprocessing dataset... ", nl=False)
+     text_data, label_data = load_data(dataset)
+     click.echo(DONE_STR)
+
+     click.echo("Creating model... ", nl=False)
+     model = create_model(max_features, seed=None if seed == -1 else seed)
+     click.echo(DONE_STR)
+
+     click.echo("Training model... ", nl=False)
+     accuracy = train_model(model, text_data, label_data)
+     joblib.dump(model, model_path)
+     click.echo(DONE_STR)
+
+     click.echo("Model accuracy: ")
+     click.secho(f"{accuracy:.2%}", fg="blue")
+
+     # TODO: Add hyperparameter options
+     # TODO: Random/grid search for finding best classifier and hyperparameters
+
+
+ def cli_wrapper() -> None:
+     cli(max_content_width=120)
+
+
+ if __name__ == "__main__":
+     cli_wrapper()
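
A minimal sketch of exercising the new CLI in-process with click's test runner. The model filename below is only an assumption based on the naming scheme the `train` command uses; the runner feeds the review through stdin, which `predict` prefers over the positional argument.

```python
from click.testing import CliRunner

from app.cli import cli

runner = CliRunner()

# Hypothetical model path, e.g. produced by: just app train --dataset imdb50k --max-features 20000
result = runner.invoke(
    cli,
    ["predict", "--model", "models/imdb50k_tfidf_ft-20000.pkl"],
    input="I absolutely loved this movie",
)
print(result.output)  # expected to end with POSITIVE
```
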
app/constants.py CHANGED
@@ -1,16 +1,32 @@
+ from __future__ import annotations
+
+ import os
  from pathlib import Path

- DEFAULT_SEED: int = 42
- MAX_TOKENIZER_FEATURES: int = 500000
- CLF_MAX_ITER: int = 1000
-
- DATASET_PATH: Path = Path("data/training.1600000.processed.noemoticon.csv")
- STOPWORDS_PATH: Path = Path("data/stopwords-en.txt")
- MODELS_DIR: Path = Path("models")
- CACHE_DIR: Path = Path("cache")
- CHECKPOINT_PATH: Path = CACHE_DIR / "pipeline.pkl"
-
- # Create directories if they don't exist
- MODELS_DIR.mkdir(parents=True, exist_ok=True)
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
+ CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
+ DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
+ MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
+
+ SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
+ SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
+
+ AMAZONREVIEWS_PATH = (DATA_DIR / "amazonreviews.test.txt.bz2", DATA_DIR / "amazonreviews.train.txt.bz2")
+ AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
+
+ IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
+ IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
+
+ URL_REGEX = r"(https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(\.[a-zA-Z]{2,})(\.[a-zA-Z]{2,})?\/[a-zA-Z0-9]{2,}|((https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(\.[a-zA-Z]{2,})(\.[a-zA-Z]{2,})?)|(https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}(\.[a-zA-Z0-9]{2,})?"  # https://www.freecodecamp.org/news/how-to-write-a-regular-expression-for-a-url/
+ EMOTICON_MAP = {
+     "SMILE": [":)", ":-)", ": )", ":D", ":-D", ": D", ";)", ";-)", "; )", ":>", ":->", ": >", ":]", ":-]", ": ]"],
+     "LOVE": ["<3", ":*", ":-*", ": *"],
+     "WINK": [";)", ";-)", "; )", ";>", ";->", "; >"],
+     "FROWN": [":(", ":-(", ": (", ":[", ":-[", ": ["],
+     "CRY": [":'(", ": (", ":' (", ":'[", ":' ["],
+     "SURPRISE": [":O", ":-O", ": O", ":0", ":-0", ": 0", ":o", ":-o", ": o"],
+     "ANGRY": [">:(", ">:-(", "> :(", ">:["],
+ }
+
+ CACHE_DIR.mkdir(exist_ok=True, parents=True)
+ DATA_DIR.mkdir(exist_ok=True, parents=True)
+ MODELS_DIR.mkdir(exist_ok=True, parents=True)
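
As a quick illustration (not part of the commit), the new path constants read their environment overrides at import time, so any override has to be set before `app.constants` is first imported. The directory below is an arbitrary example.

```python
import os

# Hypothetical override: keep datasets outside the repo checkout.
os.environ["DATA_DIR"] = "/tmp/sentiment-data"

from app.constants import DATA_DIR, IMDB50K_PATH  # noqa: E402

print(DATA_DIR)      # /tmp/sentiment-data
print(IMDB50K_PATH)  # /tmp/sentiment-data/imdb50k.csv
```
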
app/gui.py CHANGED
@@ -1,92 +1,58 @@
  from __future__ import annotations

- from pathlib import Path
+ import os
+ from functools import lru_cache
+ from typing import TYPE_CHECKING

  import gradio as gr
+ import joblib

- from constants import MODELS_DIR
- from model import predict, tokenize
+ if TYPE_CHECKING:
+     from sklearn.pipeline import Pipeline

- CSS_PATH = Path("style.css")
- TOKENIZER_EXT = ".tokenizer.pkl"
- MODEL_EXT = ".model.pkl"
- POSITIVE_LABEL = "Positive 😊"
- NEGATIVE_LABEL = "Negative 😤"
- REFRESH_SYMBOL = "🔄"
-
-
- def load_style() -> str:
-     if not CSS_PATH.is_file():
-         return ""
-
-     with Path.open(CSS_PATH) as f:
-         return f.read()
-
-
- def predict_wrapper(text: str, tokenizer: str, model: str) -> str:
-     toks = tokenize(text, MODELS_DIR / f"{tokenizer}{TOKENIZER_EXT}")
-     pred = predict(toks, MODELS_DIR / f"{model}{MODEL_EXT}")
-     return POSITIVE_LABEL if pred else NEGATIVE_LABEL
-
-
- def train_wrapper() -> None:
-     msg = "Training is not supported in the GUI."
-     raise NotImplementedError(msg)
-
-
- def evaluate_wrapper() -> None:
-     msg = "Evaluation is not supported in the GUI."
-     raise NotImplementedError(msg)
-
-
- with gr.Blocks(css=load_style()) as demo:
-     gr.Markdown("## Sentiment Analysis")
-
-     with gr.Row(equal_height=True):
-         textbox = gr.Textbox(
-             lines=10,
-             label="Enter text to analyze",
-             placeholder="Enter text here",
-             key="input-textbox",
-         )
-
-         with gr.Column():
-             output = gr.Label()
-
-     with gr.Row(elem_classes="justify-between"):
-         clear_btn = gr.ClearButton([textbox, output], value="Clear 🧹")
-         analyze_btn = gr.Button(
-             "Analyze 🔍",
-             variant="primary",
-             interactive=False,
-         )
-
-     with gr.Row():
-         tokenizer_selector = gr.Dropdown(
-             choices=[tkn.stem[: -len(".tokenizer")] for tkn in MODELS_DIR.glob(f"*{TOKENIZER_EXT}")],
-             label="Tokenizer",
-             key="tokenizer-selector",
-         )
-
-         model_selector = gr.Dropdown(
-             choices=[mdl.stem[: -len(".model")] for mdl in MODELS_DIR.glob(f"*{MODEL_EXT}")],
-             label="Model",
-             key="model-selector",
-         )
-
-     # TODO: Refresh button
-
-     # Event handlers
-     textbox.input(
-         fn=lambda text: gr.update(interactive=bool(text.strip())),
-         inputs=[textbox],
-         outputs=[analyze_btn],
-     )
-     analyze_btn.click(
-         fn=predict_wrapper,
-         inputs=[textbox, tokenizer_selector, model_selector],
-         outputs=[output],
-     )
-
- demo.queue()
- demo.launch()
+ __all__ = ["launch_gui"]
+
+ POSITIVE_LABEL = "Positive 😊"
+ NEUTRAL_LABEL = "Neutral 😐"
+ NEGATIVE_LABEL = "Negative 😤"
+
+
+ @lru_cache(maxsize=1)
+ def load_model() -> Pipeline:
+     """Load the trained model and cache it."""
+     model_path = os.environ.get("MODEL_PATH", None)
+     if model_path is None:
+         msg = "MODEL_PATH environment variable not set"
+         raise ValueError(msg)
+     return joblib.load(model_path)
+
+
+ def sentiment_analysis(text: str) -> str:
+     """Perform sentiment analysis on the provided text."""
+     model = load_model()
+     prediction = model.predict([text])[0]
+
+     if prediction == 0:
+         return NEGATIVE_LABEL
+     if prediction == 1:
+         return POSITIVE_LABEL
+     return NEUTRAL_LABEL
+
+
+ demo = gr.Interface(
+     fn=sentiment_analysis,
+     inputs="text",
+     outputs="label",
+     title="Sentiment Analysis",
+ )
+
+
+ def launch_gui(model_path: str, share: bool) -> None:
+     """Launch the Gradio GUI."""
+     os.environ["MODEL_PATH"] = model_path
+     demo.launch(share=share)
+
+
+ if __name__ == "__main__":
+     demo.launch()
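
A short usage sketch for the reworked GUI module: `launch_gui` passes the model path through the `MODEL_PATH` environment variable, and the `lru_cache`-wrapped `load_model` picks it up on the first prediction. The model filename is an assumption.

```python
from app.gui import launch_gui

# Hypothetical pickle produced by the train command; share=False keeps the
# Gradio app local instead of creating a public link.
launch_gui("models/imdb50k_tfidf_ft-20000.pkl", share=False)
```
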
app/model.py CHANGED
@@ -1,144 +1,304 @@
  from __future__ import annotations

+ import bz2
+ import re
  import warnings
- from functools import lru_cache
- from typing import TYPE_CHECKING, Sequence
+ from typing import Literal

- import joblib
+ import pandas as pd
+ from joblib import Memory
+ from nltk.stem import WordNetLemmatizer
+ from sklearn.base import BaseEstimator, TransformerMixin
  from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
  from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split
  from sklearn.pipeline import Pipeline

- from constants import CLF_MAX_ITER, MAX_TOKENIZER_FEATURES
- from utils import get_cache_memory, get_random_state
-
- if TYPE_CHECKING:
-     from pathlib import Path
-
-     from numpy import ndarray
-     from numpy.random import RandomState
-
-
- __all__ = ["predict", "tokenize"]
-
-
- @lru_cache(maxsize=1)
- def get_model(model_path: Path) -> Pipeline:
-     return joblib.load(model_path)
-
-
- @lru_cache(maxsize=1)
- def get_tokenizer(tokenizer_path: Path) -> Pipeline:
-     return joblib.load(tokenizer_path)
-
-
- def export_to_file(pipeline: Pipeline, path: Path) -> None:
-     joblib.dump(pipeline, path)
-
-
- def tokenize(text: str, tokenizer_path: Path) -> ndarray:
-     tokenizer = get_tokenizer(tokenizer_path)
-     return tokenizer.transform([text])[0]
-
-
- def predict(tokens: ndarray, model_path: Path) -> bool:
-     model = get_model(model_path)
-     prediction = model.predict([tokens])
-     return prediction[0] == 1
-
-
- def train_and_export(
-     steps: Sequence[tuple],
-     x: list[str],
-     y: list[int],
-     export_path: Path,
-     cache: joblib.Memory,
- ) -> Pipeline:
-     pipeline = Pipeline(steps, memory=cache)
-
-     with warnings.catch_warnings():
-         warnings.simplefilter("ignore")
-         pipeline.fit(x, y)
-
-     export_to_file(pipeline, export_path)
-     return pipeline
-
-
- def train_tokenizer_and_export(x: list[str], y: list[int], export_path: Path, cache: joblib.Memory) -> Pipeline:
-     return train_and_export(
-         [
-             (
-                 "vectorize",
-                 CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
-             ),
-             ("tfidf", TfidfTransformer()),
-         ],
-         x,
-         y,
-         export_path,
-         cache,
-     )
-
-
- def train_model_and_export(
-     x: ndarray,
-     y: list[int],
-     export_path: Path,
-     cache: joblib.Memory,
-     rs: RandomState,
- ) -> Pipeline:
-     return train_and_export(
-         [("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs))],
-         x,
-         y,
-         export_path,
-         cache,
-     )
-
-
- def train(x: list[str], y: list[int]) -> Pipeline:
-     cache = get_cache_memory()
-     rs = get_random_state()
-
-     tokenizer = train_tokenizer(x, y, cache)
-     x_tr = tokenizer.transform(x)
-
-     model = train_model(x_tr, y, cache, rs)
-
-     return Pipeline([("tokenizer", tokenizer), ("model", model)])
-
-
- def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
-     # TODO: In the future, allow for different tokenizers
-     pipeline = Pipeline(
-         [
-             (
-                 "vectorize",
-                 CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
-             ),
-             ("tfidf", TfidfTransformer()),
-         ],
-         memory=cache,
-     )
-
-     with warnings.catch_warnings():
-         warnings.simplefilter("ignore")  # Ignore joblib warnings
-         pipeline.fit(x, y)
-
-     return pipeline
-
-
- def train_model(x: list[str], y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
-     # TODO: In the future, allow for different classifiers
-     pipeline = Pipeline(
-         [
-             ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
-         ],
-         memory=cache,
-     )
-
-     with warnings.catch_warnings():
-         warnings.simplefilter("ignore")  # Ignore joblib warnings
-         pipeline.fit(x, y)
-
-     return pipeline
+ from app.constants import (
+     AMAZONREVIEWS_PATH,
+     AMAZONREVIEWS_URL,
+     CACHE_DIR,
+     EMOTICON_MAP,
+     IMDB50K_PATH,
+     IMDB50K_URL,
+     SENTIMENT140_PATH,
+     SENTIMENT140_URL,
+     URL_REGEX,
+ )
+
+ __all__ = ["load_data", "create_model", "train_model"]
+
+
+ class TextCleaner(BaseEstimator, TransformerMixin):
+     def __init__(
+         self,
+         *,
+         replace_url: bool = True,
+         replace_hashtag: bool = True,
+         replace_emoticon: bool = True,
+         replace_emoji: bool = True,
+         lowercase: bool = True,
+         character_threshold: int = 2,
+         remove_special_characters: bool = True,
+         remove_extra_spaces: bool = True,
+     ):
+         self.replace_url = replace_url
+         self.replace_hashtag = replace_hashtag
+         self.replace_emoticon = replace_emoticon
+         self.replace_emoji = replace_emoji
+         self.lowercase = lowercase
+         self.character_threshold = character_threshold
+         self.remove_special_characters = remove_special_characters
+         self.remove_extra_spaces = remove_extra_spaces
+
+     def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextCleaner:
+         return self
+
+     def transform(self, data: list[str], _labels: list[int] | None = None) -> list[str]:
+         # Replace URLs, hashtags, emoticons, and emojis
+         data = [re.sub(URL_REGEX, "URL", text) for text in data] if self.replace_url else data
+         data = [re.sub(r"#\w+", "HASHTAG", text) for text in data] if self.replace_hashtag else data
+
+         # Replace emoticons
+         if self.replace_emoticon:
+             for word, emoticons in EMOTICON_MAP.items():
+                 for emoticon in emoticons:
+                     data = [text.replace(emoticon, f"EMOTE_{word}") for text in data]
+
+         # Basic text cleaning
+         data = [text.lower() for text in data] if self.lowercase else data  # Lowercase
+         threshold_pattern = re.compile(rf"\b\w{{1,{self.character_threshold}}}\b")
+         data = (
+             [re.sub(threshold_pattern, "", text) for text in data] if self.character_threshold > 0 else data
+         )  # Remove short words
+         data = (
+             [re.sub(r"[^a-zA-Z0-9\s]", "", text) for text in data] if self.remove_special_characters else data
+         )  # Remove special characters
+         data = [re.sub(r"\s+", " ", text) for text in data] if self.remove_extra_spaces else data  # Remove extra spaces
+
+         # Remove leading and trailing whitespace
+         return [text.strip() for text in data]
+
+
+ class TextLemmatizer(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.lemmatizer = WordNetLemmatizer()
+
+     def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextLemmatizer:
+         return self
+
+     def transform(self, data: list[str], _labels: list[int] | None = None) -> list[str]:
+         return [self.lemmatizer.lemmatize(text) for text in data]
+
+
+ def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
+     """Load the sentiment140 dataset and make it suitable for use.
+
+     Args:
+         include_neutral: Whether to include neutral sentiment
+
+     Returns:
+         Text and label data
+
+     Raises:
+         FileNotFoundError: If the dataset is not found
+     """
+     # Check if the dataset exists
+     if not SENTIMENT140_PATH.exists():
+         msg = (
+             f"Sentiment140 dataset not found at: '{SENTIMENT140_PATH}'\n"
+             "Please download the dataset from:\n"
+             f"{SENTIMENT140_URL}"
+         )
+         raise FileNotFoundError(msg)
+
+     # Load the dataset
+     data = pd.read_csv(
+         SENTIMENT140_PATH,
+         encoding="ISO-8859-1",
+         names=[
+             "target",  # 0 = negative, 2 = neutral, 4 = positive
+             "id",  # The id of the tweet
+             "date",  # The date of the tweet
+             "flag",  # The query, NO_QUERY if not present
+             "user",  # The user that tweeted
+             "text",  # The text of the tweet
+         ],
+     )
+
+     # Ignore rows with neutral sentiment
+     if not include_neutral:
+         data = data[data["target"] != 2]
+
+     # Map sentiment values
+     data["sentiment"] = data["target"].map(
+         {
+             0: 0,  # Negative
+             4: 1,  # Positive
+             2: 2,  # Neutral
+         },
+     )
+
+     # Return as lists
+     return data["text"].tolist(), data["sentiment"].tolist()
+
+
+ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
+     """Load the amazonreviews dataset and make it suitable for use.
+
+     Args:
+         merge: Whether to merge the test and train datasets (otherwise ignore test)
+
+     Returns:
+         Text and label data
+
+     Raises:
+         FileNotFoundError: If the dataset is not found
+     """
+     # Check if the dataset exists
+     test_exists = AMAZONREVIEWS_PATH[0].exists() or not merge
+     train_exists = AMAZONREVIEWS_PATH[1].exists()
+     if not (test_exists and train_exists):
+         msg = (
+             f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH[0]}' and '{AMAZONREVIEWS_PATH[1]}'\n"
+             "Please download the dataset from:\n"
+             f"{AMAZONREVIEWS_URL}"
+         )
+         raise FileNotFoundError(msg)
+
+     # Load the datasets
+     with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
+         train_data = [line.decode("utf-8") for line in train_file]
+
+     test_data = []
+     if merge:
+         with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
+             test_data = [line.decode("utf-8") for line in test_file]
+
+     # Merge the datasets
+     data = train_data + test_data
+
+     # Split the data into labels and text
+     labels, texts = zip(*(line.split(" ", 1) for line in data))
+
+     # Map sentiment values
+     sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
+
+     # Return as lists
+     return texts, sentiments
+
+
+ def load_imdb50k() -> tuple[list[str], list[int]]:
+     """Load the imdb50k dataset and make it suitable for use.
+
+     Returns:
+         Text and label data
+
+     Raises:
+         FileNotFoundError: If the dataset is not found
+     """
+     # Check if the dataset exists
+     if not IMDB50K_PATH.exists():
+         msg = (
+             f"IMDB50K dataset not found at: '{IMDB50K_PATH}'\n"
+             "Please download the dataset from:\n"
+             f"{IMDB50K_URL}"
+         )  # fmt: off
+         raise FileNotFoundError(msg)
+
+     # Load the dataset
+     data = pd.read_csv(IMDB50K_PATH)
+
+     # Map sentiment values
+     data["sentiment"] = data["sentiment"].map(
+         {
+             "positive": 1,
+             "negative": 0,
+         },
+     )
+
+     # Return as lists
+     return data["review"].tolist(), data["sentiment"].tolist()
+
+
+ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k"]) -> tuple[list[str], list[int]]:
+     """Load and preprocess the specified dataset.
+
+     Args:
+         dataset: Dataset to load
+
+     Returns:
+         Text and label data
+
+     Raises:
+         ValueError: If the dataset is not recognized
+     """
+     match dataset:
+         case "sentiment140":
+             return load_sentiment140(include_neutral=False)
+         case "amazonreviews":
+             return load_amazonreviews(merge=True)
+         case "imdb50k":
+             return load_imdb50k()
+         case _:
+             msg = f"Unknown dataset: {dataset}"
+             raise ValueError(msg)
+
+
+ def create_model(
+     max_features: int,
+     seed: int | None = None,
+ ) -> Pipeline:
+     """Create a sentiment analysis model.
+
+     Args:
+         max_features: Maximum number of features
+         seed: Random seed (None for random seed)
+
+     Returns:
+         Untrained model
+     """
+     return Pipeline(
+         [
+             # Text preprocessing
+             ("clean", TextCleaner()),
+             ("lemma", TextLemmatizer()),
+             # Preprocess (NOTE: Can be replaced with TfidfVectorizer, but left for clarity)
+             ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=max_features)),
+             ("tfidf", TfidfTransformer()),
+             # Classifier
+             ("clf", LogisticRegression(max_iter=1000, random_state=seed)),
+         ],
+         memory=Memory(CACHE_DIR, verbose=0),
+     )
+
+
+ def train_model(
+     model: Pipeline,
+     text_data: list[str],
+     label_data: list[int],
+     seed: int = 42,
+ ) -> float:
+     """Train the sentiment analysis model.
+
+     Args:
+         model: Untrained model
+         text_data: Text data
+         label_data: Label data
+         seed: Random seed (None for random seed)
+
+     Returns:
+         Accuracy score
+     """
+     text_train, text_test, label_train, label_test = train_test_split(
+         text_data,
+         label_data,
+         test_size=0.2,
+         random_state=seed,
+     )
+
+     with warnings.catch_warnings():
+         warnings.simplefilter("ignore")
+         model.fit(text_train, label_train)
+
+     return model.score(text_test, label_test)
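
For reference, a sketch of the same train-and-evaluate flow the CLI drives, using the new module API directly. It assumes the IMDb CSV has already been downloaded to `data/imdb50k.csv` and that the NLTK WordNet data needed by `TextLemmatizer` is available locally.

```python
import joblib

from app.constants import MODELS_DIR
from app.model import create_model, load_data, train_model

# Load and preprocess the dataset, build the pipeline, then train and score it
# on a 20% hold-out split.
text_data, label_data = load_data("imdb50k")
model = create_model(max_features=20000, seed=42)
accuracy = train_model(model, text_data, label_data)

print(f"Hold-out accuracy: {accuracy:.2%}")
joblib.dump(model, MODELS_DIR / "imdb50k_tfidf_ft-20000.pkl")
```
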
app/utils.py DELETED
@@ -1,164 +0,0 @@
- """Utility functions"""
-
- from __future__ import annotations
-
- import itertools
- import re
- import warnings
- from collections import deque
- from enum import Enum
- from functools import lru_cache
- from threading import Event, Lock
- from typing import Any
-
- from joblib import Memory
- from numpy.random import RandomState
-
- from constants import CACHE_DIR, DEFAULT_SEED
-
- __all__ = ["colorize", "wrap_queued_call", "get_random_state", "get_cache_memory"]
-
-
- ANSI_RESET = 0
-
-
- class Color(Enum):
-     """ANSI color codes."""
-
-     BLACK = 30
-     RED = 31
-     GREEN = 32
-     YELLOW = 33
-     BLUE = 34
-     MAGENTA = 35
-     CYAN = 36
-     WHITE = 37
-
-
- class Style(Enum):
-     """ANSI style codes."""
-
-     BOLD = 1
-     DIM = 2
-     ITALIC = 3
-     UNDERLINE = 4
-     BLINK = 5
-     INVERTED = 7
-     HIDDEN = 8
-
-
- # https://gist.github.com/vitaliyp/6d54dd76ca2c3cdfc1149d33007dc34a
- class FIFOLock:
-     def __init__(self):
-         self._lock = Lock()
-         self._inner_lock = Lock()
-         self._pending_threads = deque()
-
-     def acquire(self, blocking: bool = True) -> bool:
-         with self._inner_lock:
-             lock_acquired = self._lock.acquire(False)
-             if lock_acquired:
-                 return True
-             if not blocking:
-                 return False
-
-             release_event = Event()
-             self._pending_threads.append(release_event)
-
-         release_event.wait()
-         return self._lock.acquire()
-
-     def release(self) -> None:
-         with self._inner_lock:
-             if self._pending_threads:
-                 release_event = self._pending_threads.popleft()
-                 release_event.set()
-
-             self._lock.release()
-
-     __enter__ = acquire
-
-     def __exit__(self, _t, _v, _tb):  # noqa: ANN001
-         self.release()
-
-
- @lru_cache(maxsize=1)
- def get_queue_lock() -> FIFOLock:
-     return FIFOLock()
-
-
- @lru_cache(maxsize=1)
- def get_random_state(seed: int = DEFAULT_SEED) -> RandomState:
-     return RandomState(seed)
-
-
- @lru_cache(maxsize=1)
- def get_cache_memory() -> Memory:
-     return Memory(CACHE_DIR, verbose=0)
-
-
- def to_ansi(code: int) -> str:
-     """Convert an integer to an ANSI escape code."""
-     return f"\033[{code}m"
-
-
- @lru_cache(maxsize=None)
- def get_ansi_color(color: Color, bright: bool = False, background: bool = False) -> str:
-     """Get ANSI color code for the specified color, brightness and background."""
-     code = color.value
-     if bright:
-         code += 60
-     if background:
-         code += 10
-     return to_ansi(code)
-
-
- def replace_color_tag(color: Color, text: str) -> None:
-     """Replace both dark and light color tags for background and foreground."""
-     for bright, bg in itertools.product([False, True], repeat=2):
-         tag = f"{'BG_' if bg else ''}{'BRIGHT_' if bright else ''}{color.name}"
-         text = text.replace(f"[{tag}]", get_ansi_color(color, bright=bright, background=bg))
-         text = text.replace(f"[/{tag}]", to_ansi(ANSI_RESET))
-
-     return text
-
-
- @lru_cache(maxsize=256)
- def colorize(text: str, strip: bool = True) -> str:
-     """Format text with ANSI color codes using tags [COLOR], [BG_COLOR] and [STYLE].
-     Reset color/style with [/TAG].
-     Escape with double brackets [[]]. Strip leading and trailing whitespace if strip=True.
-     """
-
-     # replace foreground and background color tags
-     for color in Color:
-         text = replace_color_tag(color, text)
-
-     # replace style tags
-     for style in Style:
-         text = text.replace(f"[{style.name}]", to_ansi(style.value)).replace(f"[/{style.name}]", to_ansi(ANSI_RESET))
-
-     # if there are any tags left, remove them and throw a warning
-     pat1 = re.compile(r"((?<!\[)\[)([^\[\]]*)(\](?!\]))")
-     for match in pat1.finditer(text):
-         color = match.group(1)
-         text = text.replace(match.group(0), "")
-         warnings.warn(f"Invalid color tag: {color!r}", UserWarning, stacklevel=2)
-
-     # escape double brackets
-     pat2 = re.compile(r"\[\[[^\[\]\v]+\]\]")
-     text = pat2.sub("", text)
-
-     # reset color/style at the end
-     text += to_ansi(ANSI_RESET)
-
-     return text.strip() if strip else text
-
-
- # https://github.com/AUTOMATIC1111/stable-diffusion-webui/modules/call_queue.py
- def wrap_queued_call(func: callable) -> callable:
-     def f(*args, **kwargs) -> Any:  # noqa: ANN003, ANN002
-         with get_queue_lock():
-             return func(*args, **kwargs)
-
-     return f
deprecated/__init__.py DELETED
File without changes
deprecated/main.py DELETED
@@ -1,44 +0,0 @@
- from __future__ import annotations
-
- from pathlib import Path
-
- import click
- import joblib
-
- from app.utils import colorize
-
-
- @click.group()
- def cli() -> None: ...
-
-
- @cli.command("predict")
- @click.option(
-     "-m",
-     "--model",
-     "model_path",
-     default="models/model.pkl",
-     help="Path to the model file.",
-     type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
- )
- @click.argument("text", nargs=-1)
- def predict(model_path: Path, text: list[str]) -> None:
-     input_text = " ".join(text).strip()
-     if not input_text:
-         click.echo("[RED]Error[/RED]: Input text is empty.")
-         return
-
-     # Load the model
-     click.echo("Loading model... ", nl=False)
-     model = joblib.load(model_path)
-     click.echo(colorize("[GREEN]DONE"))
-
-     # Run the model
-     click.echo("Performing sentiment analysis... ", nl=False)
-     prediction = model.predict([input_text])
-     sentiment = "[GREEN]POSITIVE" if prediction[0] == 1 else "[RED]NEGATIVE"
-     click.echo(colorize(sentiment))
-
-
- if __name__ == "__main__":
-     cli()
deprecated/train.py DELETED
@@ -1,152 +0,0 @@
- from __future__ import annotations
-
- import warnings
- from pathlib import Path
- from typing import TYPE_CHECKING
-
- import click
- import joblib
- import pandas as pd
- from numpy.random import RandomState
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import accuracy_score, classification_report
- from sklearn.model_selection import train_test_split
- from sklearn.pipeline import Pipeline
-
- if TYPE_CHECKING:
-     from sklearn.base import BaseEstimator
-
- SEED = 42
- DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
- STOPWORDS_PATH = Path("data/stopwords-en.txt")
- CHECKPOINT_PATH = Path("cache/pipeline.pkl")
- MODELS_DIR = Path("models")
- CACHE_DIR = Path("cache")
- MAX_FEATURES = 10000  # 500000
-
- # Make sure paths exist
- MODELS_DIR.mkdir(parents=True, exist_ok=True)
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
-
- # Memory cache for sklearn pipelines
- mem = joblib.Memory(CACHE_DIR, verbose=0)
-
- # TODO: use xgboost
-
-
- def get_random_state(seed: int = SEED) -> RandomState:
-     return RandomState(seed)
-
-
- def load_data() -> tuple[list[str], list[int]]:
-     """The model takes in a list of strings and a list of integers where 1 is positive sentiment and 0 is negative sentiment."""
-     data = pd.read_csv(
-         DATASET_PATH,
-         encoding="ISO-8859-1",
-         names=[
-             "target",  # 0 = negative, 2 = neutral, 4 = positive
-             "id",  # The id of the tweet
-             "date",  # The date of the tweet
-             "flag",  # The query, NO_QUERY if not present
-             "user",  # The user that tweeted
-             "text",  # The text of the tweet
-         ],
-     )
-
-     # Ignore rows with neutral sentiment
-     data = data[data["target"] != 2]
-
-     # Create new column called "sentiment" with 1 for positive and 0 for negative
-     data["sentiment"] = data["target"] == 4
-
-     # Drop the columns we don't need
-     # data = data.drop(columns=["target", "id", "date", "flag", "user"])  # NOTE: No need, since we return the columns we need
-
-     # Return as lists
-     return list(data["text"]), list(data["sentiment"])
-
-
- def create_pipeline(clf: BaseEstimator) -> Pipeline:
-     return Pipeline(
-         [
-             # Preprocess
-             # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
-             # ("tfidf", TfidfTransformer()),
-             ("vectorize", TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
-             # Classifier
-             ("clf", clf),
-         ],
-         memory=mem,
-     )
-
-
- def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
-     y_pred = pipeline.predict(x)
-     report = classification_report(y, y_pred)
-     click.echo(report)
-
-     # TODO: Confusion matrix
-
-     return accuracy_score(y, y_pred)
-
-
- def export_pipeline(pipeline: Pipeline, name: str) -> None:
-     model_path = MODELS_DIR / f"{name}.pkl"
-     joblib.dump(pipeline, model_path)
-     click.echo(f"Model exported to {model_path!r}")
-
-
- @click.command()
- @click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
- @click.option("--evaluate", is_flag=True, help="Evaluate the model.")
- @click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
- @click.option("--seed", type=int, default=SEED, help="Random seed.")
- def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
-     rng = get_random_state(seed)
-
-     # Clear sklearn cache
-     if flush_cache:
-         click.echo("Clearing cache... ", nl=False)
-         mem.clear(warn=False)
-         click.echo("DONE")
-
-     # Load and split data
-     click.echo("Loading data... ", nl=False)
-     x, y = load_data()
-     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rng)
-     click.echo("DONE")
-
-     # Train model
-     if retrain or not CHECKPOINT_PATH.exists():
-         click.echo("Training model... ", nl=False)
-         clf = LogisticRegression(max_iter=1000, random_state=rng)
-         model = create_pipeline(clf)
-         with warnings.catch_warnings():
-             warnings.simplefilter("ignore")  # Ignore joblib warnings
-             model.fit(x_train, y_train)
-         joblib.dump(model, CHECKPOINT_PATH)
-         click.echo("DONE")
-     else:
-         click.echo("Loading model... ", nl=False)
-         model = joblib.load(CHECKPOINT_PATH)
-         click.echo("DONE")
-
-     # Evaluate model
-     if evaluate:
-         evaluate_pipeline(model, x_test, y_test)
-
-     # Quick test
-     test_text = ["I love this movie", "I hate this movie"]
-     click.echo("Quick test:")
-     for text in test_text:
-         click.echo(f"\t{'positive' if model.predict([text])[0] else 'negative'}: {text}")
-
-     # Export model
-     click.echo("Exporting model... ", nl=False)
-     export_pipeline(model, "logistic_regression")
-     click.echo("DONE")
-
-
- if __name__ == "__main__":
-     train()
justfile CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env just --justfile

  @default:
-     echo "No target specified."
+     just --list

  @lint:
      poetry run pre-commit run --all-files
@@ -16,8 +16,6 @@
  @requirements:
      poetry export -f requirements.txt --output requirements.txt --without dev

- @run +TEXT:
-     poetry run python main.py predict --model models/logistic_regression.pkl "{{TEXT}}"
-
- @gui:
-     poetry run gradio app/gui.py
+ [no-exit-message]
+ @app *ARGS:
+     poetry run python -m app {{ARGS}}

notebook.ipynb ADDED
@@ -0,0 +1,152 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Sentiment Analysis"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from __future__ import annotations\n",
+     "\n",
+     "import re\n",
+     "from functools import cache\n",
+     "\n",
+     "import matplotlib.pyplot as plt\n",
+     "import pandas as pd\n",
+     "import seaborn as sns"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Load the data"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "data: pd.DataFrame = None # TODO: load dataset\n",
+     "stopwords: set[str] = None # TODO: load stopwords"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Explore the data"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Plot the distribution\n",
+     "_, ax = plt.subplots(figsize=(6, 4))\n",
+     "data[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n",
+     "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n",
+     "ax.set_xlabel(\"Sentiment\")\n",
+     "ax.grid(False)\n",
+     "plt.show()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "@cache\n",
+     "def extract_words(text: str) -> list[str]:\n",
+     "    return re.findall(r\"(\\b[^\\s]+\\b)\", text.lower())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Extract words and count them\n",
+     "words = data[\"text\"].apply(extract_words).explode()\n",
+     "word_counts = words.value_counts().reset_index()\n",
+     "word_counts.columns = [\"word\", \"count\"]\n",
+     "word_counts.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Plot the most common words\n",
+     "_, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
+     "\n",
+     "sns.barplot(data=word_counts.head(10), x=\"count\", y=\"word\", ax=ax1)\n",
+     "ax1.set_title(\"Most common words\")\n",
+     "ax1.grid(False)\n",
+     "ax1.tick_params(axis=\"x\", rotation=45)\n",
+     "\n",
+     "ax2.set_title(\"Most common words (excluding stopwords)\")\n",
+     "sns.barplot(\n",
+     "    data=word_counts[~word_counts[\"word\"].isin(stopwords)].head(10),\n",
+     "    x=\"count\",\n",
+     "    y=\"word\",\n",
+     "    ax=ax2,\n",
+     ")\n",
+     "ax2.grid(False)\n",
+     "ax2.tick_params(axis=\"x\", rotation=45)\n",
+     "ax2.set_ylabel(\"\")\n",
+     "\n",
+     "plt.show()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Find best classifier"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Find best hyperparameters"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "name": "python",
+    "version": "3.12.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
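
The notebook leaves `data` and `stopwords` as TODO placeholders. One way to fill them, assuming the IMDb CSV and a newline-separated stopword list live under `data/` (both paths are assumptions), is:

```python
import pandas as pd

# Load IMDb reviews and normalize the columns the notebook's plots expect.
data = pd.read_csv("data/imdb50k.csv").rename(columns={"review": "text"})
data["sentiment"] = data["sentiment"].map({"negative": 0, "positive": 1})

# Load the stopword list into a set for the word-count plot.
with open("data/stopwords-en.txt", encoding="utf-8") as f:
    stopwords = set(f.read().split())
```
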
poetry.lock CHANGED
Binary files a/poetry.lock and b/poetry.lock differ
 
pyproject.toml CHANGED
@@ -1,13 +1,14 @@
  [tool.poetry]
  name = "sentiment-analysis"
  package-mode = false
- packages = [{ include = "app" }]

  [tool.poetry.dependencies]
  python = "^3.12"
  click = "^8.1.7"
  scikit-learn = "^1.4.2"
  gradio = "^4.31.0"
+ colorama = "^0.4.6"
+ nltk = "^3.8.1"

  [tool.poetry.group.train.dependencies]
  pandas = "^2.2.2"