Spaces:
Running
Running
from __future__ import annotations | |
import warnings | |
import en_core_web_sm | |
import numpy as np | |
from joblib import Memory | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split | |
from sklearn.pipeline import Pipeline | |
from tqdm import tqdm | |
from app.constants import CACHE_DIR | |
# Names exported by ``from <module> import *``.
__all__ = [
    "create_model",
    "train_model",
    "evaluate_model",
]

# Shared spaCy English pipeline, loaded once at import time and used by
# TextTokenizer.transform. The listed components are switched off.
# NOTE(review): lemmatization quality may depend on the tagger, which in
# some spaCy models listens to "tok2vec" - confirm disabling it is intended.
nlp = en_core_web_sm.load(
    disable=["tok2vec", "parser", "ner"],
)
class TextTokenizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that converts raw texts into lists of clean tokens.

    Every document is run through the module-level spaCy pipeline ``nlp``.
    Stop words, punctuation, e-mail addresses, URLs and number-like tokens are
    dropped; the remaining tokens are lemmatized, lowercased, stripped of a
    leading ``#`` and kept only if they are alphanumeric and long enough.

    Args:
        character_threshold: Minimum length a cleaned token must have to be kept
        batch_size: Batch size handed to ``nlp.pipe``
        n_jobs: Number of processes handed to ``nlp.pipe`` (``n_process``)
        progress: Whether to display a tqdm progress bar while transforming
    """

    def __init__(
        self,
        *,
        character_threshold: int = 2,
        batch_size: int = 1024,
        n_jobs: int = 8,
        progress: bool = True,
    ) -> None:
        # Only store the parameters unchanged, as scikit-learn estimators are
        # expected to do (keeps get_params / set_params / clone working).
        self.character_threshold = character_threshold
        self.batch_size = batch_size
        self.n_jobs = n_jobs
        self.progress = progress

    def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextTokenizer:
        """Do nothing; the tokenizer has no state to learn.

        Args:
            _data: Ignored
            _labels: Ignored

        Returns:
            The transformer itself
        """
        return self

    def transform(self, data: list[str]) -> list[list[str]]:
        """Tokenize and clean every text in ``data``.

        Args:
            data: Raw input texts

        Returns:
            One list of cleaned tokens per input text, in input order
        """
        docs = tqdm(
            nlp.pipe(data, batch_size=self.batch_size, n_process=self.n_jobs),
            total=len(data),
            disable=not self.progress,
        )
        tokenized = []
        for doc in docs:
            cleaned = (self._clean_token(token) for token in doc)
            tokenized.append([tok for tok in cleaned if tok is not None])
        return tokenized

    def _clean_token(self, token) -> str | None:
        """Return the normalized form of a spaCy token, or None to drop it."""
        # Ignore stop words and punctuation
        if token.is_stop or token.is_punct:
            return None

        # Ignore emails, URLs and numbers
        # (the URL check used to be a second, redundant e-mail check)
        if token.like_email or token.like_url or token.like_num:
            return None

        # Lemmatize and lowercase
        tok = token.lemma_.lower().strip()

        # Format hashtags
        if tok.startswith("#"):
            tok = tok[1:]

        # Ignore short and non-alphanumeric tokens
        if len(tok) < self.character_threshold or not tok.isalnum():
            return None

        # TODO: Emoticons and emojis
        # TODO: Spelling correction
        return tok
def identity(x: list[str]) -> list[str]:
    """Pass the input through untouched.

    Serves as a no-op ``tokenizer`` / ``preprocessor`` callable for
    TfidfVectorizer.

    Args:
        x: Input data

    Returns:
        The very same object that was passed in
    """
    return x
def create_model(
    max_features: int,
    seed: int | None = None,
    verbose: bool = False,
) -> Pipeline:
    """Build the (unfitted) sentiment analysis pipeline.

    The pipeline chains three steps: ``tokenizer`` (TextTokenizer),
    ``vectorizer`` (TF-IDF over unigrams and bigrams) and ``classifier``
    (logistic regression). The pipeline is given a joblib ``Memory`` located
    at ``CACHE_DIR``.

    Args:
        max_features: Maximum number of features
        seed: Random seed (None for random seed)
        verbose: Whether to log progress during training

    Returns:
        Untrained model
    """
    tokenizer_step = TextTokenizer(progress=True)

    # The tokenizer step already yields token lists, so every text-processing
    # stage of the vectorizer is replaced by a pass-through or switched off.
    vectorizer_step = TfidfVectorizer(
        tokenizer=identity,
        preprocessor=identity,
        lowercase=False,
        token_pattern=None,
        ngram_range=(1, 2),
        max_features=max_features,
    )

    classifier_step = LogisticRegression(max_iter=1000, C=1.0, random_state=seed)

    steps = [
        ("tokenizer", tokenizer_step),
        ("vectorizer", vectorizer_step),
        ("classifier", classifier_step),
    ]
    return Pipeline(steps, memory=Memory(CACHE_DIR, verbose=0), verbose=verbose)
def train_model(
    model: BaseEstimator,
    text_data: list[str],
    label_data: list[int],
    seed: int | None = 42,
) -> tuple[BaseEstimator, float]:
    """Train the sentiment analysis model.

    The data is split 80/20 into train and test parts. A randomized search
    (10 candidates, 5-fold cross-validation, accuracy) over the classifier's
    regularization strength and penalty is run on the train part, and the
    best refitted estimator is scored on the held-out test part.

    Args:
        model: Untrained model; must expose a ``classifier`` step whose
            ``C``, ``penalty`` and ``solver`` parameters can be set
        text_data: Text data
        label_data: Label data
        seed: Random seed (None for random seed)

    Returns:
        Trained model and accuracy
    """
    text_train, text_test, label_train, label_test = train_test_split(
        text_data,
        label_data,
        test_size=0.2,
        random_state=seed,
    )

    # The L1 penalty is not supported by the default "lbfgs" solver, so the
    # search space is split in two: L2 candidates keep the model's own solver,
    # L1 candidates are paired with "saga", which supports it.
    c_values = np.logspace(-4, 4, 20)
    param_distributions = [
        {
            "classifier__C": c_values,
            "classifier__penalty": ["l2"],
        },
        {
            "classifier__C": c_values,
            "classifier__penalty": ["l1"],
            "classifier__solver": ["saga"],
        },
    ]

    search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=10,
        cv=5,
        scoring="accuracy",
        random_state=seed,
        n_jobs=-1,
    )

    # Silence every warning raised in this process while the search runs.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search.fit(text_train, label_train)

    best_model = search.best_estimator_
    return best_model, best_model.score(text_test, label_test)
def evaluate_model(
    model: Pipeline,
    text_data: list[str],
    label_data: list[int],
    folds: int = 5,
) -> tuple[float, float]:
    """Cross-validate the model and summarize its accuracy.

    The model is handed to ``cross_val_score``, which takes care of fitting
    and scoring it on each of the ``folds`` splits.

    Args:
        model: Model to evaluate
        text_data: Text data
        label_data: Label data
        folds: Number of cross-validation folds

    Returns:
        Mean accuracy and standard deviation
    """
    fold_accuracies = cross_val_score(
        model,
        text_data,
        label_data,
        scoring="accuracy",
        cv=folds,
    )
    mean_accuracy = fold_accuracies.mean()
    accuracy_std = fold_accuracies.std()
    return mean_accuracy, accuracy_std