Spaces:
Running
Running
Ryan Christian D. Deniega
feat: extension button placement, text extraction, OCR display + ML improvements
c78c2c1 | """ | |
| PhilVerify — Bag of Words + Logistic Regression Classifier (Layer 1) | |
| CountVectorizer (BoW) with LogisticRegression. Identical to TFIDFClassifier except | |
| for the vectorizer — this isolates the BoW vs TF-IDF comparison in eval.py. | |
| Supports optional WordNet lemmatization. | |
| """ | |
| import logging | |
| from ml.naive_bayes_classifier import _lemmatize_tokens | |
| from ml.tfidf_classifier import Layer1Result | |
| logger = logging.getLogger(__name__) | |
class BoWClassifier:
    """
    Bag-of-Words text classifier: CountVectorizer features + LogisticRegression.

    The model is fitted inside the constructor, so an instance can be passed
    straight to ``predict`` once it has been created.

    Args:
        train_samples: list[Sample] from ml.dataset; each sample must expose
            ``.text`` and ``.label``. When None, the samples returned by
            ``ml.dataset.get_dataset()`` are used.
        lemmatize: apply WordNet lemmatization (via ``_lemmatize_tokens``) to
            the lower-cased text before it is vectorized.
    """

    # Integer class predicted by the model -> verdict string reported to callers.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self, train_samples=None, lemmatize: bool = False):
        """Fit the vectorizer and the classifier on ``train_samples``."""
        # Imported at call time, so importing this module alone does not load sklearn.
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.linear_model import LogisticRegression

        # Has to be in place before _preprocess() runs on the training texts.
        self._lemmatize = lemmatize

        if train_samples is None:
            from ml.dataset import get_dataset

            train_samples = get_dataset()

        corpus = [self._preprocess(sample.text) for sample in train_samples]
        targets = [sample.label for sample in train_samples]

        # Unigrams + bigrams, vocabulary capped at 1000 entries.
        self._vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 2))
        train_matrix = self._vectorizer.fit_transform(corpus)

        self._clf = LogisticRegression(C=1.0, max_iter=500, random_state=42)
        self._clf.fit(train_matrix, targets)

        logger.info(
            "BoWClassifier trained on %d samples (lemmatize=%s)",
            len(corpus), lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lower-case ``text`` and, when enabled, lemmatize its whitespace tokens."""
        lowered = text.lower()
        if not self._lemmatize:
            return lowered
        tokens = lowered.split()
        return " ".join(_lemmatize_tokens(tokens))

    def predict(self, text: str) -> Layer1Result:
        """
        Classify a single text.

        Returns:
            Layer1Result carrying the verdict string, the winning class
            probability as a percentage rounded to one decimal place, and up
            to five vocabulary terms with the highest non-zero counts in
            ``text``.
        """
        row = self._vectorizer.transform([self._preprocess(text)])

        label_id = int(self._clf.predict(row)[0])
        class_probs = self._clf.predict_proba(row)[0]
        confidence_pct = round(float(max(class_probs)) * 100, 1)

        vocabulary = self._vectorizer.get_feature_names_out()
        counts = row.toarray()[0]

        # Indices of the five largest counts, highest first.
        ranked = counts.argsort()[::-1][:5]
        triggered_terms = []
        for idx in ranked:
            # Skip vocabulary entries that never occur in this text.
            if counts[idx] > 0:
                triggered_terms.append(vocabulary[idx])

        return Layer1Result(
            verdict=self._LABELS[label_id],
            confidence=confidence_pct,
            triggered_features=triggered_terms,
        )