"""
Evaluate all PhilVerify classifiers on the held-out validation split.

Prints per-class precision/recall/F1, confusion matrix, and a side-by-side
accuracy summary for all model variants:

Classical (trained on train split):
    BoW + LogReg
    BoW + LogReg + Lemma
    TF-IDF + LogReg (legacy SEED_DATA baseline)
    TF-IDF + NB
    TF-IDF + NB + Lemma
    LDA features + LogReg

Transformer (loaded from saved checkpoints):
    XLM-RoBERTa
    Tagalog-RoBERTa
    Ensemble (XLM-R + Tagalog-RoBERTa)

Usage:
    cd PhilVerify
    python -m ml.eval
    python -m ml.eval --seed 42 --train-ratio 0.8 --skip-lda-analysis
"""
import argparse
import logging

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from ml.bow_classifier import BoWClassifier
from ml.dataset import LABEL_NAMES, get_split
from ml.ensemble_classifier import EnsembleClassifier
from ml.lda_analysis import LDAFeatureClassifier, run_topic_analysis
from ml.naive_bayes_classifier import NaiveBayesClassifier
from ml.tagalog_roberta_classifier import TagalogRobertaClassifier
from ml.tfidf_classifier import TFIDFClassifier
from ml.xlm_roberta_classifier import ModelNotFoundError, XLMRobertaClassifier

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

LABEL_LIST = [LABEL_NAMES[i] for i in sorted(LABEL_NAMES)]
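# NOTE (assumption): LABEL_NAMES is taken to be a mapping from integer label
# ids to verdict strings; sorting the ids gives every classification report
# and confusion matrix below a stable, reproducible column order.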


def evaluate_classifier(name: str, clf, samples: list) -> dict:
    """Run `clf` over `samples`, print a per-class report and confusion
    matrix, and return the model name with its overall accuracy."""
    true_labels, pred_labels = [], []
    for s in samples:
        result = clf.predict(s.text)
        true_labels.append(LABEL_NAMES[s.label])
        pred_labels.append(result.verdict)

    print(f"\n{'='*62}")
    print(f" {name}")
    print(f"{'='*62}")
    print(classification_report(true_labels, pred_labels, labels=LABEL_LIST, zero_division=0))

    print("Confusion matrix (rows = true, cols = predicted):")
    print(f" {'':14}", " ".join(f"{lbl[:6]:>6}" for lbl in LABEL_LIST))
    cm = confusion_matrix(true_labels, pred_labels, labels=LABEL_LIST)
    for row_label, row in zip(LABEL_LIST, cm):
        print(f" {row_label:<14}", " ".join(f"{v:>6}" for v in row))

    acc = accuracy_score(true_labels, pred_labels)
    return {"name": name, "accuracy": acc}
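

# Illustrative sketch, not part of the original suite: evaluate_classifier is
# duck-typed, so any object whose .predict(text) returns a result carrying a
# .verdict attribute can be scored, provided each sample exposes .text and
# .label. This hypothetical constant-prediction baseline makes that contract
# concrete and doubles as a majority-class accuracy floor, e.g.
#   evaluate_classifier("Constant baseline", ConstantBaseline(LABEL_LIST[0]), val_samples)
from types import SimpleNamespace  # used only by the ConstantBaseline sketch


class ConstantBaseline:
    """Always predicts the same verdict, regardless of input text."""

    def __init__(self, verdict: str):
        self._verdict = verdict

    def predict(self, text: str) -> SimpleNamespace:
        # Mimic the real classifiers' result objects, which expose .verdict.
        return SimpleNamespace(verdict=self._verdict)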


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate PhilVerify classifiers")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed (must match training seed)")
    parser.add_argument("--train-ratio", type=float, default=0.8,
                        help="Train split ratio (must match training)")
    parser.add_argument("--skip-lda-analysis", action="store_true",
                        help="Skip the LDA topic analysis printout")
    args = parser.parse_args()

    train_samples, val_samples = get_split(train_ratio=args.train_ratio, seed=args.seed)
    logger.info(
        "Train: %d samples | Val: %d samples (seed=%d, train_ratio=%.1f)",
        len(train_samples), len(val_samples), args.seed, args.train_ratio,
    )
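    # Reusing the training-time seed and train ratio is assumed to make
    # get_split return the identical split, so val_samples is the same
    # held-out set the models never saw during training.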

    # ── LDA topic analysis (printed before classifier comparison) ────────────
    if not args.skip_lda_analysis:
        run_topic_analysis(train_samples)

    results: list[dict] = []

    # ── Classical baselines (all trained on train_samples for fair comparison) ─
    results.append(evaluate_classifier(
        "BoW + LogReg",
        BoWClassifier(train_samples),
        val_samples,
    ))
    results.append(evaluate_classifier(
        "BoW + LogReg + Lemma",
        BoWClassifier(train_samples, lemmatize=True),
        val_samples,
    ))
    # Legacy baseline (trains on internal SEED_DATA, not the split; included for reference)
    results.append(evaluate_classifier(
        "TF-IDF + LogReg [legacy SEED_DATA]",
        TFIDFClassifier(),
        val_samples,
    ))
    results.append(evaluate_classifier(
        "TF-IDF + NB",
        NaiveBayesClassifier(train_samples),
        val_samples,
    ))
    results.append(evaluate_classifier(
        "TF-IDF + NB + Lemma",
        NaiveBayesClassifier(train_samples, lemmatize=True),
        val_samples,
    ))
    results.append(evaluate_classifier(
        "LDA features + LogReg",
        LDAFeatureClassifier(train_samples),
        val_samples,
    ))

    # ── Transformer models ───────────────────────────────────────────────────
    xlmr = None
    try:
        xlmr = XLMRobertaClassifier()
        results.append(evaluate_classifier("XLM-RoBERTa", xlmr, val_samples))
    except ModelNotFoundError:
        logger.warning("XLM-RoBERTa checkpoint not found; skipping")

    tl = None
    try:
        tl = TagalogRobertaClassifier()
        results.append(evaluate_classifier("Tagalog-RoBERTa", tl, val_samples))
    except ModelNotFoundError:
        logger.warning("Tagalog-RoBERTa checkpoint not found; skipping")

    if xlmr is not None and tl is not None:
        ensemble = EnsembleClassifier([xlmr, tl])
        results.append(evaluate_classifier(
            "Ensemble (XLM-R + Tagalog-RoBERTa)", ensemble, val_samples
        ))
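    # The ensemble is evaluated only when both transformer checkpoints loaded
    # successfully; how the two models' predictions are combined is defined in
    # ml.ensemble_classifier.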

    # ── Summary table ────────────────────────────────────────────────────────
    print(f"\n{'='*62}")
    print(" Summary")
    print(f"{'='*62}")
    print(f" {'Model':<44} {'Accuracy':>8}")
    print(f" {'-'*44} {'-'*8}")
    classical_done = False
    for r in results:
        # Name-based heuristic: entries mentioning a transformer keyword are
        # grouped into the second (transformer) section of the table.
        is_transformer = any(
            kw in r["name"] for kw in ("XLM", "RoBERTa", "Tagalog", "Ensemble")
        )
        if is_transformer and not classical_done:
            print()  # blank separator between classical and transformer sections
            classical_done = True
        print(f" {r['name']:<44} {r['accuracy'] * 100:>7.1f}%")

    best = max(results, key=lambda r: r["accuracy"])
    print(f"\n Best: {best['name']} ({best['accuracy'] * 100:.1f}%)")
    print()


if __name__ == "__main__":
    main()