Instructions to use undertheseanlp/sen-1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use undertheseanlp/sen-1 with Scikit-learn:
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("undertheseanlp/sen-1", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
- Notebooks
- Google Colab
- Kaggle
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Sen-1 References - Vietnamese Text Classification</title> | |
| <style> | |
| :root { | |
| --bg: #0d1117; | |
| --surface: #161b22; | |
| --surface2: #1c2333; | |
| --border: #30363d; | |
| --text: #e6edf3; | |
| --text2: #8b949e; | |
| --accent: #58a6ff; | |
| --accent2: #3fb950; | |
| --accent3: #d2a8ff; | |
| --accent4: #f0883e; | |
| --red: #f85149; | |
| --tag-bg: #1f2937; | |
| } | |
| * { margin: 0; padding: 0; box-sizing: border-box; } | |
| body { | |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif; | |
| background: var(--bg); | |
| color: var(--text); | |
| line-height: 1.6; | |
| } | |
| a { color: var(--accent); text-decoration: none; } | |
| a:hover { text-decoration: underline; } | |
| /* Header */ | |
| .header { | |
| background: var(--surface); | |
| border-bottom: 1px solid var(--border); | |
| padding: 1.5rem 2rem; | |
| position: sticky; | |
| top: 0; | |
| z-index: 100; | |
| } | |
| .header-inner { | |
| max-width: 1200px; | |
| margin: 0 auto; | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| gap: 1rem; | |
| } | |
| .logo { | |
| display: flex; | |
| align-items: center; | |
| gap: 0.75rem; | |
| } | |
| .logo-icon { | |
| width: 36px; | |
| height: 36px; | |
| background: linear-gradient(135deg, var(--accent), var(--accent3)); | |
| border-radius: 8px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| font-size: 18px; | |
| font-weight: 700; | |
| color: #fff; | |
| } | |
| .logo h1 { | |
| font-size: 1.2rem; | |
| font-weight: 600; | |
| } | |
| .logo span { color: var(--text2); font-weight: 400; font-size: 0.9rem; } | |
| .stats { | |
| display: flex; | |
| gap: 1.5rem; | |
| font-size: 0.85rem; | |
| color: var(--text2); | |
| } | |
| .stat-num { color: var(--text); font-weight: 600; font-size: 1.1rem; } | |
| /* Tabs */ | |
| .tabs { | |
| max-width: 1200px; | |
| margin: 0 auto; | |
| padding: 0 2rem; | |
| display: flex; | |
| gap: 0; | |
| border-bottom: 1px solid var(--border); | |
| overflow-x: auto; | |
| } | |
| .tab { | |
| padding: 0.75rem 1.25rem; | |
| cursor: pointer; | |
| color: var(--text2); | |
| font-size: 0.9rem; | |
| border-bottom: 2px solid transparent; | |
| white-space: nowrap; | |
| transition: all 0.15s; | |
| } | |
| .tab:hover { color: var(--text); } | |
| .tab.active { | |
| color: var(--text); | |
| border-bottom-color: var(--accent4); | |
| font-weight: 500; | |
| } | |
| /* Content */ | |
| .content { | |
| max-width: 1200px; | |
| margin: 0 auto; | |
| padding: 2rem; | |
| } | |
| .panel { display: none; } | |
| .panel.active { display: block; } | |
| /* Paper Cards */ | |
| .papers-grid { | |
| display: grid; | |
| gap: 1rem; | |
| } | |
| .paper-card { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 1.25rem 1.5rem; | |
| transition: border-color 0.15s; | |
| } | |
| .paper-card:hover { border-color: var(--accent); } | |
| .paper-title { | |
| font-size: 1.05rem; | |
| font-weight: 600; | |
| margin-bottom: 0.5rem; | |
| display: flex; | |
| align-items: flex-start; | |
| gap: 0.5rem; | |
| } | |
| .paper-meta { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 0.75rem; | |
| font-size: 0.85rem; | |
| color: var(--text2); | |
| margin-bottom: 0.5rem; | |
| } | |
| .paper-abstract { | |
| font-size: 0.88rem; | |
| color: var(--text2); | |
| line-height: 1.5; | |
| margin-top: 0.75rem; | |
| display: -webkit-box; | |
| -webkit-line-clamp: 3; | |
| -webkit-box-orient: vertical; | |
| overflow: hidden; | |
| } | |
| .paper-abstract.expanded { | |
| -webkit-line-clamp: unset; | |
| } | |
| .tag { | |
| display: inline-block; | |
| padding: 0.15rem 0.55rem; | |
| border-radius: 12px; | |
| font-size: 0.75rem; | |
| font-weight: 500; | |
| } | |
| .tag-venue { background: #1a3a2a; color: var(--accent2); } | |
| .tag-year { background: #2a1a3a; color: var(--accent3); } | |
| .tag-file { background: var(--tag-bg); color: var(--text2); } | |
| .tag-pdf { background: #3a1a1a; color: var(--red); } | |
| .tag-tex { background: #1a2a3a; color: var(--accent); } | |
| .tag-md { background: #2a2a1a; color: var(--accent4); } | |
| .expand-btn { | |
| background: none; | |
| border: none; | |
| color: var(--accent); | |
| cursor: pointer; | |
| font-size: 0.82rem; | |
| margin-top: 0.3rem; | |
| } | |
| /* Category filters */ | |
| .filters { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 0.5rem; | |
| margin-bottom: 1.5rem; | |
| } | |
| .filter-btn { | |
| padding: 0.4rem 0.9rem; | |
| border-radius: 20px; | |
| border: 1px solid var(--border); | |
| background: var(--surface); | |
| color: var(--text2); | |
| cursor: pointer; | |
| font-size: 0.82rem; | |
| transition: all 0.15s; | |
| } | |
| .filter-btn:hover { border-color: var(--accent); color: var(--text); } | |
| .filter-btn.active { background: var(--accent); color: #000; border-color: var(--accent); font-weight: 600; } | |
| /* Tables */ | |
| table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 1rem 0; | |
| font-size: 0.88rem; | |
| } | |
| th, td { | |
| padding: 0.65rem 0.85rem; | |
| text-align: left; | |
| border-bottom: 1px solid var(--border); | |
| } | |
| th { | |
| background: var(--surface2); | |
| color: var(--text2); | |
| font-weight: 600; | |
| font-size: 0.8rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.03em; | |
| position: sticky; | |
| top: 0; | |
| } | |
| tr:hover td { background: var(--surface); } | |
| .highlight { color: var(--accent2); font-weight: 600; } | |
| /* Section headers */ | |
| .section-title { | |
| font-size: 1.4rem; | |
| font-weight: 700; | |
| margin-bottom: 0.5rem; | |
| } | |
| .section-desc { | |
| color: var(--text2); | |
| margin-bottom: 1.5rem; | |
| font-size: 0.92rem; | |
| } | |
| h3 { | |
| font-size: 1.1rem; | |
| margin: 2rem 0 0.75rem; | |
| padding-bottom: 0.4rem; | |
| border-bottom: 1px solid var(--border); | |
| } | |
| /* Citation network */ | |
| .network { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 1.5rem; | |
| font-family: 'SF Mono', 'Fira Code', 'Consolas', monospace; | |
| font-size: 0.82rem; | |
| line-height: 1.8; | |
| overflow-x: auto; | |
| white-space: pre; | |
| color: var(--text2); | |
| } | |
| .network b { color: var(--accent); font-weight: 600; } | |
| .network em { color: var(--accent2); font-style: normal; } | |
| .network u { color: var(--accent3); text-decoration: none; } | |
| /* Blockquote */ | |
| blockquote { | |
| border-left: 3px solid var(--accent3); | |
| padding: 0.75rem 1.25rem; | |
| margin: 1rem 0; | |
| background: var(--surface); | |
| border-radius: 0 6px 6px 0; | |
| font-style: italic; | |
| color: var(--text2); | |
| } | |
| blockquote strong { color: var(--text); } | |
| /* Cards row */ | |
| .cards-row { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); | |
| gap: 1rem; | |
| margin: 1rem 0; | |
| } | |
| .info-card { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 1.25rem; | |
| } | |
| .info-card h4 { | |
| font-size: 0.95rem; | |
| margin-bottom: 0.5rem; | |
| color: var(--accent); | |
| } | |
| .info-card p, .info-card li { | |
| font-size: 0.85rem; | |
| color: var(--text2); | |
| line-height: 1.6; | |
| } | |
| .info-card ul { padding-left: 1.2rem; } | |
| /* How to do science */ | |
| .principle { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 1.25rem 1.5rem; | |
| margin-bottom: 1rem; | |
| } | |
| .principle h4 { | |
| color: var(--accent4); | |
| margin-bottom: 0.5rem; | |
| } | |
| .principle p, .principle li { | |
| font-size: 0.88rem; | |
| color: var(--text2); | |
| line-height: 1.6; | |
| } | |
| .principle ul { padding-left: 1.2rem; margin-top: 0.3rem; } | |
| /* Leaderboard */ | |
| .lb-hero { | |
| text-align: center; | |
| padding: 2.5rem 1rem 1.5rem; | |
| } | |
| .lb-hero h2 { | |
| font-size: 2rem; | |
| font-weight: 800; | |
| background: linear-gradient(135deg, var(--accent), var(--accent3)); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| background-clip: text; | |
| } | |
| .lb-hero p { | |
| color: var(--text2); | |
| font-size: 0.92rem; | |
| margin-top: 0.5rem; | |
| } | |
| .lb-updated { | |
| color: var(--text2); | |
| font-size: 0.75rem; | |
| margin-top: 0.25rem; | |
| opacity: 0.7; | |
| } | |
| .lb-spotlight { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); | |
| gap: 1rem; | |
| margin: 1.5rem 0; | |
| } | |
| .lb-spot-card { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| padding: 1.25rem; | |
| transition: border-color 0.15s; | |
| } | |
| .lb-spot-card:hover { border-color: var(--accent); } | |
| .lb-spot-card h4 { | |
| font-size: 0.78rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| color: var(--text2); | |
| margin-bottom: 0.15rem; | |
| } | |
| .lb-spot-card .lb-bench-name { | |
| font-size: 0.72rem; | |
| color: var(--accent); | |
| margin-bottom: 0.75rem; | |
| } | |
| .lb-rank-list { list-style: none; } | |
| .lb-rank-list li { | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| padding: 0.35rem 0; | |
| font-size: 0.85rem; | |
| border-bottom: 1px solid rgba(48,54,61,0.5); | |
| } | |
| .lb-rank-list li:last-child { border-bottom: none; } | |
| .lb-rank-num { | |
| display: inline-flex; | |
| align-items: center; | |
| justify-content: center; | |
| width: 22px; | |
| height: 22px; | |
| border-radius: 50%; | |
| font-size: 0.7rem; | |
| font-weight: 700; | |
| margin-right: 0.6rem; | |
| flex-shrink: 0; | |
| } | |
| .lb-rank-1 { background: #fbbf24; color: #000; } | |
| .lb-rank-2 { background: #94a3b8; color: #000; } | |
| .lb-rank-3 { background: #b45309; color: #fff; } | |
| .lb-rank-other { background: var(--tag-bg); color: var(--text2); } | |
| .lb-model-name { flex: 1; } | |
| .lb-model-name.lb-sen1 { color: var(--accent4); font-weight: 600; } | |
| .lb-score { | |
| font-weight: 600; | |
| font-variant-numeric: tabular-nums; | |
| color: var(--text); | |
| } | |
| .lb-score-top { color: var(--accent2); } | |
| .lb-section-title { | |
| font-size: 1.1rem; | |
| font-weight: 700; | |
| margin: 2.5rem 0 0.5rem; | |
| padding-bottom: 0.4rem; | |
| border-bottom: 1px solid var(--border); | |
| } | |
| .lb-section-desc { | |
| color: var(--text2); | |
| font-size: 0.85rem; | |
| margin-bottom: 1rem; | |
| } | |
| /* Leaderboard table */ | |
| .lb-table-wrap { | |
| overflow-x: auto; | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| margin: 1rem 0; | |
| } | |
| .lb-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-size: 0.84rem; | |
| min-width: 900px; | |
| } | |
| .lb-table th { | |
| background: var(--surface2); | |
| color: var(--text2); | |
| font-weight: 600; | |
| font-size: 0.72rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| padding: 0.7rem 0.75rem; | |
| border-bottom: 2px solid var(--border); | |
| cursor: pointer; | |
| white-space: nowrap; | |
| user-select: none; | |
| position: sticky; | |
| top: 0; | |
| } | |
| .lb-table th:hover { color: var(--text); } | |
| .lb-table th .sort-arrow { margin-left: 0.3rem; font-size: 0.65rem; color: var(--accent); } | |
| .lb-table td { | |
| padding: 0.6rem 0.75rem; | |
| border-bottom: 1px solid rgba(48,54,61,0.5); | |
| white-space: nowrap; | |
| } | |
| .lb-table tr:hover td { background: rgba(88,166,255,0.04); } | |
| .lb-table .lb-row-sen1 { background: rgba(240,136,62,0.06); } | |
| .lb-table .lb-row-sen1:hover td { background: rgba(240,136,62,0.1); } | |
| .lb-table .model-type { | |
| display: inline-block; | |
| width: 8px; | |
| height: 24px; | |
| border-radius: 2px; | |
| margin-right: 0.5rem; | |
| vertical-align: middle; | |
| } | |
| .type-traditional { background: var(--accent4); } | |
| .type-transformer { background: var(--accent); } | |
| .type-multilingual { background: var(--accent3); } | |
| .lb-table .cell-best { color: var(--accent2); font-weight: 700; } | |
| .lb-table .cell-na { color: var(--text2); opacity: 0.5; } | |
| .lb-table .model-cell { | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| } | |
| /* Responsive */ | |
| @media (max-width: 768px) { | |
| .header-inner { flex-direction: column; align-items: flex-start; } | |
| .content { padding: 1rem; } | |
| .cards-row { grid-template-columns: 1fr; } | |
| .lb-spotlight { grid-template-columns: 1fr; } | |
| .lb-hero h2 { font-size: 1.5rem; } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="header"> | |
| <div class="header-inner"> | |
| <div class="logo"> | |
| <div class="logo-icon">S1</div> | |
| <div> | |
| <h1>Sen-1 References <span>Vietnamese Text Classification</span></h1> | |
| </div> | |
| </div> | |
| <div class="stats"> | |
| <div><span class="stat-num" id="total-papers">12</span> papers</div> | |
| <div><span class="stat-num">10</span> PDFs</div> | |
| <div><span class="stat-num">7</span> LaTeX</div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Tab strip. Each tab's data-tab value is the suffix of a panel id in the content area | |
| (e.g. data-tab="papers" pairs with id="panel-papers"). The stylesheet shows only the | |
| panel carrying the "active" class and underlines the tab carrying "active". | |
| NOTE(review): the script that moves the "active" class on click is not visible here; | |
| confirm it looks tabs up by the .tab class and the data-tab attribute. --> | |
| <div class="tabs" id="tabs"> | |
| <div class="tab active" data-tab="papers">Papers</div> | |
| <div class="tab" data-tab="comparison">Benchmarks</div> | |
| <div class="tab" data-tab="sota">SOTA</div> | |
| <div class="tab" data-tab="network">Citation Network</div> | |
| <div class="tab" data-tab="leaderboard">Leaderboard</div> | |
| <div class="tab" data-tab="science">How to Do Science</div> | |
| </div> | |
| <!-- ===================== PAPERS ===================== --> | |
| <div class="content"> | |
| <div class="panel active" id="panel-papers"> | |
| <div class="section-title">Paper Database</div> | |
| <div class="section-desc">Research papers related to Vietnamese text classification, fetched from arXiv and ACL Anthology.</div> | |
| <!-- Category filter buttons. Each data-cat value matches a token used in the data-cats | |
| attribute of the .paper-card elements below; "all" is the catch-all entry. | |
| type="button" is explicit so these never act as submit buttons if the block is ever | |
| placed inside a form. | |
| NOTE(review): the filtering script is not visible here; confirm it keys off data-cat. --> | |
| <div class="filters"> | |
| <button type="button" class="filter-btn active" data-cat="all">All (12)</button> | |
| <button type="button" class="filter-btn" data-cat="vn-cls">Vietnamese Classification</button> | |
| <button type="button" class="filter-btn" data-cat="vn-pretrained">Vietnamese Pretrained</button> | |
| <button type="button" class="filter-btn" data-cat="multilingual">Multilingual Models</button> | |
| <button type="button" class="filter-btn" data-cat="datasets">Datasets</button> | |
| </div> | |
| <div class="papers-grid" id="papers-grid"> | |
| <!-- PhoBERT --> | |
| <div class="paper-card" data-cats="vn-pretrained"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/2003.00744" target="_blank">PhoBERT: Pre-trained language models for Vietnamese</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2020</span> | |
| <span class="tag tag-venue">EMNLP Findings</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Dat Quoc Nguyen, Anh Tuan Nguyen</span> | |
| </div> | |
| <div class="paper-abstract">We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> | |
| <code>2020.arxiv.nguyen/</code> · <code>2020.findings.anh/</code> | |
| </div> | |
| </div> | |
| <!-- ViSoBERT --> | |
| <div class="paper-card" data-cats="vn-pretrained"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/2310.11166" target="_blank">ViSoBERT: A Pre-Trained Language Model for Vietnamese Social Media Text Processing</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2023</span> | |
| <span class="tag tag-venue">EMNLP 2023</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Quoc-Nam Nguyen, Thang Chau Phan, Duc-Vu Nguyen, Kiet Van Nguyen</span> | |
| </div> | |
| <div class="paper-abstract">We present the first monolingual pre-trained language model for Vietnamese social media texts, ViSoBERT, which is pre-trained on a large-scale corpus of high-quality and diverse Vietnamese social media texts using XLM-R architecture. ViSoBERT surpasses the previous state-of-the-art models on multiple Vietnamese social media tasks with far fewer parameters.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> | |
| <code>2023.arxiv.nguyen/</code> · <code>2023.emnlp.kiet/</code> | |
| </div> | |
| </div> | |
| <!-- vELECTRA --> | |
| <div class="paper-card" data-cats="vn-pretrained"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/2006.15994" target="_blank">Improving Sequence Tagging for Vietnamese Text Using Transformer-based Neural Models</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2020</span> | |
| <span class="tag tag-venue">arXiv</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Viet Bui The, Oanh Tran Thi, Phuong Le-Hong</span> | |
| </div> | |
| <div class="paper-abstract">Introduces viBERT (trained on 10GB) and vELECTRA (trained on 60GB) Vietnamese pretrained models. Strong performance on sequence tagging and text classification tasks. vELECTRA achieves 95.26% on ViOCD complaint classification in the SMTCE benchmark.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2020.arxiv.the/</code></div> | |
| </div> | |
| <!-- SMTCE --> | |
| <div class="paper-card" data-cats="vn-cls"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/2209.10482" target="_blank">SMTCE: A Social Media Text Classification Evaluation Benchmark and BERTology Models for Vietnamese</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2022</span> | |
| <span class="tag tag-venue">PACLIC 2022</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Luan Thanh Nguyen, Kiet Van Nguyen, Ngan Luu-Thuy Nguyen</span> | |
| </div> | |
| <div class="paper-abstract">GLUE-inspired benchmark for Vietnamese social media text classification. Compares multilingual (mBERT, XLM-R, DistilmBERT) and monolingual (PhoBERT, viBERT, vELECTRA, viBERT4news) BERT models. Monolingual models consistently outperform multilingual for Vietnamese.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"> | |
| <code>2022.arxiv.nguyen/</code> · <code>2022.paclic.ngan/</code> | |
| </div> | |
| </div> | |
| <!-- Vu et al 2007 --> | |
| <div class="paper-card" data-cats="vn-cls"> | |
| <div class="paper-title"> | |
| <a href="https://ieeexplore.ieee.org/document/4223084/" target="_blank">A Comparative Study on Vietnamese Text Classification Methods</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2007</span> | |
| <span class="tag tag-venue">IEEE RIVF</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Cong Duy Vu Hoang, Dien Dinh, Le Nguyen Nguyen, Quoc Hung Ngo</span> | |
| </div> | |
| <div class="paper-abstract">Seminal paper introducing VNTC corpus and comparing BOW and N-gram language model approaches for Vietnamese text classification. N-gram LM achieves 97.1% accuracy, SVM Multi achieves 93.4% on 10-topic news classification. The VNTC dataset remains the standard benchmark.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2007.rivf.hoang/</code></div> | |
| </div> | |
| <!-- RoBERTa --> | |
| <div class="paper-card" data-cats="multilingual"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/1907.11692" target="_blank">RoBERTa: A Robustly Optimized BERT Pretraining Approach</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2019</span> | |
| <span class="tag tag-venue">arXiv</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Yinhan Liu, Myle Ott, Naman Goyal, ...</span> | |
| </div> | |
| <div class="paper-abstract">PhoBERT is based on the RoBERTa architecture. Key optimizations over BERT: dynamic masking, larger batches, more training data, removal of Next Sentence Prediction (NSP). Foundation for most Vietnamese pretrained models.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.liu/</code></div> | |
| </div> | |
| <!-- XLM-R --> | |
| <div class="paper-card" data-cats="multilingual"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/1911.02116" target="_blank">Unsupervised Cross-lingual Representation Learning at Scale (XLM-RoBERTa)</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2019</span> | |
| <span class="tag tag-venue">ACL 2020</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Alexis Conneau, Kartikay Khandelwal, Naman Goyal, ...</span> | |
| </div> | |
| <div class="paper-abstract">Multilingual pretrained model trained on 100 languages (2.5TB CC-100). Strong multilingual baseline for Vietnamese, but consistently outperformed by monolingual PhoBERT on Vietnamese-specific tasks.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.conneau/</code></div> | |
| </div> | |
| <!-- UIT-VSMEC --> | |
| <div class="paper-card" data-cats="datasets"> | |
| <div class="paper-title"> | |
| <a href="https://arxiv.org/abs/1911.09339" target="_blank">Emotion Recognition for Vietnamese Social Media Text (UIT-VSMEC)</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2019</span> | |
| <span class="tag tag-venue">CSoNet 2020</span> | |
| <span class="tag tag-pdf">PDF</span> | |
| <span class="tag tag-tex">TEX</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Vong Anh Ho, Duong Huynh-Cong Nguyen, Danh Hoang Nguyen, ...</span> | |
| </div> | |
| <div class="paper-abstract">Introduces UIT-VSMEC corpus: 6,927 emotion-annotated Vietnamese social media sentences with 7 labels (sadness, enjoyment, anger, disgust, fear, surprise, other). CNN baseline achieves 59.74% weighted F1.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2019.arxiv.ho/</code></div> | |
| </div> | |
| <!-- UIT-VSFC --> | |
| <div class="paper-card" data-cats="datasets"> | |
| <div class="paper-title"> | |
| <a href="https://ieeexplore.ieee.org/document/8573337/" target="_blank">UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis</a> | |
| </div> | |
| <div class="paper-meta"> | |
| <span class="tag tag-year">2018</span> | |
| <span class="tag tag-venue">KSE 2018</span> | |
| <span class="tag tag-md">MD</span> | |
| <span>Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, ...</span> | |
| </div> | |
| <div class="paper-abstract">16,175 Vietnamese student feedback sentences annotated for sentiment (3 classes: positive, negative, neutral) and topic classification. Inter-annotator agreement: 91.20% for sentiment. MaxEnt baseline: 88% sentiment F1.</div> | |
| <div style="font-size:0.78rem;color:var(--text2);margin-top:0.5rem"><code>2018.kse.nguyen/</code></div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===================== BENCHMARKS ===================== --> | |
| <div class="panel" id="panel-comparison"> | |
| <div class="section-title">Benchmark Comparison</div> | |
| <div class="section-desc">Vietnamese text classification results across datasets and models.</div> | |
| <h3>VNTC Dataset (10-topic News Classification)</h3> | |
| <!-- VNTC results. The header row sits in thead with scope="col" so assistive technology | |
| can associate every data cell with its column; the highlighted row is Sen-1. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Model</th><th scope="col">Year</th><th scope="col">Accuracy</th><th scope="col">F1 (weighted)</th><th scope="col">Training</th><th scope="col">Inference</th><th scope="col">Size</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>N-gram LM (Vu et al.)</td><td>2007</td><td class="highlight">97.1%</td><td>-</td><td>~79 min</td><td>-</td><td>-</td></tr> | |
| <tr><td>SVM Multi (Vu et al.)</td><td>2007</td><td>93.4%</td><td>-</td><td>~79 min</td><td>-</td><td>-</td></tr> | |
| <tr><td>sonar_core_1 (SVC)</td><td>-</td><td>92.80%</td><td>92.0%</td><td>~54.6 min</td><td>-</td><td>~75MB</td></tr> | |
| <tr style="background:var(--surface2)"><td><strong>Sen-1 (LinearSVC)</strong></td><td>2026</td><td>92.49%</td><td>92.40%</td><td class="highlight">37.6s</td><td class="highlight">66K/sec</td><td class="highlight">2.4MB</td></tr> | |
| <tr><td>PhoBERT-base*</td><td>2020</td><td>~95-97%</td><td>~95%</td><td>Hours (GPU)</td><td>~20/sec</td><td>~400MB</td></tr> | |
| </tbody> | |
| </table> | |
| <p style="font-size:0.78rem;color:var(--text2)">*PhoBERT not directly evaluated on VNTC; estimates from similar tasks.</p> | |
| <h3>UTS2017_Bank Dataset (14-category Banking)</h3> | |
| <!-- UTS2017_Bank results. The header row sits in thead with scope="col" so assistive | |
| technology can associate every data cell with its column; the highlighted row is Sen-1. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Model</th><th scope="col">Accuracy</th><th scope="col">F1 (weighted)</th><th scope="col">F1 (macro)</th><th scope="col">Training</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr style="background:var(--surface2)"><td><strong>Sen-1</strong></td><td class="highlight">75.76%</td><td class="highlight">72.70%</td><td>36.18%</td><td class="highlight">0.13s</td></tr> | |
| <tr><td>sonar_core_1</td><td>72.47%</td><td>66.0%</td><td>-</td><td>~5.3s</td></tr> | |
| </tbody> | |
| </table> | |
| <h3>Vietnamese Pretrained Models</h3> | |
| <!-- Overview of pretrained models. The header row sits in thead with scope="col" so | |
| assistive technology can associate every data cell with its column. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Model</th><th scope="col">Architecture</th><th scope="col">Pre-training Data</th><th scope="col">Languages</th><th scope="col">Vietnamese Tasks</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td><strong>PhoBERT</strong></td><td>RoBERTa</td><td>20GB Vietnamese</td><td>1 (vi)</td><td>SOTA: POS, NER, NLI</td></tr> | |
| <tr><td><strong>ViSoBERT</strong></td><td>XLM-R</td><td>Social media corpus</td><td>1 (vi)</td><td>SOTA: social media tasks</td></tr> | |
| <tr><td><strong>vELECTRA</strong></td><td>ELECTRA</td><td>60GB Vietnamese</td><td>1 (vi)</td><td>Strong on classification</td></tr> | |
| <tr><td>viBERT</td><td>BERT</td><td>10GB Vietnamese</td><td>1 (vi)</td><td>Baseline</td></tr> | |
| <tr><td>XLM-R</td><td>RoBERTa</td><td>CC-100 (2.5TB)</td><td>100</td><td>Strong multilingual</td></tr> | |
| <tr><td>mBERT</td><td>BERT</td><td>Wikipedia</td><td>104</td><td>Weakest on Vietnamese</td></tr> | |
| </tbody> | |
| </table> | |
| <h3>SMTCE Benchmark (Best model per task)</h3> | |
| <!-- SMTCE best model per task. The header row sits in thead with scope="col" so assistive | |
| technology can associate every data cell with its column. A "-" score means no number | |
| is recorded on this page for that task. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Task</th><th scope="col">Best Model</th><th scope="col">Score</th><th scope="col">Runner-up</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>UIT-VSMEC (Emotion)</td><td class="highlight">PhoBERT</td><td>65.44% F1</td><td>viBERT4news</td></tr> | |
| <tr><td>ViOCD (Complaint)</td><td class="highlight">vELECTRA</td><td>95.26% F1</td><td>PhoBERT</td></tr> | |
| <tr><td>ViHSD (Hate Speech)</td><td class="highlight">PhoBERT</td><td>-</td><td>XLM-R</td></tr> | |
| <tr><td>ViCTSD (Constructive)</td><td class="highlight">PhoBERT</td><td>-</td><td>vELECTRA</td></tr> | |
| <tr><td>UIT-VSFC (Sentiment)</td><td class="highlight">PhoBERT</td><td>-</td><td>viBERT</td></tr> | |
| </tbody> | |
| </table> | |
| <h3>Model Efficiency</h3> | |
| <!-- Model efficiency. The last column is VNTC accuracy (percent) divided by model size in MB. | |
| The header row sits in thead with scope="col" so assistive technology can associate | |
| every data cell with its column. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Model</th><th scope="col">Size</th><th scope="col">VNTC Accuracy</th><th scope="col">Efficiency (Acc/MB)</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr style="background:var(--surface2)"><td><strong>Sen-1</strong></td><td class="highlight">2.4 MB</td><td>92.49%</td><td class="highlight">38.5</td></tr> | |
| <tr><td>PhoBERT-base</td><td>~400 MB</td><td>~95%</td><td>0.24</td></tr> | |
| <tr><td>XLM-R-base</td><td>~1.1 GB</td><td>~93%</td><td>0.08</td></tr> | |
| </tbody> | |
| </table> | |
| <p style="margin-top:0.75rem;font-size:0.9rem;color:var(--accent2)"><strong>Sen-1 is ~160x more efficient</strong> in accuracy-per-MB than PhoBERT.</p> | |
| </div> | |
| <!-- ===================== SOTA ===================== --> | |
| <div class="panel" id="panel-sota"> | |
| <div class="section-title">State-of-the-Art</div> | |
| <div class="section-desc">Current SOTA for Vietnamese text classification tasks (as of 2026).</div> | |
| <!-- State-of-the-art summary per task. The header row sits in thead with scope="col" so | |
| assistive technology can associate every data cell with its column. --> | |
| <table> | |
| <thead> | |
| <tr><th scope="col">Task</th><th scope="col">Dataset</th><th scope="col">SOTA Model</th><th scope="col">Score</th><th scope="col">Paper</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>News Classification</td><td>VNTC</td><td>N-gram LM</td><td class="highlight">97.1% Acc</td><td>Vu et al. 2007</td></tr> | |
| <tr><td>Emotion Recognition</td><td>UIT-VSMEC</td><td class="highlight">ViSoBERT</td><td>SOTA F1</td><td>Nguyen et al. 2023</td></tr> | |
| <tr><td>Sentiment Analysis</td><td>UIT-VSFC</td><td class="highlight">PhoBERT</td><td>SOTA F1</td><td>SMTCE 2022</td></tr> | |
| <tr><td>Hate Speech</td><td>ViHSD</td><td class="highlight">PhoBERT/ViSoBERT</td><td>SOTA F1</td><td>SMTCE/ViSoBERT</td></tr> | |
| <tr><td>Complaint Detection</td><td>ViOCD</td><td class="highlight">vELECTRA</td><td>95.26% F1</td><td>SMTCE 2022</td></tr> | |
| <tr><td>Spam Reviews</td><td>ViSpamReviews</td><td class="highlight">ViSoBERT</td><td>SOTA F1</td><td>Nguyen et al. 2023</td></tr> | |
| </tbody> | |
| </table> | |
| <h3>Key Trends</h3> | |
| <div class="cards-row"> | |
| <div class="info-card"> | |
| <h4>Monolingual &gt; Multilingual</h4> | |
| <p>PhoBERT, ViSoBERT, vELECTRA consistently outperform XLM-R, mBERT on Vietnamese tasks.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Domain-specific Pretraining</h4> | |
| <p>ViSoBERT (social media) outperforms PhoBERT (general) on social media tasks.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Traditional ML Still Competitive</h4> | |
| <p>TF-IDF + SVM achieves 92%+ on news classification with 160x fewer resources.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Word Segmentation Matters</h4> | |
| <p>~5% accuracy gap between syllable-level (Sen-1) and word-level approaches.</p> | |
| </div> | |
| </div> | |
| <h3>Sen-1 Position</h3> | |
| <div class="network"><b> Accuracy</b> | |
| High ^ | |
| | <b>PhoBERT</b> <u>ViSoBERT</u> | |
| | * * | |
| | | |
| | <em>N-gram (2007)</em> | |
| | * | |
| | <b style="color:var(--accent4)">Sen-1</b> | |
| | * | |
| | | |
| Low | | |
| +-------------------------------> | |
| Fast Slow | |
| Inference Speed</div> | |
| <p style="margin-top:1rem;font-size:0.9rem;color:var(--text2)"> | |
| Sen-1 = <strong style="color:var(--accent4)">fast + lightweight</strong> quadrant: edge deployment, real-time batch processing, resource-constrained environments. | |
| </p> | |
| <h3>Open Questions</h3> | |
| <div class="cards-row"> | |
| <div class="info-card"> | |
| <h4>RQ1</h4> | |
| <p>Can word segmentation close the gap between Sen-1 and PhoBERT?</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>RQ2</h4> | |
| <p>How does Sen-1 perform on social media/informal text?</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>RQ3</h4> | |
| <p>Can ensemble (Sen-1 + lightweight transformer) get speed + accuracy?</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>RQ4</h4> | |
| <p>Minimum dataset size where PhoBERT outperforms TF-IDF+SVM?</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===================== CITATION NETWORK ===================== --> | |
| <div class="panel" id="panel-network"> | |
| <div class="section-title">Citation Network</div> | |
| <div class="section-desc">How the papers in this collection relate to each other and to Sen-1.</div> | |
| <div class="network"><b>Vu et al. 2007</b> (VNTC dataset) | |
| | | |
| +---> Vietnamese text classification research | |
| | | | |
| | <b>RoBERTa</b> (2019) ---> <b>PhoBERT</b> (2020) ---> <u>ViSoBERT</u> (2023) | |
| | | | | |
| | <b>XLM-R</b> (2019) ------> <em>vELECTRA</em> (2020) <em>SMTCE benchmark</em> (2022) | |
| | | | | |
| | <b style="color:var(--accent4)">Sen-2</b> (future) UIT-VSMEC, UIT-VSFC | |
| | | |
| +---> <b style="color:var(--accent4)">Sen-1</b> (TF-IDF + SVM baseline) | |
| | | |
| +---> 92.49% VNTC | 75.76% UTS2017_Bank | |
| | | |
| +---> Phase 2: word segmentation, PhoBERT comparison | |
| | | |
| +---> Phase 3: <b style="color:var(--accent4)">Sen-2</b> (PhoBERT-based)</div> | |
| <h3>Available Datasets</h3> | |
| <table> | |
| <tr><th>Dataset</th><th>Task</th><th>Samples</th><th>Classes</th><th>Domain</th><th>Source</th></tr> | |
| <tr><td><strong>VNTC</strong></td><td>Topic</td><td>84,132</td><td>10</td><td>News</td><td><a href="https://github.com/duyvuleo/VNTC">GitHub</a></td></tr> | |
| <tr><td><strong>UTS2017_Bank</strong></td><td>Intent</td><td>1,977</td><td>14</td><td>Banking</td><td>HuggingFace</td></tr> | |
| <tr><td><strong>UIT-VSMEC</strong></td><td>Emotion</td><td>6,927</td><td>7</td><td>Social media</td><td>UIT NLP</td></tr> | |
| <tr><td><strong>UIT-VSFC</strong></td><td>Sentiment</td><td>16,175</td><td>3</td><td>Education</td><td><a href="https://huggingface.co/datasets/uitnlp/vietnamese_students_feedback">HuggingFace</a></td></tr> | |
| <tr><td><strong>SMTCE</strong></td><td>Multi-task</td><td>Multiple</td><td>Various</td><td>Social media</td><td><a href="https://arxiv.org/abs/2209.10482">arXiv</a></td></tr> | |
| </table> | |
| <h3>Research Gaps</h3> | |
| <div class="cards-row"> | |
| <div class="info-card"> | |
| <h4>Gap 1</h4> | |
| <p>No comprehensive TF-IDF vs PhoBERT comparison on same Vietnamese benchmarks with controlled experiments.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Gap 2</h4> | |
| <p>Limited edge/resource-constrained deployment studies. Most work focuses on accuracy, not efficiency.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Gap 3</h4> | |
| <p>Class imbalance handling for Vietnamese datasets is under-explored.</p> | |
| </div> | |
| <div class="info-card"> | |
| <h4>Gap 4</h4> | |
| <p>Cross-domain evaluation and ablation studies for Vietnamese features are rare.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===================== LEADERBOARD ===================== --> | |
| <div class="panel" id="panel-leaderboard"> | |
| <div class="lb-hero"> | |
| <h2>Vietnamese Text Classification Leaderboard</h2> | |
| <p>Comprehensive comparison of models across Vietnamese NLP benchmarks, speed, and efficiency.</p> | |
| <div class="lb-updated">Updated: February 2026 · Inspired by <a href="https://www.vellum.ai/llm-leaderboard" target="_blank">Vellum LLM Leaderboard</a></div> | |
| </div> | |
| <!-- ===== Spotlight: Quality Benchmarks ===== --> | |
| <div class="lb-section-title">Quality Benchmarks</div> | |
| <div class="lb-section-desc">Top models per dataset, ranked by primary metric.</div> | |
| <div class="lb-spotlight"> | |
| <!-- VNTC --> | |
| <div class="lb-spot-card"> | |
| <h4>News Classification</h4> | |
| <div class="lb-bench-name">VNTC (10 topics, 84K samples)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">N-gram LM</span><span class="lb-score lb-score-top">97.1%</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT-base*</span><span class="lb-score">~95%</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">SVM Multi</span><span class="lb-score">93.4%</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">92.80%</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">5</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score">92.49%</span></li> | |
| </ol> | |
| </div> | |
| <!-- UTS2017_Bank --> | |
| <div class="lb-spot-card"> | |
| <h4>Banking Classification</h4> | |
| <div class="lb-bench-name">UTS2017_Bank (14 categories, 1.9K samples)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">75.76%</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">72.47%</span></li> | |
| </ol> | |
| </div> | |
| <!-- Emotion --> | |
| <div class="lb-spot-card"> | |
| <h4>Emotion Recognition</h4> | |
| <div class="lb-bench-name">UIT-VSMEC (7 classes, 6.9K samples)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">ViSoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT</span><span class="lb-score">65.44%</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">viBERT4news</span><span class="lb-score">-</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">CNN baseline</span><span class="lb-score">59.74%</span></li> | |
| </ol> | |
| </div> | |
| <!-- Sentiment --> | |
| <div class="lb-spot-card"> | |
| <h4>Sentiment Analysis</h4> | |
| <div class="lb-bench-name">UIT-VSFC (3 classes, 16K samples)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">PhoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">viBERT</span><span class="lb-score">-</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">MaxEnt baseline</span><span class="lb-score">88%</span></li> | |
| </ol> | |
| </div> | |
| <!-- Complaint --> | |
| <div class="lb-spot-card"> | |
| <h4>Complaint Detection</h4> | |
| <div class="lb-bench-name">ViOCD (SMTCE benchmark)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">vELECTRA</span><span class="lb-score lb-score-top">95.26%</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">PhoBERT</span><span class="lb-score">-</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">XLM-R</span><span class="lb-score">-</span></li> | |
| </ol> | |
| </div> | |
| <!-- Hate Speech --> | |
| <div class="lb-spot-card"> | |
| <h4>Hate Speech Detection</h4> | |
| <div class="lb-bench-name">ViHSD (SMTCE benchmark)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name">PhoBERT</span><span class="lb-score lb-score-top">SOTA</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">ViSoBERT</span><span class="lb-score">-</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">XLM-R</span><span class="lb-score">-</span></li> | |
| </ol> | |
| </div> | |
| </div> | |
| <!-- ===== Spotlight: Performance ===== --> | |
| <div class="lb-section-title">Performance Metrics</div> | |
| <div class="lb-section-desc">Speed, latency, and efficiency rankings.</div> | |
| <div class="lb-spotlight"> | |
| <!-- Fastest --> | |
| <div class="lb-spot-card"> | |
| <h4>Fastest Inference</h4> | |
| <div class="lb-bench-name">Batch throughput (samples/sec)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">66,678/s</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">TF-IDF + SVM (sklearn)</span><span class="lb-score">~50K/s</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT (GPU)</span><span class="lb-score">~20/s</span></li> | |
| </ol> | |
| </div> | |
| <!-- Smallest --> | |
| <div class="lb-spot-card"> | |
| <h4>Smallest Model</h4> | |
| <div class="lb-bench-name">Model file size</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">2.4 MB</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">~75 MB</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT-base</span><span class="lb-score">~400 MB</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">XLM-R-base</span><span class="lb-score">~1.1 GB</span></li> | |
| </ol> | |
| </div> | |
| <!-- Most Efficient --> | |
| <div class="lb-spot-card"> | |
| <h4>Most Efficient</h4> | |
| <div class="lb-bench-name">Accuracy per MB (VNTC)</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1</span><span class="lb-score lb-score-top">38.5</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">1.24</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">PhoBERT-base</span><span class="lb-score">0.24</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">XLM-R-base</span><span class="lb-score">0.08</span></li> | |
| </ol> | |
| </div> | |
| <!-- Fastest Training --> | |
| <div class="lb-spot-card"> | |
| <h4>Fastest Training</h4> | |
| <div class="lb-bench-name">VNTC full training time</div> | |
| <ol class="lb-rank-list"> | |
| <li><span class="lb-rank-num lb-rank-1">1</span><span class="lb-model-name lb-sen1">Sen-1 (Rust)</span><span class="lb-score lb-score-top">37.6s</span></li> | |
| <li><span class="lb-rank-num lb-rank-2">2</span><span class="lb-model-name">TF-IDF+SVM (sklearn)</span><span class="lb-score">~2 min</span></li> | |
| <li><span class="lb-rank-num lb-rank-3">3</span><span class="lb-model-name">sonar_core_1</span><span class="lb-score">54.6 min</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">4</span><span class="lb-model-name">N-gram LM</span><span class="lb-score">~79 min</span></li> | |
| <li><span class="lb-rank-num lb-rank-other">5</span><span class="lb-model-name">PhoBERT fine-tune</span><span class="lb-score">Hours</span></li> | |
| </ol> | |
| </div> | |
| </div> | |
| <!-- ===== Comprehensive Table ===== --> | |
| <div class="lb-section-title">Comprehensive Comparison</div> | |
| <div class="lb-section-desc">All models with operational and benchmark metrics. Click column headers to sort.</div> | |
| <div style="display:flex;gap:0.5rem;margin-bottom:1rem;flex-wrap:wrap"> | |
| <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-traditional" style="width:10px;height:10px;display:inline-block"></span> Traditional ML</span> | |
| <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-transformer" style="width:10px;height:10px;display:inline-block"></span> Vietnamese Transformer</span> | |
| <span style="display:inline-flex;align-items:center;gap:0.35rem;font-size:0.78rem;color:var(--text2)"><span class="model-type type-multilingual" style="width:10px;height:10px;display:inline-block"></span> Multilingual</span> | |
| </div> | |
| <div class="lb-table-wrap"> | |
| <table class="lb-table" id="lb-main-table"> | |
| <thead> | |
| <tr> | |
| <th data-col="0" data-type="num">#</th> | |
| <th data-col="1" data-type="str">Model</th> | |
| <th data-col="2" data-type="str">Type</th> | |
| <th data-col="3" data-type="str">Architecture</th> | |
| <th data-col="4" data-type="num">Size</th> | |
| <th data-col="5" data-type="num">VNTC<br>Acc %</th> | |
| <th data-col="6" data-type="num">UTS2017<br>Acc %</th> | |
| <th data-col="7" data-type="num">UIT-VSMEC<br>F1 %</th> | |
| <th data-col="8" data-type="num">ViOCD<br>F1 %</th> | |
| <th data-col="9" data-type="num">Training</th> | |
| <th data-col="10" data-type="num">Inference<br>/sec</th> | |
| <th data-col="11" data-type="num">Eff.<br>Acc/MB</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>1</td><td><div class="model-cell"><span class="model-type type-traditional"></span>N-gram LM</div></td><td>Traditional</td><td>N-gram Language Model</td><td class="cell-na">n/a</td><td class="cell-best">97.1</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>~79 min</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>2</td><td><div class="model-cell"><span class="model-type type-transformer"></span>PhoBERT-base</div></td><td>Transformer</td><td>RoBERTa (20GB vi)</td><td>~400 MB</td><td>~95</td><td class="cell-na">n/a</td><td>65.44</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td>~20</td><td>0.24</td></tr> | |
| <tr><td>3</td><td><div class="model-cell"><span class="model-type type-transformer"></span>ViSoBERT</div></td><td>Transformer</td><td>XLM-R (social media)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">SOTA</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>4</td><td><div class="model-cell"><span class="model-type type-transformer"></span>vELECTRA</div></td><td>Transformer</td><td>ELECTRA (60GB vi)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">95.26</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>5</td><td><div class="model-cell"><span class="model-type type-traditional"></span>SVM Multi</div></td><td>Traditional</td><td>SVM + BOW</td><td class="cell-na">n/a</td><td>93.4</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>~79 min</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>6</td><td><div class="model-cell"><span class="model-type type-traditional"></span>sonar_core_1</div></td><td>Traditional</td><td>TF-IDF + SVC (RBF)</td><td>~75 MB</td><td>92.80</td><td>72.47</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>54.6 min</td><td class="cell-na">n/a</td><td>1.24</td></tr> | |
| <tr class="lb-row-sen1"><td>7</td><td><div class="model-cell"><span class="model-type type-traditional"></span><strong style="color:var(--accent4)">Sen-1</strong></div></td><td>Traditional</td><td>TF-IDF + LinearSVC (Rust)</td><td class="cell-best">2.4 MB</td><td>92.49</td><td class="cell-best">75.76</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-best">37.6s</td><td class="cell-best">66,678</td><td class="cell-best">38.5</td></tr> | |
| <tr><td>8</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>XLM-R-base</div></td><td>Multilingual</td><td>RoBERTa (100 langs)</td><td>~1.1 GB</td><td>~93</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td>0.08</td></tr> | |
| <tr><td>9</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>mBERT</div></td><td>Multilingual</td><td>BERT (104 langs)</td><td>~700 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>10</td><td><div class="model-cell"><span class="model-type type-transformer"></span>viBERT</div></td><td>Transformer</td><td>BERT (10GB vi)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>11</td><td><div class="model-cell"><span class="model-type type-transformer"></span>viBERT4news</div></td><td>Transformer</td><td>BERT (news domain)</td><td>~400 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>12</td><td><div class="model-cell"><span class="model-type type-traditional"></span>MaxEnt baseline</div></td><td>Traditional</td><td>Maximum Entropy</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>13</td><td><div class="model-cell"><span class="model-type type-traditional"></span>CNN baseline</div></td><td>Traditional</td><td>Convolutional NN</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>59.74</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| <tr><td>14</td><td><div class="model-cell"><span class="model-type type-multilingual"></span>DistilmBERT</div></td><td>Multilingual</td><td>DistilBERT (multilingual)</td><td>~260 MB</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td><td>Hours (GPU)</td><td class="cell-na">n/a</td><td class="cell-na">n/a</td></tr> | |
| </tbody> | |
| </table> | |
| </div> | |
| <p style="font-size:0.75rem;color:var(--text2);margin-top:0.5rem"> | |
| * PhoBERT VNTC estimate based on similar Vietnamese classification tasks. "n/a" indicates the benchmark was not evaluated or the value is not available. | |
| <br>Efficiency = VNTC Accuracy / Model Size in MB. Higher is better. | |
| </p> | |
| </div> | |
| <!-- ===================== HOW TO DO SCIENCE ===================== --> | |
| <div class="panel" id="panel-science"> | |
| <div class="section-title">How to Do Science</div> | |
| <div class="section-desc">Research methodology guide compiled from Hamming, Schulman, Marek Rei, and Microsoft Research Asia.</div> | |
| <h3>1. Choosing Important Problems</h3> | |
| <blockquote>"If you do not work on an important problem, it's unlikely you'll do important work." <strong>-- Richard Hamming</strong></blockquote> | |
| <div class="principle"> | |
| <h4>Hamming's Principles</h4> | |
| <ul> | |
| <li>Maintain a list of <strong>10-20 important problems</strong> in your field</li> | |
| <li>A problem becomes important when you have a <strong>reasonable attack</strong></li> | |
| <li>Dedicate deep thinking time (Friday "Great Thoughts Time")</li> | |
| <li>Have <strong>courage</strong> to pursue unconventional ideas</li> | |
| </ul> | |
| </div> | |
| <div class="principle"> | |
| <h4>Schulman's Framework (OpenAI)</h4> | |
| <ul> | |
| <li><strong>Work on the right problems</strong></li> | |
| <li><strong>Make continual progress</strong></li> | |
| <li><strong>Achieve continual personal growth</strong></li> | |
| </ul> | |
| <p style="margin-top:0.5rem">Develop "research taste" by reading broadly, collaborating widely, and asking: "If this succeeds, how big is the impact?"</p> | |
| </div> | |
| <div class="principle"> | |
| <h4>Microsoft Research Asia (Dr. Ming Zhou)</h4> | |
| <ul> | |
| <li>Read recent <strong>ACL proceedings</strong> to find your field</li> | |
| <li>Target <strong>"blue ocean"</strong> areas - new fields with less competition</li> | |
| <li>Verify 3 prerequisites: math/ML framework, standard datasets, active research teams</li> | |
| <li>Find <strong>gaps</strong>: what can be improved, combined, or inverted</li> | |
| </ul> | |
| </div> | |
| <h3>2. Reading Papers</h3> | |
| <div class="principle"> | |
| <h4>Effective Reading Strategy</h4> | |
| <ul> | |
| <li><strong>Read broadly</strong>: Not just NLP - also cognitive science, neuroscience, linguistics, vision</li> | |
| <li><strong>Read deeply</strong>: Become the "world-leading expert" on your narrow question</li> | |
| <li><strong>Read textbooks</strong>: More knowledge-dense than papers</li> | |
| <li><strong>Follow citation chains</strong> via Google Scholar, Semantic Scholar</li> | |
| <li>Use <strong>PRISMA</strong> methodology for systematic reviews</li> | |
| </ul> | |
| </div> | |
| <h3>3. Running Experiments</h3> | |
| <div class="principle"> | |
| <h4>Step 1: Reproduce baselines first</h4> | |
| <p>"Reimplement existing state-of-the-art work first to validate your setup." -- Marek Rei</p> | |
| <ul> | |
| <li>Choose open source project, compile, run demo, <strong>match results</strong></li> | |
| <li>Understand the algorithm deeply, then reimplement</li> | |
| <li>Test on standard test set until results align</li> | |
| </ul> | |
| </div> | |
| <div class="principle"> | |
| <h4>Step 2: Simple baseline (1-2 weeks)</h4> | |
| <p>Implement the simplest approach before building complex architectures. Verify your setup works.</p> | |
| </div> | |
| <div class="principle"> | |
| <h4>Step 3: Rigorous experimentation</h4> | |
| <ul> | |
| <li><strong>Debug</strong>: Don't assume bug-free code. Test with toy examples. Add assertions.</li> | |
| <li><strong>Evaluate</strong>: Separate train/dev/test. Run 10+ times. Report mean + std.</li> | |
| <li><strong>Ablate</strong>: Significance tests and ablation studies for every novel component.</li> | |
| <li><strong>Avoid</strong>: Single-run results, weak-only baselines, blind trend-following.</li> | |
| </ul> | |
| </div> | |
| <h3>4. Writing Papers</h3> | |
| <div class="principle"> | |
| <h4>ACL Paper Structure (Dr. Ming Zhou)</h4> | |
| <ul> | |
| <li><strong>Title</strong>: Specific, no generic words</li> | |
| <li><strong>Abstract</strong>: Problem + Method + Advantage + Achievement</li> | |
| <li><strong>Introduction</strong>: Background → existing → limitations → contribution (≤3 points)</li> | |
| <li><strong>Related Work</strong>: Organized by <strong>theme</strong>, not chronology</li> | |
| <li><strong>Methodology</strong>: Problem definition → notation → formulas</li> | |
| <li><strong>Experiments</strong>: Purpose → data → parameters → reproducibility</li> | |
| <li><strong>Limitations</strong>: Required by ACL - honest assessment</li> | |
| </ul> | |
| <p style="margin-top:0.5rem"><strong>Revision</strong>: 3 passes - self review → team review → outsider review.</p> | |
| </div> | |
| <h3>5. Mindset &amp; Habits</h3> | |
| <table> | |
| <tr><th>Principle</th><th>Lesson</th></tr> | |
| <tr><td><strong>Open doors</strong></td><td>Stay connected to the community; know emerging problems</td></tr> | |
| <tr><td><strong>Preparation</strong></td><td>"Luck favors the prepared mind" (Pasteur)</td></tr> | |
| <tr><td><strong>Constraints</strong></td><td>Difficult conditions often lead to breakthroughs</td></tr> | |
| <tr><td><strong>Commitment</strong></td><td>Deep immersion activates subconscious problem-solving</td></tr> | |
| <tr><td><strong>Selling work</strong></td><td>Presentation matters - great work needs effective communication</td></tr> | |
| </table> | |
| <h3>Essential Reading</h3> | |
| <div class="papers-grid" style="margin-top:0.75rem"> | |
| <div class="paper-card"> | |
| <div class="paper-title"><a href="https://www.cs.virginia.edu/~robins/YouAndYourResearch.html" target="_blank">You and Your Research</a></div> | |
| <div class="paper-meta"><span>Richard Hamming</span> · <span>Choosing important problems, mindset</span></div> | |
| </div> | |
| <div class="paper-card"> | |
| <div class="paper-title"><a href="http://joschu.net/blog/opinionated-guide-ml-research.html" target="_blank">An Opinionated Guide to ML Research</a></div> | |
| <div class="paper-meta"><span>John Schulman (OpenAI)</span> · <span>Problem selection, progress, growth</span></div> | |
| </div> | |
| <div class="paper-card"> | |
| <div class="paper-title"><a href="https://www.marekrei.com/blog/ml-nlp-research-project-advice/" target="_blank">ML/NLP Research Project Advice</a></div> | |
| <div class="paper-meta"><span>Marek Rei</span> · <span>Practical experiment workflow</span></div> | |
| </div> | |
| <div class="paper-card"> | |
| <div class="paper-title"><a href="https://microsoft.com/en-us/research/lab/microsoft-research-asia/articles/make-first-accomplishment-nlp-field/" target="_blank">How to Make First Accomplishment in NLP</a></div> | |
| <div class="paper-meta"><span>Dr. Ming Zhou (MSRA)</span> · <span>NLP research methodology</span></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div><!-- /content --> | |
| <script> | |
// Tab switching: clicking a .tab marks it active and activates the element
// whose id is "panel-" + the tab's data-tab value; every other .tab and
// .panel loses the "active" class.
document.querySelectorAll('.tab').forEach(tab => {
  tab.addEventListener('click', () => {
    // Resolve the target before touching any classes: a tab whose data-tab is
    // missing or does not match a panel id must not deactivate everything and
    // then throw on a null lookup.
    const panel = document.getElementById('panel-' + tab.dataset.tab);
    if (!panel) return;
    document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
    document.querySelectorAll('.panel').forEach(p => p.classList.remove('active'));
    tab.classList.add('active');
    panel.classList.add('active');
  });
});
// Category filters: clicking a .filter-btn makes it the only active button and
// shows just the .paper-card[data-cats] elements whose data-cats value equals
// the button's data-cat. The special value 'all' shows every card.
document.querySelectorAll('.filter-btn').forEach(button => {
  button.addEventListener('click', function onFilterClick() {
    for (const other of document.querySelectorAll('.filter-btn')) {
      other.classList.remove('active');
    }
    button.classList.add('active');

    const selected = button.dataset.cat;
    for (const card of document.querySelectorAll('.paper-card[data-cats]')) {
      const visible = selected === 'all' || card.dataset.cats === selected;
      // An empty inline display falls back to the stylesheet's value.
      card.style.display = visible ? '' : 'none';
    }
  });
});
// Abstract expand/collapse: every .paper-abstract gets a pointer cursor and
// flips its 'expanded' class each time it is clicked.
for (const abstractEl of document.querySelectorAll('.paper-abstract')) {
  abstractEl.style.cursor = 'pointer';
  abstractEl.addEventListener('click', function toggleExpanded() {
    abstractEl.classList.toggle('expanded');
  });
}
| // Leaderboard table sorting | |
| (function() { | |
| const table = document.getElementById('lb-main-table'); | |
| if (!table) return; | |
| const thead = table.querySelector('thead'); | |
| const tbody = table.querySelector('tbody'); | |
| let sortCol = -1, sortAsc = true; | |
/**
 * Convert a leaderboard cell into a number usable as a sort key.
 *
 * Recognised forms (after trimming):
 *   - 'n/a', '-', 'SOTA'        -> -Infinity (no comparable value)
 *   - '<n> GB'                  -> n * 1000  (same scale as MB values)
 *   - '<n> MB'                  -> n
 *   - '<n> min'                 -> n * 60
 *   - text containing 'Hours'   -> 36000 (fixed placeholder; ten hours if read as seconds)
 *   - '<n>s' (but not '.../s')  -> n
 *   - anything else             -> the digits, '.' and '-' it contains, parsed as a float
 * A leading '~' and thousands separators (',') are ignored in every form.
 *
 * @param {{textContent: string}} td  Table cell (or any object exposing textContent).
 * @returns {number} Finite sort key, or -Infinity when no number can be read.
 *                   Never returns NaN.
 */
function parseVal(td) {
  const raw = td.textContent.trim();
  if (raw === 'n/a' || raw === '-' || raw === 'SOTA') return -Infinity;

  // Strip the approximation marker and thousands separators up front so that
  // values such as "~79 min" or "66,678" parse; parseFloat stops at '~' and ','.
  const cleaned = raw.replace(/[~,]/g, '');

  let num;
  if (raw.includes('GB')) {
    num = parseFloat(cleaned) * 1000;
  } else if (raw.includes('MB')) {
    num = parseFloat(cleaned);
  } else if (raw.includes('min')) {
    num = parseFloat(cleaned) * 60;
  } else if (raw.includes('Hours')) {
    num = 36000;
  } else if (raw.endsWith('s') && !raw.includes('/s')) {
    num = parseFloat(cleaned);
  } else {
    // Plain numbers and rate-style values: keep only numeric characters.
    num = parseFloat(cleaned.replace(/[^0-9.\-]/g, ''));
  }

  // Unparseable text falls back to the same sentinel as 'n/a'.
  return isNaN(num) ? -Infinity : num;
}
// Make every header cell sort the table body by its own column.
// Clicking the current sort column flips the direction; clicking a different
// column starts in descending order.
thead.querySelectorAll('th').forEach((th, i) => {
  th.addEventListener('click', () => {
    if (sortCol === i) { sortAsc = !sortAsc; }
    else { sortCol = i; sortAsc = false; }

    const direction = sortAsc ? 1 : -1;
    const isText = th.dataset.type === 'str';

    // Compute each row's key once rather than on every comparison; string
    // columns compare lower-cased text, all others go through parseVal.
    const keyed = Array.from(tbody.querySelectorAll('tr')).map(row => ({
      row,
      key: isText
        ? row.children[i].textContent.trim().toLowerCase()
        : parseVal(row.children[i]),
    }));

    keyed.sort((a, b) => {
      if (isText) return direction * a.key.localeCompare(b.key);
      // Non-finite keys (the -Infinity "no value" sentinel, or NaN) are kept
      // out of the subtraction: Infinity - Infinity is NaN, which would make
      // the comparator inconsistent. They sort after real values in both
      // directions and keep their relative order among themselves.
      const aHasValue = Number.isFinite(a.key);
      const bHasValue = Number.isFinite(b.key);
      if (aHasValue !== bHasValue) return aHasValue ? -1 : 1;
      if (!aHasValue) return 0;
      return direction * (a.key - b.key);
    });

    // Re-number the first column and move the rows into their new order.
    keyed.forEach(({ row }, idx) => {
      row.children[0].textContent = idx + 1;
      tbody.appendChild(row);
    });

    // Show the direction arrow on the active header only, and mirror the
    // state in aria-sort for assistive technology.
    thead.querySelectorAll('.sort-arrow').forEach(a => a.remove());
    thead.querySelectorAll('th').forEach(h => h.removeAttribute('aria-sort'));
    th.setAttribute('aria-sort', sortAsc ? 'ascending' : 'descending');
    const arrow = document.createElement('span');
    arrow.className = 'sort-arrow';
    arrow.setAttribute('aria-hidden', 'true');
    arrow.textContent = sortAsc ? '\u25B2' : '\u25BC';
    th.appendChild(arrow);
  });
});
| })(); | |
| </script> | |
| </body> | |
| </html> | |