KalanaPabasara committed
Commit f6f45d5 · 0 Parent(s)

SinCode v3 — ByT5 seq2seq + XLM-RoBERTa MLM reranker
.gitignore ADDED
@@ -0,0 +1,28 @@
+ # Virtual environment
+ .venv/
+
+ # Model weights — hosted on Hugging Face Hub
+ seq2seq/byt5-singlish-sinhala/
+ seq2seq/tokenized_cache/
+
+ # Large training data
+ seq2seq/wsd_pairs.csv
+
+ # Backup weights
+ *.safetensors.bak
+ *_backup.safetensors
+ final_pre_correction_backup.safetensors
+
+ # Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+
+ # Evaluation outputs (optional — remove if you want these tracked)
+ misc/v3_results_110.csv
+
+ # Misc
+ *.log
+ .DS_Store
app.py ADDED
@@ -0,0 +1,75 @@
+ """
+ SinCode v3 — Streamlit demo UI.
+
+ Architecture: ByT5-small (seq2seq candidate generation) +
+ XLM-RoBERTa (MLM contextual reranking)
+
+ Two transliteration modes:
+   • Code-Mixed — ByT5 + MLM; retains English words where contextually apt
+   • Full Sinhala — mBart50 sentence-level; transliterates everything to Sinhala
+ """
+
+ import streamlit as st
+ from sincode_model import BeamSearchDecoder, SentenceTransliterator
+
+ st.set_page_config(page_title="සිංCode v3", page_icon="🇱🇰", layout="centered")
+
+ st.title("සිංCode v3")
+ st.caption("ByT5 seq2seq + XLM-RoBERTa MLM reranking")
+
+
+ @st.cache_resource(show_spinner="Loading models (ByT5 + XLM-RoBERTa)…")
+ def load_decoder() -> BeamSearchDecoder:
+     return BeamSearchDecoder()
+
+
+ @st.cache_resource(show_spinner="Loading mBart50 model…")
+ def load_transliterator() -> SentenceTransliterator:
+     return SentenceTransliterator()
+
+
+ mode = st.radio(
+     "Transliteration mode",
+     options=["Code-Mixed Output", "Full Sinhala Output"],
+     horizontal=True,
+     help=(
+         "**Code-Mixed**: keeps English technical/borrowed words where natural "
+         "(e.g. *buffer*, *bit rate*). "
+         "**Full Sinhala**: transliterates every word to Sinhala script "
+         "(e.g. *business* → ව්‍යාපාරය)."
+     ),
+ )
+
+ sentence = st.text_input(
+     "Enter Singlish sentence",
+     placeholder="e.g. mema videowe bit rate eka godak wadi nisa buffer wenawa",
+ )
+
+ show_trace = st.checkbox(
+     "Show step-by-step trace",
+     value=False,
+     disabled=(mode == "Full Sinhala Output"),
+     help="Trace is only available in Code-Mixed mode.",
+ )
+
+ if st.button("Transliterate", type="primary") and sentence.strip():
+     if mode == "Full Sinhala Output":
+         with st.spinner("Transliterating (mBart50)…"):
+             transliterator = load_transliterator()
+             result = transliterator.transliterate(sentence.strip())
+
+         st.markdown("### Result")
+         st.success(result)
+
+     else:
+         with st.spinner("Transliterating…"):
+             decoder = load_decoder()
+             result, trace_logs = decoder.decode(sentence.strip())
+
+         st.markdown("### Result")
+         st.success(result)
+
+         if show_trace:
+             st.markdown("### Trace")
+             for log in trace_logs:
+                 st.markdown(log)
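The Streamlit layer is a thin wrapper over two entry points. A minimal programmatic sketch of the same calls, assuming the repo root is the working directory so that `sincode_model` (imported by app.py but not shown in this commit) resolves, and that the Hub checkpoints can be downloaded:

```python
# Sketch only: mirrors what app.py does, without Streamlit.
from sincode_model import BeamSearchDecoder, SentenceTransliterator

decoder = BeamSearchDecoder()                 # Code-Mixed: ByT5 + MLM reranker
result, trace = decoder.decode("results blnna one")
print(result)       # reference in dataset_110.csv: "results බලන්න ඕනෙ"
for step in trace:
    print(step)     # per-word candidate scores, as in the UI trace

full = SentenceTransliterator()               # Full Sinhala: mBart50
print(full.transliterate("api heta yamu"))
```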
architecture.html ADDED
@@ -0,0 +1,111 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <title>SinCode v3 — Architecture</title>
+   <script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
+   <style>
+     body {
+       font-family: sans-serif;
+       background: #f8f9fa;
+       display: flex;
+       flex-direction: column;
+       align-items: center;
+       padding: 2rem;
+     }
+     h1 { color: #2c3e50; margin-bottom: 0.25rem; }
+     p { color: #666; margin-top: 0; margin-bottom: 2rem; }
+     .mermaid {
+       background: white;
+       border-radius: 12px;
+       padding: 2rem;
+       box-shadow: 0 2px 12px rgba(0,0,0,0.08);
+       max-width: 1200px;
+       width: 100%;
+     }
+   </style>
+ </head>
+ <body>
+   <h1>SinCode v3 — System Architecture</h1>
+   <p>ByT5-small · XLM-RoBERTa · mBart50-large</p>
+
+   <div class="mermaid">
+     flowchart TD
+       UI["🖥️ Streamlit UI\napp.py"]
+       MODE{Mode?}
+
+       UI --> MODE
+
+       subgraph MODE_FULL["Full Sinhala Mode"]
+         direction TB
+         ST["SentenceTransliterator\nseq2seq/mbart_infer.py"]
+         MBART["mBart50-large\nKalana001/mbart50-large-singlish-sinhala\nHF Hub · 2.4 GB"]
+         FIX["Compose Fix Map\nseq2seq/compose_fix_map.json\nZWJ / Virama corrections"]
+         ST --> MBART
+         MBART -->|"raw Sinhala output"| FIX
+       end
+
+       subgraph MODE_MIXED["Code-Mixed Mode"]
+         direction TB
+
+         subgraph PHASE1["Phase 1 · Word Classification"]
+           direction LR
+           P1A["Sinhala script?\n(U+0D80–0DFF)"]
+           P1B["English vocab?\nenglish_20k.txt"]
+           P1C["Singlish\n(everything else)"]
+         end
+
+         subgraph PHASE2["Phase 2 · Candidate Generation (single ByT5 batch)"]
+           direction LR
+           BYT5["ByT5-small\nKalana001/byt5-small-singlish-sinhala\nHF Hub · 1.2 GB\nbeam=5 → top-5 candidates"]
+           SIN_PASS["Single candidate\n(word as-is)"]
+           ENG_CAND["English word\n+ ByT5 Sinhala alternatives"]
+           SIN_CAND["Top-5 ByT5\ncandidates"]
+         end
+
+         subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
+           direction LR
+           GREEDY["Pass 1 – Greedy\nBuild draft sentence\n(stale right context)"]
+           RESCORE["Pass 2 – Rescore\nActual decoded output\nas right context"]
+           MLM["XLM-RoBERTa\nKalana001/xlm-roberta-base-finetuned-sinhala\nHF Hub\nMulti-mask log-probability"]
+           SOFTMAX["Softmax normalise\npick argmax"]
+         end
+
+         PHASE1 --> PHASE2
+         P1A -->|Sinhala| SIN_PASS
+         P1B -->|English| ENG_CAND
+         P1C -->|Singlish| SIN_CAND
+         BYT5 --> ENG_CAND
+         BYT5 --> SIN_CAND
+         PHASE2 --> PHASE3
+         GREEDY --> MLM
+         MLM --> SOFTMAX
+         SOFTMAX --> RESCORE
+         RESCORE --> MLM
+       end
+
+       MODE -->|"Full Sinhala Output"| MODE_FULL
+       MODE -->|"Code-Mixed Output"| MODE_MIXED
+
+       MODE_FULL --> OUT["✅ Sinhala Output"]
+       MODE_MIXED --> OUT
+
+       subgraph MODELS["Models on Hugging Face Hub (Kalana001)"]
+         HF1["byt5-small-singlish-sinhala\n1.2 GB · ByT5-small"]
+         HF2["xlm-roberta-base-finetuned-sinhala\nXLM-RoBERTa"]
+         HF3["mbart50-large-singlish-sinhala\n2.4 GB · mBart50-large"]
+       end
+
+       style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
+       style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
+       style PHASE1 fill:#fff9e6,stroke:#cca800
+       style PHASE2 fill:#e8fff0,stroke:#2ecc71
+       style PHASE3 fill:#f4e8ff,stroke:#9b59b6
+       style MODELS fill:#eaf4ee,stroke:#27ae60
+   </div>
+
+   <script>
+     mermaid.initialize({ startOnLoad: true, theme: 'default', flowchart: { curve: 'basis' } });
+   </script>
+ </body>
+ </html>
architecture.mmd ADDED
@@ -0,0 +1,72 @@
+ flowchart TD
+     UI["🖥️ Streamlit UI\napp.py"]
+     MODE{Mode?}
+
+     UI --> MODE
+
+     subgraph MODE_FULL["Full Sinhala Mode"]
+       direction TB
+       ST["SentenceTransliterator\nseq2seq/mbart_infer.py"]
+       MBART["mBart50-large\nKalana001/mbart50-large-singlish-sinhala\nHF Hub · 2.4 GB"]
+       FIX["Compose Fix Map\nseq2seq/compose_fix_map.json\nZWJ / Virama corrections"]
+       ST --> MBART
+       MBART -->|"raw Sinhala output"| FIX
+     end
+
+     subgraph MODE_MIXED["Code-Mixed Mode"]
+       direction TB
+
+       subgraph PHASE1["Phase 1 · Word Classification"]
+         direction LR
+         P1A["Sinhala script?\n(U+0D80–0DFF)"]
+         P1B["English vocab?\nenglish_20k.txt"]
+         P1C["Singlish\n(everything else)"]
+       end
+
+       subgraph PHASE2["Phase 2 · Candidate Generation (single ByT5 batch)"]
+         direction LR
+         BYT5["ByT5-small\nKalana001/byt5-small-singlish-sinhala\nHF Hub · 1.2 GB\nbeam=5 → top-5 candidates"]
+         SIN_PASS["Single candidate\n(word as-is)"]
+         ENG_CAND["English word\n+ ByT5 Sinhala alternatives"]
+         SIN_CAND["Top-5 ByT5\ncandidates"]
+       end
+
+       subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
+         direction LR
+         GREEDY["Pass 1 – Greedy\nBuild draft sentence\n(stale right context)"]
+         RESCORE["Pass 2 – Rescore\nActual decoded output\nas right context"]
+         MLM["XLM-RoBERTa\nKalana001/xlm-roberta-base-finetuned-sinhala\nHF Hub\nMulti-mask log-probability"]
+         SOFTMAX["Softmax normalise\npick argmax"]
+       end
+
+       PHASE1 --> PHASE2
+       P1A -->|Sinhala| SIN_PASS
+       P1B -->|English| ENG_CAND
+       P1C -->|Singlish| SIN_CAND
+       BYT5 --> ENG_CAND
+       BYT5 --> SIN_CAND
+       PHASE2 --> PHASE3
+       GREEDY --> MLM
+       MLM --> SOFTMAX
+       SOFTMAX --> RESCORE
+       RESCORE --> MLM
+     end
+
+     MODE -->|"Full Sinhala Output"| MODE_FULL
+     MODE -->|"Code-Mixed Output"| MODE_MIXED
+
+     MODE_FULL --> OUT["✅ Sinhala Output"]
+     MODE_MIXED --> OUT
+
+     subgraph MODELS["Models on Hugging Face Hub (Kalana001)"]
+       HF1["byt5-small-singlish-sinhala\n1.2 GB · ByT5-small"]
+       HF2["xlm-roberta-base-finetuned-sinhala\nXLM-RoBERTa"]
+       HF3["mbart50-large-singlish-sinhala\n2.4 GB · mBart50-large"]
+     end
+
+     style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
+     style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
+     style PHASE1 fill:#fff9e6,stroke:#cca800
+     style PHASE2 fill:#e8fff0,stroke:#2ecc71
+     style PHASE3 fill:#f4e8ff,stroke:#9b59b6
+     style MODELS fill:#eaf4ee,stroke:#27ae60
core/__init__.py ADDED
File without changes
core/constants.py ADDED
@@ -0,0 +1,35 @@
+ """
+ Configuration constants for SinCode v3.
+
+ Key difference from v2: no rule engine, no dictionary.
+ Candidate generation is fully handled by the ByT5 seq2seq model.
+ """
+
+ import re
+
+ # ─── MLM Model Path ──────────────────────────────────────────────────────────
+ # XLM-RoBERTa fine-tuned on Sinhala — reranks ByT5 candidates by context
+ DEFAULT_MLM_MODEL = "Kalana001/xlm-roberta-base-finetuned-sinhala"
+
+ # ─── ByT5 Transliterator Model Path ──────────────────────────────────────────
+ # Fine-tuned on 1M Singlish→Sinhala pairs — hosted on Hugging Face Hub
+ DEFAULT_BYT5_MODEL = "Kalana001/byt5-small-singlish-sinhala"
+
+ # ─── mBart50 Transliterator Model Path ───────────────────────────────────────
+ # Full-sentence Singlish→Sinhala (no English retained) — Hugging Face Hub
+ DEFAULT_MBART_MODEL = "Kalana001/mbart50-large-singlish-sinhala"
+
+ # ─── Corpus ───────────────────────────────────────────────────────────────────
+ ENGLISH_CORPUS_URL = (
+     "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
+ )
+
+ # ─── Scoring Weights ─────────────────────────────────────────────────────────
+ # Pure MLM — no manual weights needed
+
+ # ─── Decoding Parameters ─────────────────────────────────────────────────────
+ MAX_CANDIDATES: int = 5   # ByT5 beam=5 → 5 candidates per word
+ MIN_ENGLISH_LEN: int = 3  # Min word length for English detection
+
+ # ─── Regex ───────────────────────────────────────────────────────────────────
+ PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
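For intuition, `PUNCT_PATTERN` splits a raw token into leading punctuation, core, and trailing punctuation, so that only the core is transliterated (a toy check, not part of the module):

```python
import re

PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")

# The lazy middle group leaves surrounding non-word characters to the
# outer groups, so they can be re-attached around the ByT5 output.
prefix, core, suffix = PUNCT_PATTERN.match('"buffer!"').groups()
print(repr(prefix), repr(core), repr(suffix))   # '"' 'buffer' '!"'
```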
core/decoder.py ADDED
@@ -0,0 +1,248 @@
+ """
+ SinCode v3 — ByT5 Seq2Seq + XLM-RoBERTa MLM Reranker.
+
+ Pipeline (per word):
+   Sinhala script   → MLM scores in context (single candidate)
+   English vocab    → ByT5 generates Sinhala alternatives + English kept; MLM picks
+   Everything else  → ByT5 generates top-5 candidates; MLM picks best
+ """
+
+ import math
+ import re
+ import torch
+ import logging
+ from typing import List, Tuple, Optional
+
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ from core.constants import (
+     DEFAULT_MLM_MODEL, DEFAULT_BYT5_MODEL,
+     MAX_CANDIDATES, MIN_ENGLISH_LEN,
+     PUNCT_PATTERN,
+ )
+ from core.english import ENGLISH_VOCAB
+ from seq2seq.infer import Transliterator
+
+ logger = logging.getLogger(__name__)
+
+ _SINHALA_RE = re.compile(r"[\u0D80-\u0DFF]")
+
+
+ class ScoredCandidate:
+     __slots__ = ("text", "mlm_score")
+
+     def __init__(self, text: str, mlm_score: float):
+         self.text = text
+         self.mlm_score = mlm_score
+
+
+ def _is_sinhala(text: str) -> bool:
+     return bool(_SINHALA_RE.search(text))
+
+
+ class BeamSearchDecoder:
+     """
+     SinCode v3 contextual decoder.
+
+     Replaces the rule engine + dictionary + hardcoded maps with a single
+     ByT5-small seq2seq model fine-tuned on 1,000,000 Singlish→Sinhala pairs.
+     XLM-RoBERTa reranks the top-5 beam candidates by masked-LM probability.
+     """
+
+     def __init__(
+         self,
+         mlm_model_name: str = DEFAULT_MLM_MODEL,
+         byt5_model_path: str = DEFAULT_BYT5_MODEL,
+         device: Optional[str] = None,
+     ):
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+         logger.info("Loading MLM reranker: %s", mlm_model_name)
+         self.tokenizer = AutoTokenizer.from_pretrained(mlm_model_name)
+         self.model = AutoModelForMaskedLM.from_pretrained(mlm_model_name)
+         self.model.to(self.device)
+         self.model.eval()
+
+         logger.info("Loading ByT5 transliterator: %s", byt5_model_path)
+         self.transliterator = Transliterator(model_path=byt5_model_path, device=self.device)
+
+     # ── Normalization ─────────────────────────────────────────────────────────
+
+     @staticmethod
+     def _softmax_normalize(raw_scores: List[float]) -> List[float]:
+         if not raw_scores:
+             return []
+         if len(raw_scores) == 1:
+             return [1.0]
+         max_s = max(raw_scores)
+         exps = [math.exp(s - max_s) for s in raw_scores]
+         total = sum(exps)
+         return [e / total for e in exps]
+
+     # ── MLM batch scoring ─────────────────────────────────────────────────────
+
+     def _batch_mlm_score(
+         self,
+         left_contexts: List[str],
+         right_contexts: List[str],
+         candidates: List[str],
+     ) -> List[float]:
+         """Score each candidate with XLM-RoBERTa multi-mask log-probability."""
+         if not candidates:
+             return []
+
+         mask = self.tokenizer.mask_token
+         mask_token_id = self.tokenizer.mask_token_id
+
+         cand_token_ids: List[List[int]] = []
+         for c in candidates:
+             ids = self.tokenizer.encode(c, add_special_tokens=False)
+             cand_token_ids.append(ids if ids else [self.tokenizer.unk_token_id])
+
+         batch_texts: List[str] = []
+         for i in range(len(candidates)):
+             n_masks = len(cand_token_ids[i])
+             mask_str = " ".join([mask] * n_masks)
+             parts = [p for p in [left_contexts[i], mask_str, right_contexts[i]] if p]
+             batch_texts.append(" ".join(parts))
+
+         inputs = self.tokenizer(
+             batch_texts,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+         ).to(self.device)
+
+         with torch.no_grad():
+             logits = self.model(**inputs).logits
+
+         scores: List[float] = []
+         for i, target_ids in enumerate(cand_token_ids):
+             token_ids = inputs.input_ids[i]
+             mask_positions = (token_ids == mask_token_id).nonzero(as_tuple=True)[0]
+
+             if mask_positions.numel() == 0 or not target_ids:
+                 scores.append(-100.0)
+                 continue
+
+             n = min(len(target_ids), mask_positions.numel())
+             total = 0.0
+             for j in range(n):
+                 pos = mask_positions[j].item()
+                 log_probs = torch.log_softmax(logits[i, pos, :], dim=0)
+                 total += log_probs[target_ids[j]].item()
+
+             scores.append(total / n)
+
+         return scores
+
+     # ── Public decode ─────────────────────────────────────────────────────────
+
+     def decode(self, sentence: str) -> Tuple[str, List[str]]:
+         """
+         Decode a Singlish sentence word-by-word using ByT5 + XLM-RoBERTa MLM.
+         Returns (transliterated_sentence, trace_logs).
+         """
+         words = sentence.split()
+         if not words:
+             return "", []
+
+         # ── Phase 1: batch ByT5 candidate generation ──────────────────────────
+         # Collect only the words that need ByT5 (non-Sinhala), run in one pass
+         cores: List[str] = []
+         core_meta: List[tuple] = []  # (index_into_words, prefix, core, suffix, core_lower)
+
+         for i, raw in enumerate(words):
+             match = PUNCT_PATTERN.match(raw)
+             prefix, core, suffix = match.groups() if match else ("", raw, "")
+             if not _is_sinhala(core):
+                 cores.append(core)
+                 core_meta.append((i, prefix, core, suffix, core.lower()))
+
+         # Single ByT5 forward pass for all non-Sinhala words
+         byt5_results: List[List[str]] = (
+             self.transliterator.batch_candidates(cores, k=MAX_CANDIDATES)
+             if cores else []
+         )
+
+         byt5_map: dict = {}  # word index → (prefix, suffix, core_lower, ByT5 candidates)
+         for (i, prefix, core, suffix, core_lower), cands in zip(core_meta, byt5_results):
+             byt5_map[i] = (prefix, suffix, core_lower, cands or [core])
+
+         word_infos: List[dict] = []
+         for i, raw in enumerate(words):
+             match = PUNCT_PATTERN.match(raw)
+             _, core, _ = match.groups() if match else ("", raw, "")
+
+             if _is_sinhala(core):
+                 word_infos.append({"kind": "sinhala", "candidates": [raw]})
+                 continue
+
+             prefix, suffix, core_lower, byt5_cands = byt5_map[i]
+             sinhala_cands = [prefix + c + suffix for c in byt5_cands]
+
+             if core_lower in ENGLISH_VOCAB and len(core_lower) >= MIN_ENGLISH_LEN:
+                 candidates = [raw] + [c for c in sinhala_cands if c != raw]
+                 word_infos.append({"kind": "english", "candidates": candidates[:MAX_CANDIDATES + 1]})
+             else:
+                 word_infos.append({"kind": "singlish", "candidates": sinhala_cands})
+
+         # ── Phase 2: greedy left-to-right pass (builds dynamic left context) ──
+         # Right context is seeded from first ByT5 candidate (pre-decode estimate)
+         stable_right = [info["candidates"][0] for info in word_infos]
+         selected_words: List[str] = []
+
+         for t, info in enumerate(word_infos):
+             candidates = info["candidates"]
+             left_ctx = " ".join(selected_words)
+             right_ctx = " ".join(stable_right[t + 1:])
+             raw_mlm = self._batch_mlm_score(
+                 [left_ctx] * len(candidates),
+                 [right_ctx] * len(candidates),
+                 candidates,
+             )
+             norm_mlm = self._softmax_normalize(raw_mlm)
+             best = max(zip(candidates, norm_mlm), key=lambda x: x[1])
+             selected_words.append(best[0])
+
+         # ── Phase 3: re-score with full decoded sentence as context ───────────
+         # Right context is now the actual decoded output, not the pre-decode estimate
+         trace_logs: List[str] = []
+         final_words: List[str] = []
+
+         for t, info in enumerate(word_infos):
+             raw_word = words[t]
+             kind = info["kind"]
+             candidates = info["candidates"]
+
+             left_ctx = " ".join(final_words)
+             right_ctx = " ".join(selected_words[t + 1:])
+
+             raw_mlm = self._batch_mlm_score(
+                 [left_ctx] * len(candidates),
+                 [right_ctx] * len(candidates),
+                 candidates,
+             )
+             norm_mlm = self._softmax_normalize(raw_mlm)
+
+             scored = sorted(
+                 [ScoredCandidate(text=c, mlm_score=norm_mlm[i]) for i, c in enumerate(candidates)],
+                 key=lambda x: x.mlm_score,
+                 reverse=True,
+             )
+             best = scored[0]
+             final_words.append(best.text)
+
+             if kind == "sinhala":
+                 trace_logs.append(
+                     f"**Step {t+1}: `{raw_word}`** → `{best.text}` "
+                     f"(Sinhala, MLM={best.mlm_score:.3f})\n"
+                 )
+             else:
+                 trace_logs.append(
+                     f"**Step {t+1}: `{raw_word}`** → `{best.text}` "
+                     f"(MLM={best.mlm_score:.3f})\n"
+                     + "\n".join(f"  - `{s.text}` {s.mlm_score:.3f}" for s in scored)
+                 )
+
+         return " ".join(final_words), trace_logs
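To make the reranking arithmetic concrete, here is the normalisation from `_softmax_normalize` applied to invented multi-mask log-probabilities for three candidates of a single word (a standalone toy, no model required):

```python
import math

def softmax_normalize(raw_scores):
    # Same logic as BeamSearchDecoder._softmax_normalize: shift by the
    # maximum for numerical stability, exponentiate, then normalise.
    if not raw_scores:
        return []
    if len(raw_scores) == 1:
        return [1.0]
    max_s = max(raw_scores)
    exps = [math.exp(s - max_s) for s in raw_scores]
    total = sum(exps)
    return [e / total for e in exps]

# Invented average log-probabilities for three candidates of one word.
print(softmax_normalize([-2.1, -3.4, -5.0]))
# ≈ [0.753, 0.205, 0.041] → the first candidate wins the argmax
```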
core/english.py ADDED
@@ -0,0 +1,73 @@
+ """
+ English vocabulary loader for SinCode v3.
+ Used for English passthrough detection in the decoder.
+ Loads purely from the 20k corpus file — no hardcoded word lists.
+ """
+
+ import os
+ import logging
+ import requests
+ from typing import Set
+
+ from core.constants import ENGLISH_CORPUS_URL, MIN_ENGLISH_LEN
+
+ logger = logging.getLogger(__name__)
+
+
+ def _resolve_english_cache_path() -> str:
+     override = os.getenv("SINCODE_ENGLISH_CACHE")
+     if override:
+         return override
+
+     candidates = [
+         os.path.join(os.getenv("HF_HOME", ""), "english_20k.txt") if os.getenv("HF_HOME") else "",
+         os.path.join(os.getcwd(), "english_20k.txt"),
+         os.path.join(os.getenv("TMPDIR", os.getenv("TEMP", "/tmp")), "english_20k.txt"),
+     ]
+
+     for path in candidates:
+         if not path:
+             continue
+         parent = os.path.dirname(path) or "."
+         try:
+             os.makedirs(parent, exist_ok=True)
+             with open(path, "a", encoding="utf-8"):
+                 pass
+             return path
+         except OSError:
+             continue
+
+     return "english_20k.txt"
+
+
+ ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()
+
+
+ def load_english_vocab() -> Set[str]:
+     vocab: Set[str] = set()
+
+     if not os.path.exists(ENGLISH_CORPUS_CACHE) or os.path.getsize(ENGLISH_CORPUS_CACHE) == 0:
+         try:
+             logger.info("Downloading English corpus...")
+             response = requests.get(ENGLISH_CORPUS_URL, timeout=10)
+             response.raise_for_status()
+             with open(ENGLISH_CORPUS_CACHE, "wb") as f:
+                 f.write(response.content)
+         except (requests.RequestException, OSError) as exc:
+             logger.warning("Could not download English corpus: %s", exc)
+             return vocab
+
+     try:
+         with open(ENGLISH_CORPUS_CACHE, "r", encoding="utf-8") as f:
+             vocab.update(
+                 w for line in f
+                 if (w := line.strip().lower()) and len(w) >= MIN_ENGLISH_LEN
+             )
+     except OSError as exc:
+         logger.warning("Could not read English corpus file: %s", exc)
+
+     logger.info("English vocabulary loaded: %d words", len(vocab))
+     return vocab
+
+
+ ENGLISH_VOCAB: Set[str] = load_english_vocab()
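Membership checks against the loaded set behave as expected (a sketch; it assumes the 20k download succeeded, and whether any given word is present depends on that corpus):

```python
from core.english import ENGLISH_VOCAB

# Words shorter than MIN_ENGLISH_LEN (3) are filtered at load time, so
# common two-letter English tokens like "ok" never trigger passthrough.
print("buffer" in ENGLISH_VOCAB)   # True if "buffer" is in the 20k corpus
print("ok" in ENGLISH_VOCAB)       # False — filtered by length
```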
core/mappings.py ADDED
@@ -0,0 +1,8 @@
+ """
+ core/mappings.py — deprecated.
+
+ All manual Singlish→Sinhala mappings have been removed.
+ Correction pairs are in seq2seq/finetune_corrections.py and baked into
+ the ByT5 model weights via targeted correction fine-tuning.
+ Candidate generation is handled end-to-end by the ByT5 seq2seq model.
+ """
english_20k.txt ADDED
The diff for this file is too large to render. See raw diff
 
misc/dataset_110.csv ADDED
@@ -0,0 +1,111 @@
+ id,input,reference,split,has_code_mix,has_ambiguity,domain,notes
+ 1,api kalin katha kala,අපි කලින් කතා කළා,test,0,0,general,pure singlish
+ 2,eka honda wage thiyanawa,ඒක හොඳ වගේ තියෙනවා,test,0,1,general,wage=seems
+ 3,meheta thadata wessa,මෙහෙට තදට වැස්සා,test,0,1,general,thadata=very
+ 4,oya kiwwata mama giye,ඔයා කිව්වට මම ගියේ,test,0,0,general,contextual past
+ 5,mama danne na eka gena,මම දන්නෙ නෑ ඒක ගැන,test,0,1,general,eka pronoun
+ 6,oya awa wage na,ඔයා ආවා වගේ නෑ,test,0,1,general,wage=seems
+ 7,ekat ynna bri,ඒකට යන්න බැරි,test,0,0,general,ad-hoc bri=bari
+ 8,mama inne gedaradi,මම ඉන්නෙ ගෙදරදී,test,0,0,general,pure singlish
+ 9,eka heta balamu,ඒක හෙට බලමු,test,0,0,general,eka pronoun
+ 10,klya madi api passe yamu,කාලය මදි අපි පස්සෙ යමු,test,0,0,general,ad-hoc klya=kalaya
+ 11,assignment eka ada submit karanna one,assignment එක අද submit කරන්න ඕනෙ,test,1,0,education,eka after English noun
+ 12,exam hall eka nisa mama baya una,exam hall එක නිසා මම බය උනා,test,1,1,education,nisa=because
+ 13,results blnna one,results බලන්න ඕනෙ,test,1,0,education,ad-hoc blnna=balanna
+ 14,study group ekak hadamu,study group එකක් හදමු,test,1,0,education,ekak after English noun
+ 15,viva ekta prepared wage na,viva එකට prepared වගේ නෑ,test,1,1,education,wage=seems
+ 16,mta project ek submit krnna one,මට project එක submit කරන්න ඕනෙ,test,1,0,education,ad-hoc mta krnna
+ 17,hta parikshanaya thiyanawa,හෙට පරික්‍ෂණය තියෙනවා,test,0,0,education,ad-hoc hta=heta
+ 18,mama potha kiyawala iwara kala,මම පොත කියවලා ඉවර කළා,test,0,0,education,pure singlish
+ 19,prkku nisa api kalin giya,පරක්කු නිසා අපි කලින් ගියා,test,0,1,education,nisa=because
+ 20,prashnaya hondai wage penenawa,ප්‍රශ්නය හොඳයි වගේ පේනවා,test,0,1,education,wage=seems
+ 21,deployments nisa site down wuna,deployments නිසා site down උනා,test,1,1,work,nisa=because
+ 22,PR eka merge karanna one,PR එක merge කරන්න ඕනෙ,test,1,0,work,eka after English noun
+ 23,backlog eka update kala,backlog එක update කළා,test,1,0,work,eka after English noun
+ 24,server down nisa work karanna ba,server down නිසා work කරන්න බෑ,test,1,1,work,nisa=because
+ 25,meeting eka tomorrow damu,meeting එක tomorrow දාමු,test,1,0,work,code-mix preserved
+ 26,feedback nisa redo karanna una,feedback නිසා redo කරන්න උනා,test,1,1,work,nisa=because
+ 27,ape wada ada iwara wenawa,අපේ වැඩ අද ඉවර වෙනවා,test,0,0,work,pure singlish
+ 28,kalamanakaru hitpu nisa api katha kala,කලමනාකරු හිටපු නිසා අපි කතා කළා,test,0,1,work,nisa=because; known failure (complex OOV)
+ 29,me wada hondai wage penawa,මේ වැඩ හොඳයි වගේ පේනවා,test,0,1,work,wage=seems
+ 30,wada tika ada iwara karamu,වැඩ ටික අද ඉවර කරමු,test,0,0,work,pure singlish
+ 31,story eke poll ekak damma,story එකේ poll එකක් දැම්මා,test,1,0,social,eke and ekak forms
+ 32,oyata DM ekak yawwa,ඔයාට DM එකක් යැව්වා,test,1,0,social,ekak after English noun
+ 33,comment eka delete kala nisa mama danne na,comment එක delete කළා නිසා මම දන්නෙ නෑ,test,1,1,social,"nisa=because; known failure (කළා/කල, දන්නෙ/දන්නේ)"
+ 34,selfie ekak gannako,selfie එකක් ගන්නකෝ,test,1,0,social,ekak after English noun
+ 35,post eka private nisa share karanna epa,post එක private නිසා share කරන්න එපා,test,1,1,social,nisa=because
+ 36,oyta message krnna one,ඔයාට message කරන්න ඕනෙ,test,1,0,social,ad-hoc oyta krnna on=one
+ 37,api passe katha karamu,අපි පස්සෙ කතා කරමු,test,0,0,social,pure singlish
+ 38,eya laga pinthurayk thiyanawa,ඒයා ළඟ පින්තූරයක් තියෙනවා,test,0,0,social,ad-hoc pinthurayk
+ 39,oya awa wage mata hithenawa,ඔයා ආවා වගේ මට හිතෙනවා,test,0,1,social,wage=seems
+ 40,api passe hambawemu,අපි පස්සෙ හම්බවෙමු,test,0,0,social,pure singlish
+ 41,phone eka charge karanna one,phone එක charge කරන්න ඕනෙ,test,1,0,general,NEW: general code-mix (gap fix)
+ 42,bus eka late una,bus එක late උනා,test,1,0,general,NEW: general code-mix
+ 43,mama online inne,මම online ඉන්නෙ,test,1,0,general,NEW: English mid-sentence
+ 44,time nathi nisa heta yamu,time නැති නිසා හෙට යමු,test,1,1,general,NEW: English+nisa in general
+ 45,oya call eka ganna,ඔයා call එක ගන්න,test,1,0,general,NEW: general code-mix eka pattern
+ 46,api game yanawa heta,අපි ගමේ යනවා හෙට,test,0,1,general,NEW: game=ගමේ(village) ambig with English 'game'
+ 47,man heta enne na,මන් හෙට එන්නෙ නෑ,test,0,1,general,NEW: man=මං(I) ambig with English 'man'
+ 48,eka hari lassanai,ඒක හරි ලස්සනයි,test,0,1,general,NEW: hari=very (not OK/correct)
+ 49,oya kiwwa hari,ඔයා කිව්වා හරි,test,0,1,general,NEW: hari=correct (not very)
+ 50,kalaya ithuru krganna one,කලය ඉතුරු කරගන්න ඕනෙ,test,0,1,general,NEW: one=ඕනෙ(need) ambig with English 'one'
+ 51,date eka fix karanna one,date එක fix කරන්න ඕනෙ,test,1,1,general,NEW: date=English preserve; one=ඕනෙ
+ 52,rata yanna one,රට යන්න ඕනෙ,test,0,0,general,"NEW: rata=country, pure singlish"
+ 53,game eke leaderboard eka balanna,game එකේ leaderboard එක බලන්න,test,1,1,social,NEW: game=English(video game) not ගමේ
+ 54,api thamai hodama,අපි තමයි හොඳම,test,0,1,general,NEW: thamai=emphatic we; hodama=best; looks English but Singlish
+ 55,mama heta udee enawa oya enakota message ekk dnna,මම හෙට උදේ එනවා ඔයා එනකොට message එකක් දාන්න,test,0,0,general,NEW: 8-word pure singlish
+ 56,ape gedara langa thiyana kadeta yanna one,අපේ ගෙදර ළඟ තියෙන කඩේට යන්න ඕනෙ,test,0,0,general,NEW: 7-word with ළඟ
+ 57,mama assignment eka karala submit karanawa ada raa,මම assignment එක කරලා submit කරනවා අද රෑ,test,1,0,education,NEW: 8-word code-mix long
+ 58,oya enne naththe mokada kiyla mama danne na,ඔයා එන්නෙ නැත්තෙ මොකද කියලා මම දන්නෙ නෑ,test,0,0,general,NEW: 9-word complex clause
+ 59,client ekka call karala feedback eka ahanna one,client එක්ක call කරලා feedback එක අහන්න ඕනෙ,test,1,0,work,NEW: 8-word heavy code-mix
+ 60,mama gedara gihilla kewata passe call karannm,මම ගෙදර ගිහිල්ලා කෑවට පස්සෙ call කරන්නම්,test,1,0,general,NEW: 8-word code-mix + temporal
+ 61,laptop eke software update karanna one,laptop එකේ software update කරන්න ඕනෙ,test,1,0,work,NEW: 3 English words consecutive
+ 62,office eke wifi password eka mokakda,office එකේ wifi password එක මොකක්ද,test,1,0,work,NEW: 3 English words; question
+ 63,online order eka track karanna ba,online order එක track කරන්න බෑ,test,1,0,general,NEW: 3 English words
+ 64,email eke attachment eka download karanna,email එකේ attachment එක download කරන්න,test,1,0,work,NEW: 3 English words + double eka
+ 65,Instagram story eke filter eka hadanna,Instagram story එකේ filter එක හදන්න,test,1,0,social,NEW: 4 English words; social media
+ 66,oyge wada iwra krd,ඔයාගෙ වැඩ ඉවර කරාද,test,0,0,general,NEW: extreme vowel omission
+ 67,mge phone ek hack una,මගේ phone එක hack උනා,test,1,0,general,"NEW: heavy ad-hoc mmge=mage, hrk=hack"
+ 68,handawata ynna wenwa,හැන්දෑවට යන්න වෙනවා,test,0,0,general,"NEW: ad-hoc hndta=handeta, wenwa=wenawa"
+ 69,prashnya krnna oni,ප්‍රශ්‍නය කරන්න ඕනි,test,0,0,education,NEW: replaced extreme ad-hoc with more readable form
+ 70,apita gdra ynna oni,අපිට ගෙදර යන්න ඕනි,test,0,0,general,NEW: ad-hoc gdra=gedara
+ 71,mama oyata kiwwa,මම ඔයාට කිව්වා,test,0,0,general,"NEW: common words only (mama, oyata)"
+ 72,oya hari hondai,ඔයා හරි හොඳයි,test,0,1,general,NEW: hari=very; common words
+ 73,api heta yamu,අපි හෙට යමු,test,0,0,general,NEW: common words bypass test
+ 74,app eka crash wenawa phone eke,app එක crash වෙනවා phone එකේ,test,1,0,technology,NEW: tech domain
+ 75,code eka push karanna github ekata,code එක push කරන්න github එකට,test,1,0,technology,NEW: dev workflow code-mix
+ 76,database eka slow nisa query eka optimize karanna one,database එක slow නිසා query එක optimize කරන්න ඕනෙ,test,1,1,technology,NEW: heavy tech code-mix + nisa; long
+ 77,bug eka fix kala merge karanna,bug එක fix කළා merge කරන්න,test,1,0,technology,NEW: sequential actions code-mix
+ 78,internet eka slow wage thiyanawa,internet එක slow වගේ තියෙනවා,test,1,1,technology,NEW: tech + wage ambiguity
+ 79,kema hodai ada,කෑම හොඳයි අද,test,0,0,daily_life,NEW: daily life; short
+ 80,mama bus eke enawa,මම bus එකේ එනවා,test,1,0,daily_life,NEW: transport code-mix
+ 81,ganu depala ekka market giya,ගෑනු දෙපල එක්ක market ගියා,test,1,0,daily_life,NEW: colloquial + code-mix
+ 82,watura bonna one,වතුර බොන්න ඕනෙ,test,0,0,daily_life,NEW: health advice singlish
+ 83,shop eke sugar nati nisa mama giye na,shop එකේ sugar නැති නිසා මම ගියේ නෑ,test,1,1,daily_life,NEW: daily code-mix + nisa; negative
+ 84,hri hari,හරි හරි,test,0,0,general,NEW: 2-word repetition; common expression + ad-hoc hri=hari
+ 85,mta ep,මට එපා,test,0,0,general,NEW: ad-hoc mta=mata ep=epa
+ 86,ok hari,ok හරි,test,1,0,general,NEW: 2-word code-mix
+ 87,ape game hari dewal wenne,අපේ ගමේ හරි දේවල් වෙන්නේ,test,0,1,general,"NEW: game=village, hari=nice; looks English"
+ 88,mta dan one na,මට දැන් ඕනෙ නෑ,test,0,1,general,NEW: man+one look English but Singlish
+ 89,eka hari hondai wage dnuna nisa mama giya,ඒක හරි හොඳයි වගේ දැනුනා නිසා මම ගියා,test,0,1,general,NEW: hari+wage+nisa triple ambiguity; ref corrected to හොඳයි
+ 90,game eke mission hari amarui,game එකේ mission හරි අමාරුයි,test,0,1,general,NEW: game=video game hari=very amarui=difficult; looks English but Singlish
+ 91,mama heta yanawa,මම හෙට යනවා,test,0,0,general,NEW: future tense
+ 92,ey iye aawa,එයා ඊයේ ආවා,test,0,0,general,NEW: past tense
+ 93,api dan yanawa,අපි දැන් යනවා,test,0,0,general,NEW: present tense
+ 94,video eka balanna one,video එක බලන්න ඕනෙ,test,1,0,social,NEW: eka definite article
+ 95,video ekak hadamu,video එකක් හදමු,test,1,0,social,NEW: ekak indefinite
+ 96,video eke comment eka balanna,video එකේ comment එක බලන්න,test,1,0,social,NEW: eke possessive + double eka
+ 97,video ekata like ekak danna,video එකට like එකක් දාන්න,test,1,0,social,NEW: ekata dative case
+ 98,lecture eka record karala share karanna,lecture එක record කරලා share කරන්න,test,1,0,education,NEW: sequential code-mix actions
+ 99,research paper eka liyanna one heta wge,research paper එක ලියන්න ඕනෙ හෙට වගේ,test,1,0,education,NEW: long + temporal; 8 words
+ 100,exam eka hari amarui,exam එක හරි අමාරුයි,test,1,1,education,NEW: hari=very; difficulty context
+ 101,sprint eka plan karamu Monday,sprint එක plan කරමු Monday,test,1,0,work,NEW: day name preserved
+ 102,ape team eka deadline ekata kala,අපේ team එක deadline එකට කළා,test,1,0,work,NEW: possessive + double English
+ 103,standup eke mokada kiwwe,standup එකේ මොකද කිව්වෙ,test,1,0,work,NEW: question form code-mix
+ 104,reel eka viral una,reel එක viral උනා,test,1,0,social,NEW: social media terminology
+ 105,group chat eke mokada wenne,group chat එකේ මොකද වෙන්නෙ,test,1,0,social,NEW: compound English + question
+ 106,oyge profile picture eka lassanai,ඔයාගෙ profile picture එක ලස්සනයි,test,1,0,social,NEW: compound English noun + eka; ref corrected to ඔයාගෙ
+ 107,mama enne na heta,මම එන්නෙ නෑ හෙට,test,0,0,general,NEW: negation at end
+ 108,eka karanna epa,ඒක කරන්න එපා,test,0,0,general,NEW: prohibition form
+ 109,kawruwath enne na,කවුරුවත් එන්නෙ නෑ,test,0,0,general,NEW: nobody negation
+ 110,oya koheda ynne,ඔයා කොහේද යන්නේ,test,0,0,general,NEW: question form where
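For orientation, the composition of the 110 cases can be inspected with pandas, which is already in requirements.txt (a quick sketch run from the repo root):

```python
import pandas as pd

df = pd.read_csv("misc/dataset_110.csv")
print(len(df))                              # 110 test cases
print(df["has_code_mix"].value_counts())    # code-mixed vs pure Singlish
print(df.groupby("domain").size())          # per-domain counts
```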
misc/dataset_40.csv ADDED
@@ -0,0 +1,41 @@
+ id,input,reference,split,has_code_mix,has_ambiguity,domain,notes
+ 1,api kalin katha kala,අපි කලින් කතා කළා,train,0,0,general,pure singlish
+ 2,eka honda wage thiyanawa,ඒක හොඳ වගේ තියෙනවා,train,0,1,general,wage=seems
+ 3,pola nisa gedara thiyanawa,පොල නිසා ගෙදර තියෙනවා,train,0,1,general,nisa=because
+ 4,oya kiwwata mama giye,ඔයා කිව්වට මම ගියේ,train,0,0,general,contextual past
+ 5,mama danne na eka gena,මම දන්නෙ නෑ ඒක ගැන,train,0,1,general,eka pronoun
+ 6,oya awa wage na,ඔයා ආවා වගේ නෑ,train,0,1,general,wage=seems
+ 7,ekat ynna bri,ඒකට යන්න බැරි,train,0,0,general,ad hoc bri=bari
+ 8,mama inne gedaradi,මම ඉන්නෙ ගෙදරදී,train,0,0,general,pure singlish
+ 9,eka heta balamu,ඒක හෙට බලමු,train,0,0,general,eka pronoun
+ 10,klya madi api passe yamu,කාලය මදි අපි පස්සෙ යමු,train,0,0,general,ad hoc klya=kalaya
+ 11,assignment eka ada submit karanna one,assignment එක අද submit කරන්න ඕනෙ,train,1,0,education,eka after English noun
+ 12,exam hall eka nisa mama baya una,exam hall එක නිසා මම බය උනා,train,1,1,education,nisa=because
+ 13,results blnna one,results බලන්න ඕනෙ,train,1,0,education,ad hoc blnna=balanna
+ 14,study group ekak hadamu,study group එකක් හදමු,train,1,0,education,ekak after English noun
+ 15,viva ekta prepared wage na,viva එකට prepared වගේ නෑ,train,1,1,education,wage=seems
+ 16,mta project ek submit krnna one,මට project එක submit කරන්න ඕනෙ,train,1,0,education,ad hoc mta krnna
+ 17,hta parikshanaya thiyanawa,හෙට පරික්‍ෂණය තියෙනවා,train,0,0,education,ad hoc hta=heta
+ 18,mama poth kiyawala iwara kala,මම පොත කියවලා ඉවර කළා,train,0,0,education,pure singlish
+ 19,guruwaraya nisa api kalin giya,ගුරුවරයා නිසා අපි කලින් ගියා,train,0,1,education,nisa=because
+ 20,prashnaya honda wage penenawa,ප්‍රශ්නය හොඳ වගේ පේනවා,train,0,1,education,wage=seems
+ 21,deploy nisa site down wuna,deploy නිසා site down උනා,train,1,1,work,nisa=because
+ 22,PR eka merge karanna one,PR එක merge කරන්න ඕනෙ,train,1,0,work,eka after English noun
+ 23,backlog eka update kala,backlog එක update කළා,train,1,0,work,eka after English noun
+ 24,server down nisa work karanna ba,server down නිසා work කරන්න බෑ,train,1,1,work,nisa=because
+ 25,meeting eka tomorrow damu,meeting එක tomorrow දාමු,train,1,0,work,code mix preserved
+ 26,feedback nisa redo karanna una,feedback නිසා redo කරන්න උනා,train,1,1,work,nisa=because
+ 27,ape wada ada iwara wenawa,අපේ වැඩ අද ඉවර වෙනවා,train,0,0,work,pure singlish
+ 28,kalamanakaru apu nisa api katha kala,කලමණාකරු ආපු නිසා අපි කතා කලා,train,0,1,work,nisa=because
+ 29,me wada honda wage penenawa,මේ වැඩ හොඳ වගේ පේනවා,train,0,1,work,wage=seems
+ 30,wada tika ada iwara karamu,වැඩ ටික අද ඉවර කරමු,train,0,0,work,pure singlish
+ 31,story eke poll ekak damma,story එකේ poll එකක් දැම්මා,train,1,0,social,eke and ekak forms
+ 32,oyata DM ekak yewwa,ඔයාට DM එකක් යැව්වා,train,1,0,social,ekak after English noun
+ 33,comment eka delete kala nisa mama danne na,comment එක delete කල නිසා මම දන්නේ නෑ,train,1,1,social,nisa=because
+ 34,selfie ekak gannako,selfie එකක් ගන්නකෝ,train,1,0,social,ekak after English noun
+ 35,post eka private nisa share karanna epa,post එක private නිසා share කරන්න එපා,train,1,1,social,nisa=because
+ 36,oyta message krnna on,ඔයාට message කරන්න ඕනෙ,train,1,0,social,ad hoc oyta krnna
+ 37,oya passe katha karamu,ඔයා පස්සෙ කතා කරමු,train,0,0,social,pure singlish
+ 38,eya laga pinthurayk thiyanawa,ඒයා ළඟ පින්තූරයක් තියෙනවා,train,0,0,social,ad hoc pinthurayk
+ 39,oya awa wage mata hithenawa,ඔයා ආවා වගේ මට හිතෙනවා,train,0,1,social,wage=seems
+ 40,api passe hambawemu,අපි පස්සෙ හම්බවෙමු,train,0,0,social,pure singlish
misc/evaluate.py ADDED
@@ -0,0 +1,446 @@
+ """
+ SinCode v3 — Evaluation Script
+
+ Supports two evaluation modes selected via --mode:
+
+   system     Full v3 pipeline (ByT5 + two-pass MLM). Default.
+   ablation   Side-by-side comparison of two configurations:
+                (A) ByT5 top-1 only — no MLM reranking
+                (B) ByT5 + MLM — full Code-Mixed pipeline
+              Proves the contribution of the XLM-RoBERTa reranker.
+
+ Note: mBart50 is intentionally excluded from evaluation here because the
+ reference dataset uses code-mixed targets (English words preserved). mBart50
+ produces full-Sinhala output by design, making a metric comparison against
+ code-mixed references invalid. Evaluate mBart50 separately with a dataset
+ whose references are fully in Sinhala script.
+
+ Usage:
+   python misc/evaluate.py --dataset misc/dataset_110.csv
+   python misc/evaluate.py --dataset misc/dataset_110.csv --mode ablation
+   python misc/evaluate.py --dataset misc/dataset_110.csv --mode ablation --out misc/results.csv
+
+ CSV columns required: id, input, reference
+ Optional columns (used for grouping): category, domain, has_code_mix, has_ambiguity
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import csv
+ import json
+ import logging
+ import math
+ import os
+ import re
+ import sys
+ from collections import defaultdict
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional
+
+ # ── Path setup ────────────────────────────────────────────────────────────────
+
+ ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ if ROOT not in sys.path:
+     sys.path.insert(0, ROOT)
+
+ logging.basicConfig(level=logging.WARNING)
+
+ # ── Metrics ───────────────────────────────────────────────────────────────────
+
+ def _levenshtein(a: str, b: str) -> int:
+     if not a: return len(b)
+     if not b: return len(a)
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, 1):
+         curr = [i] + [0] * len(b)
+         for j, cb in enumerate(b, 1):
+             cost = 0 if ca == cb else 1
+             curr[j] = min(prev[j] + 1, curr[j-1] + 1, prev[j-1] + cost)
+         prev = curr
+     return prev[-1]
+
+
+ def _levenshtein_tokens(a: list, b: list) -> int:
+     if not a: return len(b)
+     if not b: return len(a)
+     prev = list(range(len(b) + 1))
+     for i, ta in enumerate(a, 1):
+         curr = [i] + [0] * len(b)
+         for j, tb in enumerate(b, 1):
+             cost = 0 if ta == tb else 1
+             curr[j] = min(prev[j] + 1, curr[j-1] + 1, prev[j-1] + cost)
+         prev = curr
+     return prev[-1]
+
+
+ def cer(pred: str, ref: str) -> float:
+     if not ref: return 0.0 if not pred else 1.0
+     return _levenshtein(pred, ref) / max(len(ref), 1)
+
+
+ def wer(pred: str, ref: str) -> float:
+     pt, rt = pred.split(), ref.split()
+     if not rt: return 0.0 if not pt else 1.0
+     return _levenshtein_tokens(pt, rt) / max(len(rt), 1)
+
+
+ def token_accuracy(pred: str, ref: str) -> float:
+     pt, rt = pred.split(), ref.split()
+     if not rt: return 0.0 if pt else 1.0
+     return sum(p == r for p, r in zip(pt, rt)) / max(len(rt), 1)
+
+
+ def bleu(pred: str, ref: str, max_n: int = 4) -> float:
+     from collections import Counter
+     pt, rt = pred.split(), ref.split()
+     if not pt or not rt: return 0.0
+     n_max = min(max_n, len(pt), len(rt))
+     if n_max == 0: return 0.0
+     brevity = min(1.0, len(pt) / len(rt))
+     log_avg = 0.0
+     for n in range(1, n_max + 1):
+         pc = Counter(tuple(pt[i:i+n]) for i in range(len(pt)-n+1))
+         rc = Counter(tuple(rt[i:i+n]) for i in range(len(rt)-n+1))
+         clipped = sum(min(c, rc[ng]) for ng, c in pc.items())
+         total = max(sum(pc.values()), 1)
+         prec = clipped / total
+         if prec == 0: return 0.0
+         log_avg += math.log(prec) / n_max
+     return brevity * math.exp(log_avg)
+
+
+ def exact_match(pred: str, ref: str) -> float:
+     return 1.0 if pred.strip() == ref.strip() else 0.0
+
+
+ # ── Data model ────────────────────────────────────────────────────────────────
+
+ @dataclass
+ class TestCase:
+     id: int
+     input: str
+     reference: str
+     domain: str = "general"
+     has_code_mix: bool = False
+     has_ambiguity: bool = False
+
+
+ @dataclass
+ class Result:
+     test_case: TestCase
+     system: str
+     prediction: str
+     cer_score: float
+     wer_score: float
+     token_acc: float
+     bleu_score: float
+     exact: float
+
+
+ def _score(tc: TestCase, pred: str, system: str) -> Result:
+     return Result(
+         test_case=tc,
+         system=system,
+         prediction=pred,
+         cer_score=cer(pred, tc.reference),
+         wer_score=wer(pred, tc.reference),
+         token_acc=token_accuracy(pred, tc.reference),
+         bleu_score=bleu(pred, tc.reference),
+         exact=exact_match(pred, tc.reference),
+     )
+
+
+ # ── Test set loader ───────────────────────────────────────────────────────────
+
+ def load_dataset(csv_path: str) -> List[TestCase]:
+     cases = []
+     with open(csv_path, "r", encoding="utf-8", newline="") as f:
+         reader = csv.DictReader(f)
+         fields = set(reader.fieldnames or [])
+         if not {"input", "reference"}.issubset(fields):
+             raise ValueError(f"CSV must have 'input' and 'reference' columns. Found: {fields}")
+         for row in reader:
+             inp = (row.get("input") or "").strip().replace("\n", " ")
+             ref = (row.get("reference") or "").strip().replace("\n", " ")
+             if not inp or not ref:
+                 continue
+             cases.append(TestCase(
+                 id=int(row.get("id") or 0),
+                 input=inp,
+                 reference=ref,
+                 domain=(row.get("domain") or row.get("category") or "general").strip(),
+                 has_code_mix=bool(int(row.get("has_code_mix") or 0)),
+                 has_ambiguity=bool(int(row.get("has_ambiguity") or 0)),
+             ))
+     return cases
+
+
+ # ── Model loaders ─────────────────────────────────────────────────────────────
+
+ def _load_v3_decoder():
+     from sincode_model import BeamSearchDecoder
+     print("  Loading ByT5 + XLM-RoBERTa (Code-Mixed pipeline)...")
+     return BeamSearchDecoder()
+
+
+ def _byt5_top1_predict(decoder, sentence: str) -> str:
+     """ByT5 top-1 only — pick first beam candidate, skip MLM reranking."""
+     from core.constants import PUNCT_PATTERN
+     from core.decoder import _is_sinhala
+
+     words = sentence.split()
+     output = []
+     cores = [re.sub(r"^\W*|\W*$", "", w) for w in words]
+     non_sinhala = [c for c in cores if not _is_sinhala(c) and c]
+
+     if not non_sinhala:
+         return sentence
+
+     byt5_results = decoder.transliterator.batch_candidates(non_sinhala, k=1)
+     byt5_iter = iter(byt5_results)
+
+     for raw, core in zip(words, cores):
+         m = PUNCT_PATTERN.match(raw)
+         prefix, _, suffix = m.groups() if m else ("", raw, "")
+         if _is_sinhala(core) or not core:
+             output.append(raw)
+         else:
+             cands = next(byt5_iter, [core])
+             output.append(prefix + (cands[0] if cands else core) + suffix)
+     return " ".join(output)
+
+
+ # ── Reporting ─────────────────────────────────────────────────────────────────
+
+ def _avg(vals: List[float]) -> float:
+     return sum(vals) / len(vals) if vals else 0.0
+
+
+ def _print_table(label: str, results: List[Result]):
+     print(f"\n{'='*74}")
+     print(f" {label} (n={len(results)})")
+     print(f"{'='*74}")
+     print(f" {'ID':<5} {'Domain':<14} {'CM':>3} {'Am':>3} {'CER':>6} {'WER':>6} {'TokAcc':>7} {'BLEU':>6} {'EM':>4}")
+     print(f" {'-'*66}")
+     for r in results:
+         tc = r.test_case
+         print(
+             f" {tc.id:<5} {tc.domain[:13]:<14} {'Y' if tc.has_code_mix else 'N':>3} "
+             f"{'Y' if tc.has_ambiguity else 'N':>3} "
+             f"{r.cer_score:>6.3f} {r.wer_score:>6.3f} {r.token_acc:>7.3f} "
+             f"{r.bleu_score:>6.3f} {r.exact:>4.0f}"
+         )
+     print(f" {'-'*66}")
+     print(
+         f" {'AVERAGE':<26} "
+         f"{_avg([r.cer_score for r in results]):>6.3f} "
+         f"{_avg([r.wer_score for r in results]):>6.3f} "
+         f"{_avg([r.token_acc for r in results]):>7.3f} "
+         f"{_avg([r.bleu_score for r in results]):>6.3f} "
+         f"{_avg([r.exact for r in results]):>4.2f}"
+     )
+
+     # Per-domain breakdown
+     by_domain: Dict[str, List[Result]] = defaultdict(list)
+     for r in results:
+         by_domain[r.test_case.domain].append(r)
+     if len(by_domain) > 1:
+         print("\n Per-domain averages (CER / WER / TokAcc):")
+         for dom, rs in sorted(by_domain.items()):
+             print(
+                 f"   {dom:<18} n={len(rs):<4} "
+                 f"CER={_avg([r.cer_score for r in rs]):.3f} "
+                 f"WER={_avg([r.wer_score for r in rs]):.3f} "
+                 f"TokAcc={_avg([r.token_acc for r in rs]):.3f}"
+             )
+
+     # Code-mixed vs pure Singlish
+     cm_r = [r for r in results if r.test_case.has_code_mix]
+     pure_r = [r for r in results if not r.test_case.has_code_mix]
+     if cm_r and pure_r:
+         print(
+             f"\n Code-mixed    (n={len(cm_r):<3}): "
+             f"CER={_avg([r.cer_score for r in cm_r]):.3f} "
+             f"WER={_avg([r.wer_score for r in cm_r]):.3f}"
+         )
+         print(
+             f" Pure Singlish (n={len(pure_r):<3}): "
+             f"CER={_avg([r.cer_score for r in pure_r]):.3f} "
+             f"WER={_avg([r.wer_score for r in pure_r]):.3f}"
+         )
+
+
+ def _print_ablation(a_res: List[Result], b_res: List[Result]):
+     print(f"\n{'='*74}")
+     print(" ABLATION STUDY — MLM Reranking Contribution")
+     print(" (A) ByT5 top-1 only | (B) ByT5 + XLM-RoBERTa MLM reranking")
+     print(f"{'='*74}")
+     print(f" {'Metric':<22} {'(A) ByT5-top1':>14} {'(B) ByT5+MLM':>13} {'Δ (B−A)':>10}")
+     print(f" {'-'*64}")
+
+     metrics = [
+         ("CER (↓ better)", [r.cer_score for r in a_res], [r.cer_score for r in b_res], True),
+         ("WER (↓ better)", [r.wer_score for r in a_res], [r.wer_score for r in b_res], True),
+         ("Token Acc (↑)", [r.token_acc for r in a_res], [r.token_acc for r in b_res], False),
+         ("BLEU (↑ better)", [r.bleu_score for r in a_res], [r.bleu_score for r in b_res], False),
+         ("Exact Match (↑)", [r.exact for r in a_res], [r.exact for r in b_res], False),
+     ]
+
+     for label, a_vals, b_vals, lower_is_better in metrics:
+         a_avg, b_avg = _avg(a_vals), _avg(b_vals)
+         delta = b_avg - a_avg
+         improved = (delta < 0) if lower_is_better else (delta > 0)
+         print(
+             f" {label:<22} {a_avg:>14.4f} {b_avg:>13.4f} "
+             f" {'✓' if improved else '✗'}{delta:>+8.4f}"
+         )
+
+     print("\n ✓ B vs A isolates the contribution of XLM-RoBERTa MLM reranking.")
+     print(" ✓ If B > A: the two-pass reranker justifies its computational cost.")
+
+     # Subcategory breakdown
+     for sublabel, filter_fn in [
+         ("Code-mixed only", lambda r: r.test_case.has_code_mix),
+         ("Ambiguous only", lambda r: r.test_case.has_ambiguity),
+         ("Pure Singlish", lambda r: not r.test_case.has_code_mix),
+     ]:
+         a_sub = [r for r in a_res if filter_fn(r)]
+         b_sub = [r for r in b_res if filter_fn(r)]
+         if not a_sub:
+             continue
+         print(f"\n {sublabel} (n={len(a_sub)}):")
+         print(f" {'':20} {'(A)':>10} {'(B)':>10} {'Δ':>10}")
+         for ml, getter, low in [("CER", lambda r: r.cer_score, True), ("WER", lambda r: r.wer_score, True), ("TokAcc", lambda r: r.token_acc, False)]:
+             av, bv = _avg([getter(r) for r in a_sub]), _avg([getter(r) for r in b_sub])
+             d = bv - av
+             imp = (d < 0) if low else (d > 0)
+             print(
+                 f" {ml:<20} {av:>10.4f} {bv:>10.4f} "
+                 f" {'✓' if imp else '✗'}{d:>+7.4f}"
+             )
+
+
+ def _load_baseline(path: str) -> dict:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def _print_v2_comparison(b_res: List[Result], baseline: dict):
+     n = len(b_res)
+     v3 = {
+         "exact_match": _avg([r.exact for r in b_res]),
+         "cer": _avg([r.cer_score for r in b_res]),
+         "wer": _avg([r.wer_score for r in b_res]),
+         "bleu": _avg([r.bleu_score for r in b_res]),
+         "token_acc": _avg([r.token_acc for r in b_res]),
+     }
+     v2_label = baseline.get("system", "v2 baseline")
+
+     print(f"\n{'='*74}")
+     print(f" SinCode v2 vs SinCode v3 — Head-to-Head (n={n})")
+     print(f" v2: {v2_label}")
+     print(" v3: ByT5-small seq2seq + XLM-RoBERTa MLM reranking")
+     print(f"{'='*74}")
+     print(f" {'Metric':<22} {'v2 (baseline)':>14} {'v3 (ours)':>10} {'Δ (v3−v2)':>12}")
+     print(f" {'-'*62}")
+
+     metrics = [
+         ("Exact Match (↑)", "exact_match", False),
+         ("CER (↓ better)", "cer", True),
+         ("WER (↓ better)", "wer", True),
+         ("BLEU (↑ better)", "bleu", False),
+         ("Token Acc (↑)", "token_acc", False),
+     ]
+     for label, key, lower_is_better in metrics:
+         v2v = baseline.get(key, 0.0)
+         v3v = v3[key]
+         delta = v3v - v2v
+         improved = (delta < 0) if lower_is_better else (delta > 0)
+         print(
+             f" {label:<22} {v2v:>14.4f} {v3v:>10.4f} "
+             f" {'✓' if improved else '✗'} {delta:>+9.4f}"
+         )
+
+     if baseline.get("notes"):
+         print(f"\n Note: {baseline['notes']}")
+
+
+ def _save_csv(results_by_system: Dict[str, List[Result]], out_path: str):
+     rows = []
+     for system, results in results_by_system.items():
+         for r in results:
+             rows.append({
+                 "system": system,
+                 "id": r.test_case.id,
+                 "domain": r.test_case.domain,
+                 "has_code_mix": int(r.test_case.has_code_mix),
+                 "has_ambiguity": int(r.test_case.has_ambiguity),
+                 "input": r.test_case.input,
+                 "reference": r.test_case.reference,
+                 "prediction": r.prediction,
+                 "cer": f"{r.cer_score:.4f}",
+                 "wer": f"{r.wer_score:.4f}",
+                 "token_acc": f"{r.token_acc:.4f}",
+                 "bleu": f"{r.bleu_score:.4f}",
+                 "exact_match": f"{r.exact:.0f}",
+             })
+     if not rows:
+         print("\n No results to save.")
+         return
+     with open(out_path, "w", encoding="utf-8", newline="") as f:
+         w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
+         w.writeheader()
+         w.writerows(rows)
+     print(f"\n Results saved -> {out_path}")
+
+
+ # ── Main ──────────────────────────────────────────────────────────────────────
+
+ def main():
+     parser = argparse.ArgumentParser(description="SinCode v3 evaluation")
+     parser.add_argument("--dataset", required=True,
+                         help="Path to evaluation CSV (dataset_110.csv or dataset_40.csv)")
+     parser.add_argument("--mode", default="system",
+                         choices=["system", "ablation"],
+                         help="Evaluation mode (default: system)")
+     parser.add_argument("--out", default=None,
+                         help="Optional path to save results CSV")
+     parser.add_argument("--baseline", default=None,
+                         help="Path to v2 baseline JSON (e.g. misc/v2_baseline.json) for head-to-head comparison")
+     args = parser.parse_args()
+
+     print(f"\nLoading dataset: {args.dataset}")
+     test_cases = load_dataset(args.dataset)
+     print(f"  {len(test_cases)} test cases loaded.")
+
+     results_by_system: Dict[str, List[Result]] = {}
+     a_results: List[Result] = []
+     b_results: List[Result] = []
+
+     decoder = _load_v3_decoder()
+
+     if args.mode == "ablation":
+         print("\nRunning (A) ByT5 top-1 only...")
+         a_results = [_score(tc, _byt5_top1_predict(decoder, tc.input), "byt5_top1") for tc in test_cases]
+         results_by_system["byt5_top1"] = a_results
+
+         print("\nRunning (B) ByT5 + MLM reranking...")
+         b_results = [_score(tc, decoder.decode(tc.input)[0], "byt5_mlm") for tc in test_cases]
+         results_by_system["byt5_mlm"] = b_results
+     else:
+         # system mode: run the full pipeline only
+         print("\nRunning v3 pipeline (ByT5 + MLM reranking)...")
+         b_results = [_score(tc, decoder.decode(tc.input)[0], "byt5_mlm") for tc in test_cases]
+         results_by_system["byt5_mlm"] = b_results
+
+     if args.mode == "system":
+         _print_table("v3 Code-Mixed Pipeline (ByT5 + XLM-RoBERTa MLM)", b_results)
+     elif args.mode == "ablation":
+         _print_table("(A) ByT5 top-1 only", a_results)
+         _print_table("(B) ByT5 + MLM reranking", b_results)
+         _print_ablation(a_results, b_results)
+
+     if args.baseline:
+         baseline = _load_baseline(args.baseline)
+         _print_v2_comparison(b_results, baseline)
+
+     if args.out:
+         _save_csv(results_by_system, args.out)
+
+
+ if __name__ == "__main__":
+     main()
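The metric helpers are plain functions and can be sanity-checked in isolation. Toy values worked by hand; `misc.evaluate` resolves as a namespace package when run from the repo root:

```python
from misc.evaluate import cer, wer, token_accuracy, exact_match

pred = "ඒක හොඳ වගේ"            # prediction missing the final word
ref = "ඒක හොඳ වගේ තියෙනවා"

print(round(cer(pred, ref), 3))   # 8 missing chars / 18 ref chars ≈ 0.444
print(wer(pred, ref))             # 1 missing word / 4 ref words = 0.25
print(token_accuracy(pred, ref))  # 3 of 4 reference tokens match → 0.75
print(exact_match(pred, ref))     # 0.0
```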
misc/upload_mlm_to_hf.py ADDED
@@ -0,0 +1,50 @@
+ r"""
+ Upload the fine-tuned XLM-RoBERTa MLM model to HuggingFace Hub.
+ Run from: C:\Y5_Docs\FYP\SinCode\SinCode_v3
+ Usage: python misc/upload_mlm_to_hf.py --token YOUR_HF_WRITE_TOKEN
+ """
+
+ import argparse
+ from pathlib import Path
+
+ from huggingface_hub import HfApi
+
+ MODEL_LOCAL_PATH = Path(
+     r"C:\Y5_Docs\FYP\SinCode\SinCode_v2-20260315T161648Z-1-001"
+     r"\SinCode_v2\SinCode\SinCode\xlm-roberta-sinhala-v5-strict-full\final"
+ )
+ REPO_ID = "Kalana001/xlm-roberta-base-finetuned-sinhala"
+
+ FILES_TO_UPLOAD = [
+     "config.json",
+     "model.safetensors",
+     "tokenizer.json",
+     "tokenizer_config.json",
+ ]
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--token", required=True, help="HuggingFace write-access token")
+     args = parser.parse_args()
+
+     api = HfApi(token=args.token)
+
+     print(f"Uploading to: {REPO_ID}")
+     for filename in FILES_TO_UPLOAD:
+         local_file = MODEL_LOCAL_PATH / filename
+         if not local_file.exists():
+             print(f"  SKIP (not found): {filename}")
+             continue
+         size_mb = round(local_file.stat().st_size / 1024 / 1024, 1)
+         print(f"  Uploading {filename} ({size_mb} MB)...")
+         api.upload_file(
+             path_or_fileobj=str(local_file),
+             path_in_repo=filename,
+             repo_id=REPO_ID,
+             repo_type="model",
+         )
+         print(f"  Done: {filename}")
+
+     print(f"\nAll files uploaded to https://huggingface.co/{REPO_ID}")
+
+
+ if __name__ == "__main__":
+     main()
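As a side note, the per-file loop above could be collapsed into a single call with `huggingface_hub`'s `upload_folder`; a minimal sketch, where the `allow_patterns` globs are an illustrative stand-in for the explicit FILES_TO_UPLOAD list:

    from huggingface_hub import HfApi

    REPO_ID = "Kalana001/xlm-roberta-base-finetuned-sinhala"
    MODEL_LOCAL_PATH = "path/to/xlm-roberta-sinhala-v5-strict-full/final"  # as above

    api = HfApi(token="YOUR_HF_WRITE_TOKEN")
    api.create_repo(REPO_ID, repo_type="model", exist_ok=True)  # no-op if it exists
    api.upload_folder(
        folder_path=MODEL_LOCAL_PATH,
        repo_id=REPO_ID,
        repo_type="model",
        allow_patterns=["config.json", "*.safetensors", "tokenizer*"],
    )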
misc/v2_baseline.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "system": "SinCode v2 (rule-based + dictionary)",
+   "samples": 110,
+   "exact_match": 0.8364,
+   "cer": 0.0122,
+   "wer": 0.0407,
+   "bleu": 0.8861,
+   "token_acc": 0.9593,
+   "notes": "Measured on dataset_110.csv test split. Avg time per sentence: 0.03s (3.34s total)."
+ }
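The evaluation script's `--baseline` flag points at this file; `_load_baseline` is defined earlier in that script and not visible in this hunk, but it presumably amounts to a plain JSON load along these lines:

    import json

    def _load_baseline(path: str) -> dict:
        # All metric fields ("cer", "wer", "bleu", ...) are already floats.
        with open(path, encoding="utf-8") as f:
            return json.load(f)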
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers>=4.40.0
+ torch>=2.2.0
+ sentencepiece
+ datasets
+ streamlit
+ pandas
seq2seq/__init__.py ADDED
@@ -0,0 +1 @@
+ # seq2seq — ByT5 and mBart50 inference wrappers
seq2seq/compose_fix_map.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "ක්\\sර": "ක්‍ර",
+   "ක්\\sරි": "ක්‍රි",
+   "ක්\\sරී": "ක්‍රී",
+   "ක්\\sරා": "ක්‍රා",
+   "ද්\\sර": "ද්‍ර",
+   "ද්\\sරි": "ද්‍රි",
+   "ද්\\sරී": "ද්‍රී",
+   "ද්\\sරා": "ද්‍රා",
+   "ට්\\sර": "ට්‍ර",
+   "ට්\\sරි": "ට්‍රි",
+   "ට්\\sරී": "ට්‍රී",
+   "ට්\\sරා": "ට්‍රා",
+   "ත්\\sර": "ත්‍ර",
+   "ත්\\sරි": "ත්‍රි",
+   "ත්\\sරී": "ත්‍රී",
+   "ත්\\sරා": "ත්‍රා",
+   "ප්\\sර": "ප්‍ර",
+   "ප්\\sරි": "ප්‍රි",
+   "ප්\\sරී": "ප්‍රී",
+   "ප්\\sරා": "ප්‍රා",
+   "බ්\\sර": "බ්‍ර",
+   "බ්\\sරි": "බ්‍රි",
+   "බ්\\sරී": "බ්‍රී",
+   "බ්\\sරා": "බ්‍රා",
+   "ග්\\sර": "ග්‍ර",
+   "ග්\\sරි": "ග්‍රි",
+   "ග්\\sරී": "ග්‍රී",
+   "ග්\\sරා": "ග්‍රා",
+   "ෂ්\\sර": "ෂ්‍ර",
+   "ෂ්\\sරි": "ෂ්‍රි",
+   "ෂ්\\sරී": "ෂ්‍රී",
+   "ෂ්\\sරා": "ෂ්‍රා",
+   "ශ්\\sර": "ශ්‍ර",
+   "ශ්\\sරි": "ශ්‍රි",
+   "ශ්\\sරී": "ශ්‍රී",
+   "ශ්\\sරා": "ශ්‍රා",
+   "ව්\\sය": "ව්‍ය",
+   "ව්\\sයා": "ව්‍යා",
+   "ද්\\sය": "ද්‍ය",
+   "ද්\\sයා": "ද්‍යා",
+   "න්\\sය": "න්‍ය",
+   "ධ්\\sය": "ධ්‍ය",
+   "ධ්\\sයා": "ධ්‍යා",
+   "ද්\\sයු": "ද්‍යු"
+ }
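The keys here are regex patterns, not literals — JSON's `\\s` decodes to the regex whitespace class `\s` — so each entry re-joins a conjunct the model emitted with a stray space. A small self-contained demo of how seq2seq/mbart_infer.py's `_apply_fixes` consumes this file:

    import json
    import re
    from pathlib import Path

    fix_map = json.loads(Path("seq2seq/compose_fix_map.json").read_text(encoding="utf-8"))

    text = "ක් රිකට්"  # rakaransaya conjunct broken by a stray space
    for pattern, replacement in fix_map.items():
        text = re.sub(pattern, replacement, text)
    print(text)  # ක්‍රිකට් — joined with U+200D (zero-width joiner)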
seq2seq/finetune_corrections.py ADDED
@@ -0,0 +1,270 @@
+ """
+ seq2seq/finetune_corrections.py
+
+ Targeted correction fine-tune for the already-trained ByT5 model.
+
+ Problem:  ByT5 struggles with short/ambiguous tokens like "na"→නෑ, "ba"→බෑ,
+           extreme abbreviations like "mn"→මං, and colloquial negations.
+
+ Solution: Inject high-confidence correction pairs (from core/mappings.py),
+           heavily repeated, mixed with a random sample of the original
+           training data to prevent catastrophic forgetting.
+
+ The output is saved to byt5-singlish-sinhala/final/ (overwrites in place).
+ Run from the project root:
+     python seq2seq/finetune_corrections.py
+ """
+
+ from __future__ import annotations
+
+ import random
+ import sys
+ from pathlib import Path
+
+ ROOT = Path(__file__).parent.parent
+ if str(ROOT) not in sys.path:
+     sys.path.insert(0, str(ROOT))
+
+ import torch
+ from datasets import Dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+     default_data_collator,
+ )
+
+ # ── Config ────────────────────────────────────────────────────────────────────
+
+ MODEL_PATH = ROOT / "seq2seq" / "byt5-singlish-sinhala" / "final"
+ DATA_PATH = ROOT / "seq2seq" / "wsd_pairs.csv"
+ OUTPUT_DIR = ROOT / "seq2seq" / "byt5-singlish-sinhala" / "final"  # overwrite in place
+
+ REPEAT = 500         # how many times each correction pair is repeated
+ BG_SAMPLES = 50_000  # random background pairs from wsd_pairs.csv to prevent forgetting
+ MAX_INPUT_LEN = 64
+ MAX_TARGET_LEN = 64
+ BATCH_SIZE = 32
+ LR = 5e-5            # low LR — gentle correction, not retraining
+ EPOCHS = 1
+ SEED = 42
+
+ # ── Correction pairs (sourced from core/mappings.py) ─────────────────────────
+ # Only include pairs where ByT5 is known to be unreliable.
+ # English-safe tokens (pr, dm, ai…) are excluded — they never reach ByT5.
+
+ CORRECTIONS = [
+     # negation — most critical
+     ("na", "නෑ"),
+     ("naa", "නෑ"),
+     ("ba", "බෑ"),
+     ("bari", "බැරි"),
+     ("bri", "බැරි"),
+     ("nathi", "නැති"),
+     ("nati", "නැති"),
+     ("naththe", "නැත්තෙ"),
+     ("epa", "එපා"),
+     ("ep", "එපා"),
+     # pronouns / first person
+     ("mn", "මං"),
+     ("mama", "මම"),
+     ("mage", "මගේ"),
+     ("mge", "මගේ"),
+     ("oya", "ඔයා"),
+     ("oyaa", "ඔයා"),
+     ("api", "අපි"),
+     ("mata", "මට"),
+     ("mta", "මට"),
+     ("oyata", "ඔයාට"),
+     ("oyta", "ඔයාට"),
+     ("oyage", "ඔයාගේ"),
+     ("oyge", "ඔයාගෙ"),
+     ("ape", "අපේ"),
+     # common particles
+     ("one", "ඕනෙ"),
+     ("oney", "ඕනේ"),
+     ("on", "ඕනෙ"),
+     ("oni", "ඕනි"),
+     ("hari", "හරි"),
+     ("hri", "හරි"),
+     ("wage", "වගේ"),
+     ("nisa", "නිසා"),
+     ("dan", "දැන්"),
+     ("gena", "ගැන"),
+     # time
+     ("heta", "හෙට"),
+     ("hta", "හෙට"),
+     ("ada", "අද"),
+     ("iye", "ඊයේ"),
+     ("kalin", "කලින්"),
+     ("passe", "පස්සෙ"),
+     # abbreviations
+     # ("mn", "මං") removed here — already listed once under pronouns above
+     ("ek", "එක"),
+     ("ekta", "එකට"),
+     ("eke", "එකේ"),
+     ("me", "මේ"),
+     # common words
+     ("honda", "හොඳ"),
+     ("hodai", "හොඳයි"),
+     ("gedara", "ගෙදර"),
+     ("wada", "වැඩ"),
+     ("kema", "කෑම"),
+     ("kama", "කෑම"),
+     ("inne", "ඉන්නෙ"),
+     ("inna", "ඉන්න"),
+     ("madi", "මදි"),
+     ("iwara", "ඉවර"),
+     ("iwra", "ඉවර"),
+     # verbal
+     ("awa", "ආවා"),
+     ("aawa", "ආවා"),
+     ("giya", "ගියා"),
+     ("una", "උනා"),
+     ("wuna", "උනා"),
+     ("kiwa", "කිව්වා"),
+     ("kiwwa", "කිව්වා"),
+     ("yewwa", "යැව්වා"),
+     ("yawwa", "යැව්වා"),
+     ("damma", "දැම්මා"),
+     ("karanna", "කරන්න"),
+     ("krnna", "කරන්න"),
+     ("balanna", "බලන්න"),
+     ("blnna", "බලන්න"),
+     ("hadanna", "හදන්න"),
+     ("karamu", "කරමු"),
+     ("balamu", "බලමු"),
+     ("yamu", "යමු"),
+     ("hadamu", "හදමු"),
+     ("damu", "දාමු"),
+     ("wenawa", "වෙනවා"),
+     ("wenwa", "වෙනවා"),
+     ("thiyanawa", "තියෙනවා"),
+     ("enawa", "එනවා"),
+     ("yanawa", "යනවා"),
+ ]
+
+
+ # ── Dataset builder ───────────────────────────────────────────────────────────
+
+ def build_dataset(tokenizer) -> Dataset:
+     import csv
+
+     pairs: list[dict] = []
+
+     # 1. Correction pairs repeated REPEAT times
+     for romanized, sinhala in CORRECTIONS:
+         for _ in range(REPEAT):
+             pairs.append({"romanized": romanized, "sinhala": sinhala})
+
+     correction_count = len(pairs)
+     print(f"  Correction pairs: {len(CORRECTIONS)} × {REPEAT} = {correction_count:,}")
+
+     # 2. Background sample from original training data
+     bg: list[dict] = []
+     with open(DATA_PATH, encoding="utf-8", newline="") as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             r = (row.get("romanized") or "").strip()
+             s = (row.get("sinhala") or "").strip()
+             if r and s:
+                 bg.append({"romanized": r, "sinhala": s})
+
+     random.seed(SEED)
+     random.shuffle(bg)
+     bg = bg[:BG_SAMPLES]
+     pairs.extend(bg)
+     print(f"  Background pairs: {len(bg):,}")
+     print(f"  Total dataset   : {len(pairs):,}")
+
+     random.shuffle(pairs)
+
+     ds = Dataset.from_list(pairs)
+
+     def tokenize(batch):
+         inputs = tokenizer(
+             batch["romanized"],
+             max_length=MAX_INPUT_LEN,
+             truncation=True,
+             padding="max_length",
+         )
+         targets = tokenizer(
+             batch["sinhala"],
+             max_length=MAX_TARGET_LEN,
+             truncation=True,
+             padding="max_length",
+         )
+         inputs["labels"] = [
+             [(t if t != tokenizer.pad_token_id else -100) for t in ids]
+             for ids in targets["input_ids"]
+         ]
+         return inputs
+
+     ds = ds.map(tokenize, batched=True, batch_size=5_000,
+                 remove_columns=["romanized", "sinhala"], desc="Tokenizing")
+     ds.set_format("torch")
+     return ds
+
+
+ # ── Main ──────────────────────────────────────────────────────────────────────
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"\nDevice : {device}")
+     if device == "cpu":
+         print("WARNING: running on CPU — this will take ~30-60 min.")
+
+     print(f"Loading model from {MODEL_PATH} ...")
+     tokenizer = AutoTokenizer.from_pretrained(str(MODEL_PATH))
+     model = AutoModelForSeq2SeqLM.from_pretrained(str(MODEL_PATH))
+
+     print("\nBuilding correction dataset ...")
+     ds = build_dataset(tokenizer)
+
+     split = ds.train_test_split(test_size=0.02, seed=SEED)
+     train_ds = split["train"]
+     eval_ds = split["test"]
+     print(f"  train={len(train_ds):,}  eval={len(eval_ds):,}")
+
+     warmup = max(100, len(train_ds) // (BATCH_SIZE * 20))
+
+     args = Seq2SeqTrainingArguments(
+         output_dir=str(OUTPUT_DIR),
+         num_train_epochs=EPOCHS,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         learning_rate=LR,
+         warmup_steps=warmup,
+         weight_decay=0.01,
+         predict_with_generate=False,  # faster eval — we only care about loss
+         eval_strategy="epoch",
+         save_strategy="epoch",
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         logging_steps=100,
+         dataloader_num_workers=0,
+         seed=SEED,
+         bf16=torch.cuda.is_bf16_supported(),
+         fp16=not torch.cuda.is_bf16_supported() and torch.cuda.is_available(),
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=args,
+         train_dataset=train_ds,
+         eval_dataset=eval_ds,
+         data_collator=default_data_collator,
+     )
+
+     print("\nStarting correction fine-tune ...")
+     trainer.train()
+
+     print(f"\nSaving corrected model to {OUTPUT_DIR} ...")
+     model.save_pretrained(str(OUTPUT_DIR))
+     tokenizer.save_pretrained(str(OUTPUT_DIR))
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
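Once this fine-tune finishes, a quick sanity check over the injected pairs is cheap; a minimal sketch using this commit's own seq2seq/infer.py wrapper:

    from seq2seq.infer import Transliterator

    t = Transliterator()  # loads byt5-singlish-sinhala/final by default
    for roman, expected in [("na", "නෑ"), ("mn", "මං"), ("ba", "බෑ")]:
        top = t.candidates(roman, k=1)[0]
        print(f"{'OK  ' if top == expected else 'FAIL'} {roman!r} -> {top} (want {expected})")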
seq2seq/infer.py ADDED
@@ -0,0 +1,83 @@
+ """
+ Inference helper — given a romanized word, return top-K Sinhala candidates
+ using beam search on the fine-tuned ByT5 model.
+
+ Usage:
+     from seq2seq.infer import Transliterator
+     t = Transliterator()
+     print(t.candidates("videowe", k=5))
+     # ['වීඩියොවේ', 'වීඩියොවී', 'වීඩියොව', ...]
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+ DEFAULT_MODEL_PATH = Path(__file__).parent / "byt5-singlish-sinhala" / "final"
+
+
+ class Transliterator:
+     def __init__(self, model_path: str | Path = DEFAULT_MODEL_PATH, device: Optional[str] = None):
+         # Keep as string — Path() would convert '/' to '\' on Windows, breaking HF Hub IDs
+         model_path = str(model_path)
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+         self.tokenizer = ByT5Tokenizer.from_pretrained(model_path)
+         self.model = T5ForConditionalGeneration.from_pretrained(model_path)
+         self.model.to(self.device)
+         self.model.eval()
+
+     def candidates(self, word: str, k: int = 5) -> list[str]:
+         """Return top-k Sinhala transliteration candidates for a single word."""
+         return self.batch_candidates([word], k=k)[0]
+
+     def batch_candidates(self, words: list[str], k: int = 5) -> list[list[str]]:
+         """
+         Return top-k Sinhala candidates for each word in a single forward pass.
+         Much faster than calling candidates() per word on a long sentence.
+         """
+         lowered = [w.lower() for w in words]
+         inputs = self.tokenizer(
+             lowered,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=64,
+         ).to(self.device)
+
+         n = len(words)
+         with torch.no_grad():
+             outputs = self.model.generate(
+                 **inputs,
+                 num_beams=max(k, 5),
+                 num_return_sequences=k,
+                 max_new_tokens=64,
+                 early_stopping=True,
+             )
+
+         # outputs shape: (n * k, seq_len) — k sequences per input, grouped
+         results: list[list[str]] = []
+         for i in range(n):
+             seen: set[str] = set()
+             cands: list[str] = []
+             for seq in outputs[i * k : (i + 1) * k]:
+                 text = self.tokenizer.decode(seq, skip_special_tokens=True).strip()
+                 if text and text not in seen:
+                     seen.add(text)
+                     cands.append(text)
+             results.append(cands)
+
+         return results
+
+
+ if __name__ == "__main__":
+     import sys
+
+     words = sys.argv[1:] if len(sys.argv) > 1 else ["wadi"]
+     t = Transliterator()
+     for word in words:
+         print(f"Candidates for '{word}':")
+         for c in t.candidates(word):
+             print(f"  {c}")
+         print()
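If the MLM reranker ever needs ByT5's own beam confidences next to the candidate strings, `generate()` can return them. A hedged sketch — not part of this commit — assuming `t` is a loaded `Transliterator`:

    import torch

    enc = t.tokenizer(["wadi"], return_tensors="pt").to(t.device)
    with torch.no_grad():
        out = t.model.generate(
            **enc,
            num_beams=5,
            num_return_sequences=5,
            max_new_tokens=64,
            return_dict_in_generate=True,
            output_scores=True,  # populates out.sequences_scores under beam search
        )
    for seq, score in zip(out.sequences, out.sequences_scores):
        # sequences_scores are final beam scores (log-probs); higher = more confident
        print(f"{score.item():7.3f}  {t.tokenizer.decode(seq, skip_special_tokens=True)}")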
seq2seq/mbart_infer.py ADDED
@@ -0,0 +1,126 @@
+ """
+ mBart50-based Sentence Transliterator for SinCode v3.
+
+ Full-sentence Singlish → Sinhala transliteration.
+ Unlike the ByT5 word-by-word pipeline, mBart50 operates on the whole input
+ sentence and produces fully Sinhalized output — no English words are retained.
+
+ Use-case: "mn heta business ekak start karanawa"
+           → "මන් හෙට ව්‍යාපාරයක් පටන් ගන්නවා"
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ from transformers import MBart50Tokenizer, MBartForConditionalGeneration
+
+ from core.constants import DEFAULT_MBART_MODEL
+
+ logger = logging.getLogger(__name__)
+
+ # ── Fix-map (ZWJ / Virama composition) ───────────────────────────────────────
+
+ _FIX_MAP_PATH = Path(__file__).parent / "compose_fix_map.json"
+
+ _fix_map_cache: dict[str, str] | None = None
+
+
+ def _load_fix_map() -> dict[str, str]:
+     global _fix_map_cache
+     if _fix_map_cache is None:
+         with open(_FIX_MAP_PATH, "r", encoding="utf-8") as f:
+             _fix_map_cache = json.load(f)
+     return _fix_map_cache
+
+
+ # ── Input cleaning ────────────────────────────────────────────────────────────
+
+ # Scripts that are not Sinhala, Latin, numbers, or symbols — filtered out
+ _UNSUPPORTED_SCRIPT = re.compile(
+     r"[\u0B80-\u0BFF"  # Tamil
+     r"\u0900-\u097F"   # Devanagari
+     r"\u4E00-\u9FFF"   # CJK Unified Ideographs
+     r"\u3040-\u309F"   # Hiragana
+     r"\u30A0-\u30FF"   # Katakana
+     r"\u0E00-\u0E7F"   # Thai
+     r"\u0600-\u06FF"   # Arabic
+     r"\u0590-\u05FF"   # Hebrew
+     r"\uAC00-\uD7AF]"  # Hangul
+ )
+
+
+ def _clean(text: str) -> str | None:
+     """Remove words in unsupported scripts; return None if nothing remains."""
+     words = text.strip().split()
+     filtered = [w for w in words if not _UNSUPPORTED_SCRIPT.search(w)]
+     return " ".join(filtered) if filtered else None
+
+
+ def _apply_fixes(text: str) -> str:
+     """Apply ZWJ/virama composition fixes to mBart50 output."""
+     for pattern, replacement in _load_fix_map().items():
+         text = re.sub(pattern, replacement, text)
+     return text
+
+
+ # ── Transliterator ────────────────────────────────────────────────────────────
+
+ class SentenceTransliterator:
+     """
+     Full-sentence Singlish → Sinhala transliterator (mBart50).
+
+     Loads from Hugging Face Hub on first instantiation.
+     Thread-safe for inference (no mutable state after __init__).
+     """
+
+     def __init__(
+         self,
+         model_name: str = DEFAULT_MBART_MODEL,
+         device: Optional[str] = None,
+     ):
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+         logger.info("Loading mBart50 transliterator: %s", model_name)
+         self.tokenizer = MBart50Tokenizer.from_pretrained(model_name)
+         # Set once here, not per call — keeps the tokenizer immutable after
+         # __init__, as the class docstring promises.
+         self.tokenizer.src_lang = "si_LK"
+         self.model = MBartForConditionalGeneration.from_pretrained(model_name)
+         self.model.to(self.device)
+         self.model.eval()
+
+     def transliterate(self, text: str) -> str:
+         """
+         Transliterate a Singlish sentence to fully-Sinhalized output.
+
+         Args:
+             text: Input Singlish sentence (Romanized Sinhala / English mix).
+
+         Returns:
+             Sinhala-script output. Returns original text if input is empty
+             or consists entirely of unsupported-script characters.
+         """
+         cleaned = _clean(text)
+         if not cleaned:
+             return text
+
+         inputs = self.tokenizer(
+             cleaned,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=128,
+         ).to(self.device)
+
+         with torch.no_grad():
+             tokens = self.model.generate(
+                 **inputs,
+                 forced_bos_token_id=self.tokenizer.lang_code_to_id["si_LK"],
+             )
+
+         output = self.tokenizer.decode(tokens[0], skip_special_tokens=True)
+         return _apply_fixes(output)
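Usage matches the module docstring; a minimal sketch:

    from seq2seq.mbart_infer import SentenceTransliterator

    st = SentenceTransliterator()  # pulls DEFAULT_MBART_MODEL from the Hub on first use
    print(st.transliterate("mn heta business ekak start karanawa"))
    # fully Sinhalized output, conjuncts repaired by _apply_fixes()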
seq2seq/prepare_data.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Parse WSD.txt into a CSV training dataset for ByT5 fine-tuning.
+
+ Input format (WSD.txt):
+     Word: <romanized>, Sinhala Words: ['<s1>', '<s2>', ...]
+
+ Output (wsd_pairs.csv):
+     romanized,sinhala
+     wadi,වැඩි
+     wadi,වාඩි
+     ...
+
+ One row per (romanized, sinhala) pair. Duplicate sinhala entries per
+ word are kept since ByT5 learns from all valid transliterations.
+ """
+
+ import ast
+ import csv
+ import re
+ import sys
+ from pathlib import Path
+
+ WSD_PATH = Path(r"C:\Y5_Docs\FYP\WSD.txt")
+ OUT_PATH = Path(__file__).parent / "wsd_pairs.csv"
+
+ LINE_RE = re.compile(r"^Word:\s*(.+?),\s*Sinhala Words:\s*(\[.+\])\s*$")
+
+ MIN_ROMAN_LEN = 2   # skip single-char romanized entries
+ MAX_ROMAN_LEN = 40  # skip obviously malformed long entries
+
+
+ def parse_wsd(wsd_path: Path) -> list[tuple[str, str]]:
+     pairs: list[tuple[str, str]] = []
+     skipped = 0
+
+     with wsd_path.open(encoding="utf-8") as f:
+         for lineno, line in enumerate(f, 1):
+             line = line.strip()
+             if not line:
+                 continue
+
+             m = LINE_RE.match(line)
+             if not m:
+                 skipped += 1
+                 continue
+
+             roman = m.group(1).strip().lower()
+             if not (MIN_ROMAN_LEN <= len(roman) <= MAX_ROMAN_LEN):
+                 skipped += 1
+                 continue
+
+             try:
+                 sinhala_list = ast.literal_eval(m.group(2))
+             except (ValueError, SyntaxError):
+                 skipped += 1
+                 continue
+
+             for sinhala in sinhala_list:
+                 sinhala = sinhala.strip()
+                 if sinhala:
+                     pairs.append((roman, sinhala))
+
+             if lineno % 100_000 == 0:
+                 print(f"  processed {lineno:,} lines, {len(pairs):,} pairs so far…")
+
+     print(f"  skipped {skipped:,} malformed lines")
+     return pairs
+
+
+ def write_csv(pairs: list[tuple[str, str]], out_path: Path) -> None:
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     with out_path.open("w", encoding="utf-8", newline="") as f:
+         writer = csv.writer(f)
+         writer.writerow(["romanized", "sinhala"])
+         writer.writerows(pairs)
+
+
+ def main() -> None:
+     print(f"Parsing {WSD_PATH} …")
+     pairs = parse_wsd(WSD_PATH)
+     print(f"\nTotal pairs: {len(pairs):,}")
+
+     print(f"Writing to {OUT_PATH} …")
+     write_csv(pairs, OUT_PATH)
+     print("Done.")
+
+     # Quick sanity check
+     print("\nSample rows:")
+     for roman, sinhala in pairs[:5]:
+         print(f"  {roman!r:20s} → {sinhala}")
+
+
+ if __name__ == "__main__":
+     main()
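For reference, one WSD.txt line worked through the LINE_RE + `ast.literal_eval` pipeline (the input line is a made-up example in the documented format):

    import ast
    import re

    LINE_RE = re.compile(r"^Word:\s*(.+?),\s*Sinhala Words:\s*(\[.+\])\s*$")

    line = "Word: wadi, Sinhala Words: ['වැඩි', 'වාඩි']"
    m = LINE_RE.match(line)
    roman = m.group(1).strip().lower()           # 'wadi'
    sinhala_list = ast.literal_eval(m.group(2))  # safe parse of the Python-style list
    print([(roman, s) for s in sinhala_list])    # [('wadi', 'වැඩි'), ('wadi', 'වාඩි')]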
seq2seq/train.py ADDED
@@ -0,0 +1,185 @@
+ """
+ Fine-tune google/byt5-small on Singlish → Sinhala word-level transliteration.
+
+ Input:  wsd_pairs.csv (romanized, sinhala)
+ Output: byt5-singlish-sinhala/ (HuggingFace model directory)
+
+ Training approach:
+ - Input  : romanized word (e.g. "wadi")
+ - Target : sinhala word (e.g. "වැඩි")
+ - Model  : ByT5-small (byte-level T5, no vocab issues with any script)
+ - Beam=5 at inference → top-5 candidates for MLM reranking
+
+ Tokenized dataset is saved to disk after the first run — restarts skip
+ straight to training without re-tokenizing.
+ """
+
+ from pathlib import Path
+
+ import torch
+ from datasets import Dataset, load_from_disk
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+     default_data_collator,
+ )
+
+ # ── Config ─────────────────────────────────────────────────────────────────
+
+ BASE_MODEL = "google/byt5-small"
+ DATA_PATH = Path(__file__).parent / "wsd_pairs.csv"
+ CACHE_DIR = Path(__file__).parent / "tokenized_cache"
+ OUTPUT_DIR = Path(__file__).parent / "byt5-singlish-sinhala"
+
+ MAX_SAMPLES = 1_000_000  # 1M pairs — more than enough for word transliteration
+ TRAIN_SPLIT = 0.97
+ MAX_INPUT_LEN = 64
+ MAX_TARGET_LEN = 64
+ BATCH_SIZE = 64          # 16GB VRAM — ByT5-small with seq_len=64
+ EPOCHS = 2
+ LR = 5e-4
+ SEED = 42
+
+
+ # ── Tokenize ────────────────────────────────────────────────────────────────
+
+ def tokenize_fn(batch, tokenizer):
+     # Pad to fixed max_length so all tensors have the same shape.
+     # This lets set_format("torch") work and default_data_collator just stacks.
+     model_inputs = tokenizer(
+         batch["romanized"],
+         max_length=MAX_INPUT_LEN,
+         truncation=True,
+         padding="max_length",
+     )
+     labels = tokenizer(
+         batch["sinhala"],
+         max_length=MAX_TARGET_LEN,
+         truncation=True,
+         padding="max_length",
+     )
+     # Replace pad token with -100 so it's ignored in cross-entropy loss
+     model_inputs["labels"] = [
+         [(t if t != tokenizer.pad_token_id else -100) for t in ids]
+         for ids in labels["input_ids"]
+     ]
+     return model_inputs
+
+
+ # ── Main ───────────────────────────────────────────────────────────────────
+
+ def main():
+     import os
+     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f" Device : {device}")
+     if device == "cuda":
+         print(f" GPU    : {torch.cuda.get_device_name(0)}")
+         print(f" VRAM   : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+     else:
+         print(" WARNING: No GPU detected — training will be very slow!")
+
+     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+     model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
+
+     train_cache = CACHE_DIR / "train"
+     eval_cache = CACHE_DIR / "eval"
+
+     if train_cache.exists() and eval_cache.exists():
+         print("Loading pre-tokenized dataset from disk cache …")
+         train_ds = load_from_disk(str(train_cache))
+         eval_ds = load_from_disk(str(eval_cache))
+         print(f"  train={len(train_ds):,}  eval={len(eval_ds):,}")
+     else:
+         print(f"Loading data from {DATA_PATH} …")
+         ds = Dataset.from_csv(str(DATA_PATH))
+         ds = ds.filter(lambda x: bool(x["romanized"]) and bool(x["sinhala"]))
+         print(f"  {len(ds):,} pairs — sampling {MAX_SAMPLES:,} …")
+
+         # Shuffle and take MAX_SAMPLES
+         ds = ds.shuffle(seed=SEED).select(range(min(MAX_SAMPLES, len(ds))))
+
+         split = ds.train_test_split(test_size=1 - TRAIN_SPLIT, seed=SEED)
+         train_raw = split["train"]
+         eval_raw = split["test"]
+         print(f"  train={len(train_raw):,}  eval={len(eval_raw):,}")
+
+         print("Tokenizing and saving to disk (one-time, ~5 min) …")
+         train_ds = train_raw.map(
+             lambda b: tokenize_fn(b, tokenizer),
+             batched=True,
+             batch_size=10_000,
+             num_proc=8,
+             keep_in_memory=True,
+             remove_columns=["romanized", "sinhala"],
+             desc="Tokenizing train",
+         )
+         eval_ds = eval_raw.map(
+             lambda b: tokenize_fn(b, tokenizer),
+             batched=True,
+             batch_size=10_000,
+             num_proc=8,
+             keep_in_memory=True,
+             remove_columns=["romanized", "sinhala"],
+             desc="Tokenizing eval",
+         )
+
+         CACHE_DIR.mkdir(parents=True, exist_ok=True)
+         train_ds.save_to_disk(str(train_cache))
+         eval_ds.save_to_disk(str(eval_cache))
+         print("  Saved to disk. Future runs will load instantly.")
+
+     train_ds.set_format("torch")
+     eval_ds.set_format("torch")
+
+     # All sequences are pre-padded to fixed length — just stack them
+     collator = default_data_collator
+     warmup_steps = int(0.05 * (len(train_ds) // BATCH_SIZE))
+
+     args = Seq2SeqTrainingArguments(
+         output_dir=str(OUTPUT_DIR),
+         num_train_epochs=EPOCHS,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         learning_rate=LR,
+         warmup_steps=warmup_steps,
+         weight_decay=0.01,
+         predict_with_generate=True,
+         eval_strategy="epoch",
+         save_strategy="epoch",
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         logging_steps=200,
+         dataloader_num_workers=0,  # 0 = main process only (most stable on Windows)
+         dataloader_pin_memory=True,
+         bf16=torch.cuda.is_bf16_supported(),
+         fp16=not torch.cuda.is_bf16_supported() and torch.cuda.is_available(),
+         seed=SEED,
+         report_to="none",
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=args,
+         train_dataset=train_ds,
+         eval_dataset=eval_ds,
+         processing_class=tokenizer,
+         data_collator=collator,
+     )
+
+     print("Starting training …")
+     trainer.train()
+
+     print(f"Saving model to {OUTPUT_DIR}/final …")
+     model.save_pretrained(OUTPUT_DIR / "final")
+     tokenizer.save_pretrained(OUTPUT_DIR / "final")
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
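One operational caveat: the branch above loads tokenized_cache unconditionally whenever it exists, so a refreshed wsd_pairs.csv needs a manual cache wipe before retraining; e.g.:

    import shutil
    from pathlib import Path

    cache = Path("seq2seq/tokenized_cache")
    if cache.exists():
        shutil.rmtree(cache)  # forces re-tokenization on the next train.py run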
sincode_model.py ADDED
@@ -0,0 +1,16 @@
+ """
+ SinCode v3 — public API entry point.
+
+ Usage:
+     from sincode_model import BeamSearchDecoder
+     decoder = BeamSearchDecoder()
+     result, logs = decoder.decode("mema videowe bit rate eka godak wadi nisa buffer wenawa")
+ """
+
+ from core.decoder import BeamSearchDecoder, ScoredCandidate  # noqa: F401
+ from core.english import ENGLISH_VOCAB  # noqa: F401
+ from core.constants import (  # noqa: F401
+     DEFAULT_MLM_MODEL, DEFAULT_BYT5_MODEL, DEFAULT_MBART_MODEL,
+     MAX_CANDIDATES, MIN_ENGLISH_LEN,
+ )
+ from seq2seq.mbart_infer import SentenceTransliterator  # noqa: F401