Ethosoft
/

NedoTurkishTokenizer

@@ -3,9 +3,28 @@
 from __future__ import annotations
 import re
 TR_CHARS = set("çğışöüÇĞİŞÖÜ")
 def _turkish_lower(s: str) -> str:
     """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
@@ -36,16 +55,30 @@ _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
 def _is_turkish_base(word: str) -> bool:
     """Return True if the word should be treated as Turkish (don't split apostrophe)."""
     # Fast path: Turkish-specific characters → definitely Turkish
-    if any(c in TR_CHARS for c in word):
         return True
     # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
     from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
     tdk = load_tdk_words()
-    if tdk:
-        return word.lower() in tdk
-    # TDK unavailable: very short words are ambiguous — leave them alone
-    return len(word) < 4
 # ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────

 from __future__ import annotations
 import re
+from pathlib import Path
 TR_CHARS = set("çğışöüÇĞİŞÖÜ")
# Lazily-populated cache of normalized proper nouns; None until the first load.
_PROPER_NOUNS: set[str] | None = None


def _parse_proper_nouns(text: str) -> set[str]:
    """Return the normalized entries contained in the text of a proper-noun list.

    The text holds one entry per line. Surrounding whitespace is ignored,
    blank lines are skipped, and a line whose first non-blank character is
    ``#`` is treated as a comment (including indented comments).

    Args:
        text: Full contents of the proper-noun list.

    Returns:
        The set of lowercased entries.
    """
    entries: set[str] = set()
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        # str.lower() turns "İ" into "i" followed by U+0307 (combining dot),
        # which would never equal a lookup key that uses a plain "i" for it,
        # so map the dotted capital explicitly before lowercasing.
        entries.add(entry.replace("İ", "i").lower())
    return entries


def _load_proper_nouns() -> set[str]:
    """Return the set of known Turkish proper nouns, loading it on first use.

    The list is read from ``data/turkish_proper_nouns.txt`` next to this
    module and cached in the module-level ``_PROPER_NOUNS``; later calls
    return the same set object. A missing file yields an empty set.
    """
    global _PROPER_NOUNS
    if _PROPER_NOUNS is not None:
        return _PROPER_NOUNS
    path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # The data file is optional; cache the empty result as well so the
        # filesystem is not probed again on every call.
        _PROPER_NOUNS = set()
    else:
        _PROPER_NOUNS = _parse_proper_nouns(text)
    return _PROPER_NOUNS
 def _turkish_lower(s: str) -> str:
     """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
 def _is_turkish_base(word: str) -> bool:
     """Return True if the word should be treated as Turkish (don't split apostrophe)."""
+    wl = _turkish_lower(word)
     # Fast path: Turkish-specific characters → definitely Turkish
+    if any(c in TR_CHARS for c in wl):
+        return True
+    # Turkish proper nouns (cities, regions) — not in TDK common-word list
+    if wl in _load_proper_nouns():
         return True
     # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
     from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
     tdk = load_tdk_words()
+    if tdk and wl in tdk:
+        return True
+    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
+    try:
+        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
+        if ZEMBEREK_AVAILABLE and _morphology:
+            for analysis in _morphology.analyze(wl):
+                lemma = str(analysis).split("]")[0].lstrip("[")
+                if any(c in TR_CHARS for c in lemma):
+                    return True
+    except Exception:  # noqa: BLE001
+        pass
+    # TDK unavailable + Zemberek unavailable: very short words are ambiguous
+    return len(wl) < 4
 # ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────