nmstech committed on
Commit
8f794ec
·
verified ·
1 Parent(s): 183e656

Upload turk_tokenizer/_preprocessor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. turk_tokenizer/_preprocessor.py +38 -5
turk_tokenizer/_preprocessor.py CHANGED
@@ -3,9 +3,28 @@
3
  from __future__ import annotations
4
 
5
  import re
 
6
 
7
  TR_CHARS = set("çğışöüÇĞİŞÖÜ")
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def _turkish_lower(s: str) -> str:
11
  """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
@@ -36,16 +55,30 @@ _CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
36
 
37
  def _is_turkish_base(word: str) -> bool:
38
  """Return True if the word should be treated as Turkish (don't split apostrophe)."""
 
39
  # Fast path: Turkish-specific characters → definitely Turkish
40
- if any(c in TR_CHARS for c in word):
 
 
 
41
  return True
42
  # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
43
  from ._tdk_vocab import load_tdk_words # noqa: PLC0415
44
  tdk = load_tdk_words()
45
- if tdk:
46
- return word.lower() in tdk
47
- # TDK unavailable: very short words are ambiguous — leave them alone
48
- return len(word) < 4
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  # ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
 
3
  from __future__ import annotations
4
 
5
  import re
6
+ from pathlib import Path
7
 
8
  TR_CHARS = set("Γ§ΔŸΔ±ΕŸΓΆΓΌΓ‡ΔžΔ°ΕžΓ–Γœ")
9
 
10
# Lazily filled cache of normalised proper nouns; None means "not loaded yet".
_PROPER_NOUNS: set[str] | None = None


def _load_proper_nouns() -> set[str]:
    """Return the lowercase Turkish proper-noun set, loading it on first use.

    The list is read from ``data/turkish_proper_nouns.txt`` next to this
    module (UTF-8, one entry per line). Blank lines and lines whose first
    non-blank character is ``#`` are ignored. The result is cached in the
    module-level ``_PROPER_NOUNS``; a missing file yields an empty set.

    Each entry is stored in two lowercase forms:

    * ``_turkish_lower(entry)`` -- the same normalisation callers apply to the
      word they look up. Plain ``str.lower()`` turns a dotted capital I
      (U+0130) into "i" followed by a combining dot (U+0307), so such entries
      could never match a Turkish-lowercased lookup key.
    * ``entry.lower()`` -- kept so that every key produced before this
      normalisation was added is still present.
    """
    global _PROPER_NOUNS
    if _PROPER_NOUNS is not None:
        return _PROPER_NOUNS
    path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # The list is optional: without it the set is simply empty.
        text = ""
    nouns: set[str] = set()
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        # Test the stripped text so indented "#" comments are skipped too.
        if not entry or entry.startswith("#"):
            continue
        nouns.add(_turkish_lower(entry))
        nouns.add(entry.lower())
    _PROPER_NOUNS = nouns
    return _PROPER_NOUNS
27
+
28
 
29
  def _turkish_lower(s: str) -> str:
30
  """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
 
55
 
56
  def _is_turkish_base(word: str) -> bool:
57
  """Return True if the word should be treated as Turkish (don't split apostrophe)."""
58
+ wl = _turkish_lower(word)
59
  # Fast path: Turkish-specific characters β†’ definitely Turkish
60
+ if any(c in TR_CHARS for c in wl):
61
+ return True
62
+ # Turkish proper nouns (cities, regions) β€” not in TDK common-word list
63
+ if wl in _load_proper_nouns():
64
  return True
65
  # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
66
  from ._tdk_vocab import load_tdk_words # noqa: PLC0415
67
  tdk = load_tdk_words()
68
+ if tdk and wl in tdk:
69
+ return True
70
+ # Zemberek: proper nouns whose lemma contains Turkish chars (Δ°stanbul, Δ°zmir…)
71
+ try:
72
+ from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
73
+ if ZEMBEREK_AVAILABLE and _morphology:
74
+ for analysis in _morphology.analyze(wl):
75
+ lemma = str(analysis).split("]")[0].lstrip("[")
76
+ if any(c in TR_CHARS for c in lemma):
77
+ return True
78
+ except Exception: # noqa: BLE001
79
+ pass
80
+ # TDK unavailable + Zemberek unavailable: very short words are ambiguous
81
+ return len(wl) < 4
82
 
83
 
84
  # ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────