Spaces:
Running
Running
normalise
Browse files- utils/translation.py +60 -6
utils/translation.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
# translation.py
|
| 2 |
from transformers import pipeline
|
| 3 |
import logging
|
|
|
|
|
|
|
| 4 |
|
| 5 |
logger = logging.getLogger("translation-agent")
|
| 6 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
|
|
@@ -9,18 +11,70 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(l
|
|
| 9 |
vi_en = None
|
| 10 |
zh_en = None
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def translate_query(text: str, lang_code: str) -> str:
|
| 13 |
global vi_en, zh_en
|
| 14 |
if lang_code == "vi":
|
| 15 |
if vi_en is None:
|
| 16 |
vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
elif lang_code == "zh":
|
| 21 |
if zh_en is None:
|
| 22 |
zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
return text
|
|
|
|
| 1 |
# translation.py
|
| 2 |
from transformers import pipeline
|
| 3 |
import logging
|
| 4 |
+
import re
|
| 5 |
+
from collections import Counter
|
| 6 |
|
| 7 |
# Module-wide logger, registered under the fixed name "translation-agent".
logger = logging.getLogger("translation-agent")
# Configures the root logger at import time. force=True makes basicConfig
# remove and close any handlers already attached to the root logger first,
# so importing this module overrides logging configured earlier.
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
|
|
|
|
| 11 |
# Translation pipelines, created lazily: translate_query() builds each one the
# first time its language is requested and reuses it on later calls.
vi_en = None
zh_en = None
|
| 13 |
|
| 14 |
+
def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
|
| 15 |
+
"""Collapse excessive repeated n-grams (3..7) and repeated phrases."""
|
| 16 |
+
if not s:
|
| 17 |
+
return s
|
| 18 |
+
# Collapse repeated spaces/newlines
|
| 19 |
+
s = re.sub(r"\s+", " ", s).strip()
|
| 20 |
+
# Heuristic: remove runs of identical tokens
|
| 21 |
+
tokens = s.split()
|
| 22 |
+
out = []
|
| 23 |
+
last = None
|
| 24 |
+
for t in tokens:
|
| 25 |
+
if last is None or t.lower() != last.lower():
|
| 26 |
+
out.append(t)
|
| 27 |
+
last = t
|
| 28 |
+
s2 = " ".join(out)
|
| 29 |
+
# Limit consecutive duplicate n-grams
|
| 30 |
+
for n in range(n_max, n_min - 1, -1):
|
| 31 |
+
pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
|
| 32 |
+
s2 = pattern.sub(r"\1", s2)
|
| 33 |
+
return s2
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _normalize_and_cap(s: str, cap: int = 512) -> str:
|
| 37 |
+
if not s:
|
| 38 |
+
return s
|
| 39 |
+
s = s.strip()
|
| 40 |
+
if len(s) > cap:
|
| 41 |
+
s = s[:cap]
|
| 42 |
+
return s
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
|
| 46 |
+
if not s:
|
| 47 |
+
return False
|
| 48 |
+
tokens = [t.lower() for t in s.split()]
|
| 49 |
+
if len(tokens) < 10:
|
| 50 |
+
return False
|
| 51 |
+
counts = Counter(tokens)
|
| 52 |
+
top = counts.most_common(1)[0][1]
|
| 53 |
+
return (top / max(1, len(tokens))) >= threshold
|
| 54 |
+
|
| 55 |
+
|
| 56 |
def translate_query(text: str, lang_code: str) -> str:
    """Translate a Vietnamese or Chinese query to English.

    The translation pipeline for a language is created on the first call for
    that language and kept in the module globals ``vi_en`` / ``zh_en`` for
    reuse. The raw model output is de-duplicated and capped at 512
    characters; if it is still dominated by one token, the original text is
    returned instead.

    Args:
        text: The query to translate. Empty or whitespace-only input is
            returned unchanged without loading or calling a model.
        lang_code: Source language code. Only ``"vi"`` and ``"zh"`` are
            translated; any other value returns ``text`` unchanged.

    Returns:
        The cleaned translation, or ``text`` itself when no translation is
        performed or the translation was rejected as repetitive.
    """
    global vi_en, zh_en

    # Nothing to translate: skip the (expensive) model load and inference.
    if not text or not text.strip():
        return text

    if lang_code == "vi":
        if vi_en is None:
            vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
        raw = vi_en(text, max_length=512)[0]["translation_text"]
        return _finalize_translation(raw, text, "En-Vi", lang_code)

    if lang_code == "zh":
        if zh_en is None:
            zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
        raw = zh_en(text, max_length=512)[0]["translation_text"]
        return _finalize_translation(raw, text, "En-Zh", lang_code)

    return text


def _finalize_translation(raw: str, original: str, tag: str, lang_code: str) -> str:
    """Clean raw model output, falling back to ``original`` if it is repetitive."""
    norm = _normalize_and_cap(_dedupe_repeats(raw), cap=512)
    if _is_too_repetitive(norm):
        logger.warning("[%s] Translation repetitive; falling back to original text", tag)
        norm = original
    # Lazy %-formatting renders the same message as before, but only when the
    # record is actually emitted.
    logger.info("[%s] Query in `%s` translated to: %s", tag, lang_code, norm)
    return norm
|