BinKhoaLe1812 committed on
Commit
337fac1
·
verified ·
1 Parent(s): 13f8f13
Files changed (1) hide show
  1. utils/translation.py +60 -6
utils/translation.py CHANGED
@@ -1,6 +1,8 @@
1
  # translation.py
2
  from transformers import pipeline
3
  import logging
 
 
4
 
5
  logger = logging.getLogger("translation-agent")
6
  logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
@@ -9,18 +11,70 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(l
9
  vi_en = None
10
  zh_en = None
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def translate_query(text: str, lang_code: str) -> str:
13
  global vi_en, zh_en
14
  if lang_code == "vi":
15
  if vi_en is None:
16
  vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
17
- result = vi_en(text, max_length=512)[0]["translation_text"]
18
- logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {result}")
19
- return result
 
 
 
 
 
20
  elif lang_code == "zh":
21
  if zh_en is None:
22
  zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
23
- result = zh_en(text, max_length=512)[0]["translation_text"]
24
- logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {result}")
25
- return result
 
 
 
 
 
26
  return text
 
1
  # translation.py
2
  from transformers import pipeline
3
  import logging
4
+ import re
5
+ from collections import Counter
6
 
7
  logger = logging.getLogger("translation-agent")
8
  logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
 
11
  vi_en = None
12
  zh_en = None
13
 
14
+ def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
15
+ """Collapse excessive repeated n-grams (3..7) and repeated phrases."""
16
+ if not s:
17
+ return s
18
+ # Collapse repeated spaces/newlines
19
+ s = re.sub(r"\s+", " ", s).strip()
20
+ # Heuristic: remove runs of identical tokens
21
+ tokens = s.split()
22
+ out = []
23
+ last = None
24
+ for t in tokens:
25
+ if last is None or t.lower() != last.lower():
26
+ out.append(t)
27
+ last = t
28
+ s2 = " ".join(out)
29
+ # Limit consecutive duplicate n-grams
30
+ for n in range(n_max, n_min - 1, -1):
31
+ pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
32
+ s2 = pattern.sub(r"\1", s2)
33
+ return s2
34
+
35
+
36
+ def _normalize_and_cap(s: str, cap: int = 512) -> str:
37
+ if not s:
38
+ return s
39
+ s = s.strip()
40
+ if len(s) > cap:
41
+ s = s[:cap]
42
+ return s
43
+
44
+
45
+ def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
46
+ if not s:
47
+ return False
48
+ tokens = [t.lower() for t in s.split()]
49
+ if len(tokens) < 10:
50
+ return False
51
+ counts = Counter(tokens)
52
+ top = counts.most_common(1)[0][1]
53
+ return (top / max(1, len(tokens))) >= threshold
54
+
55
+
56
  def translate_query(text: str, lang_code: str) -> str:
57
  global vi_en, zh_en
58
  if lang_code == "vi":
59
  if vi_en is None:
60
  vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
61
+ raw = vi_en(text, max_length=512)[0]["translation_text"]
62
+ cleaned = _dedupe_repeats(raw)
63
+ norm = _normalize_and_cap(cleaned, cap=512)
64
+ if _is_too_repetitive(norm):
65
+ logger.warning("[En-Vi] Translation repetitive; falling back to original text")
66
+ norm = text
67
+ logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {norm}")
68
+ return norm
69
  elif lang_code == "zh":
70
  if zh_en is None:
71
  zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
72
+ raw = zh_en(text, max_length=512)[0]["translation_text"]
73
+ cleaned = _dedupe_repeats(raw)
74
+ norm = _normalize_and_cap(cleaned, cap=512)
75
+ if _is_too_repetitive(norm):
76
+ logger.warning("[En-Zh] Translation repetitive; falling back to original text")
77
+ norm = text
78
+ logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {norm}")
79
+ return norm
80
  return text