Princeaka committed on
Commit
5a4f66f
·
verified ·
1 Parent(s): 108b140

Update language.py

Browse files
Files changed (1) hide show
  1. language.py +52 -51
language.py CHANGED
@@ -1,20 +1,9 @@
1
- """
2
- language.py small wrapper to expose a stable translation API for JusticeAI.
3
-
4
- Place this file next to your language.bin. The wrapper will try to load language.bin
5
- (via torch.load first, then via pickle) and adapt to common shapes:
6
-
7
- - object.translate(text, src, tgt) or object.translate_to_en(text, src)
8
- - object(text, src, tgt) if the loaded object is callable
9
- - dict-like mapping with keys (('src','tgt') -> function/string)
10
- - HF-like model object with .generate and an attached tokenizer attribute
11
-
12
- Functions exposed:
13
- - translate(text, src, tgt)
14
- - translate_to_en(text, src)
15
- - translate_from_en(text, tgt)
16
- """
17
-
18
  from pathlib import Path
19
  import logging
20
  import pickle
@@ -54,12 +43,23 @@ def _ensure_loaded():
54
  else:
55
  logger.info("language.bin not found in cwd")
56
 
57
- # Generic translator interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def translate(text: str, src: str, tgt: str) -> str:
59
- """
60
- Translate text from src -> tgt using the loaded model.
61
- If nothing useful is found, returns the original text.
62
- """
63
  _ensure_loaded()
64
  if not text:
65
  return text
@@ -69,12 +69,11 @@ def translate(text: str, src: str, tgt: str) -> str:
69
  # 1) object has translate(text, src, tgt) or translate_to_en
70
  try:
71
  if hasattr(_model, "translate"):
72
- # try both signatures (text, src, tgt) or (text, src_tgt) etc.
73
  try:
74
  return _model.translate(text, src, tgt)
75
  except TypeError:
76
  try:
77
- return _model.translate(text, src + "->" + tgt)
78
  except Exception:
79
  pass
80
  if hasattr(_model, "translate_to_en") and tgt.lower() in ("en", "eng"):
@@ -113,7 +112,6 @@ def translate(text: str, src: str, tgt: str) -> str:
113
  return fn(text)
114
  if isinstance(fn, str):
115
  return fn
116
- # fallback to direct mapping by lang codes
117
  key2 = f"{src}->{tgt}"
118
  if key2 in _model:
119
  val = _model[key2]
@@ -124,20 +122,15 @@ def translate(text: str, src: str, tgt: str) -> str:
124
  except Exception as e:
125
  logger.debug(f"dict-like model attempt failed: {e}")
126
 
127
- # 4) HF-style object: has .generate and maybe a tokenizer at _model.tokenizer
128
  try:
129
- tokenizer = None
130
  m = _model
131
- if hasattr(m, "generate") and hasattr(m, "to") or hasattr(m, "device"):
132
- # try to locate a tokenizer attached to model
133
- tokenizer = getattr(m, "tokenizer", None)
134
- if tokenizer is None and hasattr(m, "config") and hasattr(m, "config", "tokenizer_class"):
135
- tokenizer = getattr(m, "tokenizer", None)
136
- if tokenizer:
137
- inputs = tokenizer([text], return_tensors="pt", truncation=True)
138
- outputs = m.generate(**inputs, max_length=1024)
139
- decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
140
- return decoded
141
  except Exception as e:
142
  logger.debug(f"HF-like model attempt failed: {e}")
143
 
@@ -147,12 +140,10 @@ def translate(text: str, src: str, tgt: str) -> str:
147
  def translate_to_en(text: str, src: str) -> str:
148
  if not text:
149
  return text
150
- # prefer an explicit helper if model provides it
151
  _ensure_loaded()
152
- if _model is not None:
153
  try:
154
- if hasattr(_model, "translate_to_en"):
155
- return _model.translate_to_en(text, src)
156
  except Exception:
157
  pass
158
  return translate(text, src, "en")
@@ -161,22 +152,32 @@ def translate_from_en(text: str, tgt: str) -> str:
161
  if not text:
162
  return text
163
  _ensure_loaded()
164
- if _model is not None:
165
  try:
166
- if hasattr(_model, "translate_from_en"):
167
- return _model.translate_from_en(text, tgt)
168
  except Exception:
169
  pass
170
  return translate(text, "en", tgt)
171
 
172
- # Simple CLI test helpers (run `python language.py` to exercise)
 
 
 
 
 
 
 
 
 
 
 
 
173
  if __name__ == "__main__":
 
174
  import sys
175
  _ensure_loaded()
 
176
  if len(sys.argv) >= 4:
177
- _, src, tgt, txt = sys.argv[0], sys.argv[1], sys.argv[2], " ".join(sys.argv[3:])
178
- out = translate(txt, src, tgt)
179
- print(out)
180
- else:
181
- print("Usage: python language.py <src> <tgt> <text...>")
182
- print("Example: python language.py es en 'hola mundo'")
 
1
+ # language.py — wrapper to expose a stable translation API for JusticeAI.
2
+ # Tries to load language.bin (torch.load then pickle). Adapts common shapes and exposes:
3
+ # - translate(text, src, tgt)
4
+ # - translate_to_en(text, src)
5
+ # - translate_from_en(text, tgt)
6
+ # Also exposes model_info() for debugging/inspection.
 
 
 
 
 
 
 
 
 
 
 
7
  from pathlib import Path
8
  import logging
9
  import pickle
 
43
  else:
44
  logger.info("language.bin not found in cwd")
45
 
46
def model_info():
    """Return a dict describing the loaded translation model, for debugging.

    Calls ``_ensure_loaded()`` first. When no model is available the result
    is ``{"loaded": False}``. Otherwise the dict contains:

    - ``loaded``: always True
    - ``type``: the model's class name
    - ``repr``: ``repr(model)`` truncated to 800 characters, or
      ``"<unreprable>"`` if ``repr`` raised
    - ``has_translate`` / ``has_translate_to_en`` / ``has_translate_from_en``:
      whether the model exposes an attribute of that name
    - ``callable``: whether the model object itself is callable
    - ``dir``: the model's attribute names that do not start with ``_``
    """
    _ensure_loaded()
    if _model is None:
        return {"loaded": False}

    summary = dict(loaded=True, type=type(_model).__name__)
    try:
        summary["repr"] = repr(_model)[:800]
    except Exception:
        # A failing __repr__ must not break the inspection helper.
        summary["repr"] = "<unreprable>"

    for attr_name in ("translate", "translate_to_en", "translate_from_en"):
        summary[f"has_{attr_name}"] = hasattr(_model, attr_name)
    summary["callable"] = callable(_model)
    summary["dir"] = [
        public_name for public_name in dir(_model) if not public_name.startswith("_")
    ]
    return summary
61
+
62
  def translate(text: str, src: str, tgt: str) -> str:
 
 
 
 
63
  _ensure_loaded()
64
  if not text:
65
  return text
 
69
  # 1) object has translate(text, src, tgt) or translate_to_en
70
  try:
71
  if hasattr(_model, "translate"):
 
72
  try:
73
  return _model.translate(text, src, tgt)
74
  except TypeError:
75
  try:
76
+ return _model.translate(text, f"{src}->{tgt}")
77
  except Exception:
78
  pass
79
  if hasattr(_model, "translate_to_en") and tgt.lower() in ("en", "eng"):
 
112
  return fn(text)
113
  if isinstance(fn, str):
114
  return fn
 
115
  key2 = f"{src}->{tgt}"
116
  if key2 in _model:
117
  val = _model[key2]
 
122
  except Exception as e:
123
  logger.debug(f"dict-like model attempt failed: {e}")
124
 
125
+ # 4) HF-like object: has .generate and maybe a tokenizer at _model.tokenizer
126
  try:
 
127
  m = _model
128
+ tokenizer = getattr(m, "tokenizer", None)
129
+ if tokenizer and hasattr(m, "generate"):
130
+ inputs = tokenizer([text], return_tensors="pt", truncation=True)
131
+ outputs = m.generate(**inputs, max_length=1024)
132
+ decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
133
+ return decoded
 
 
 
 
134
  except Exception as e:
135
  logger.debug(f"HF-like model attempt failed: {e}")
136
 
 
140
  def translate_to_en(text: str, src: str) -> str:
141
  if not text:
142
  return text
 
143
  _ensure_loaded()
144
+ if _model is not None and hasattr(_model, "translate_to_en"):
145
  try:
146
+ return _model.translate_to_en(text, src)
 
147
  except Exception:
148
  pass
149
  return translate(text, src, "en")
 
152
  if not text:
153
  return text
154
  _ensure_loaded()
155
+ if _model is not None and hasattr(_model, "translate_from_en"):
156
  try:
157
+ return _model.translate_from_en(text, tgt)
 
158
  except Exception:
159
  pass
160
  return translate(text, "en", tgt)
161
 
162
# Optional: expose a detect function if the model has one, else None
def detect_language(text: str) -> "str | None":
    """Ask the loaded model to detect the language of ``text``.

    After ``_ensure_loaded()``, the model is probed for a callable attribute
    named ``detect``, ``detect_language``, ``lang`` or ``language``, in that
    order. The first candidate whose call does not raise supplies the return
    value, which is passed through as-is (presumably a language code; this
    depends on the model and should be confirmed against it).

    Returns:
        The detector's result, or None when no model is loaded, no candidate
        attribute is callable, or every candidate call raised.
    """
    _ensure_loaded()
    if _model is None:
        return None
    for candidate in ("detect", "detect_language", "lang", "language"):
        detector = getattr(_model, candidate, None)
        # Skip missing or non-callable attributes (e.g. a plain string
        # ``lang`` field) instead of calling them and discarding a TypeError.
        if not callable(detector):
            continue
        try:
            return detector(text)
        except Exception as e:
            # Best effort: record the failure and try the next candidate.
            logger.debug(f"detect_language via {candidate!r} failed: {e}")
    return None
174
+
175
  if __name__ == "__main__":
176
+ # simple CLI debug
177
  import sys
178
  _ensure_loaded()
179
+ print("model_info:", model_info())
180
  if len(sys.argv) >= 4:
181
+ _, src, tgt, *txt = sys.argv
182
+ txt = " ".join(txt)
183
+ print("translate:", translate(txt, src, tgt))