amosnbn committed on
Commit
ce1798c
·
1 Parent(s): 6c56332

ganti model

Browse files
Files changed (1) hide show
  1. app.py +117 -12
app.py CHANGED
@@ -3,7 +3,7 @@
3
  # Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback,
4
  # strip BOM JSON, sanitize adapter_config, endpoint /history + homepage responsive.
5
 
6
- import os, re, json, codecs, pathlib, logging, threading, traceback, inspect
7
  from datetime import datetime, timezone, timedelta
8
  from functools import wraps
9
  from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
@@ -93,20 +93,124 @@ def login_required(fn):
93
  return fn(*args, **kwargs)
94
  return _wrap
95
 
96
- # ---------- Prenorm ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  PAPUA_MAP = {
98
- r"\bsa\b": "saya", r"\bko\b": "kamu", r"\btra\b": "tidak", r"\bndak\b": "tidak",
99
- r"\bmo\b": "mau", r"\bpu\b": "punya", r"\bsu\b": "sudah", r"\bkong\b": "kemudian",
 
 
 
 
 
 
 
 
100
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def prenorm(text: str) -> str:
102
- t = re.sub(r"\s+", " ", text.strip())
103
- t = t.replace("…","...").replace("–","-").replace("—","-")
104
- for pat, repl in PAPUA_MAP.items(): t = re.sub(pat, repl, t, flags=re.IGNORECASE)
105
- return t
 
106
 
107
  # ---------- Model (lazy) + strip BOM + sanitize adapter ----------
108
- BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "google/mt5-small")
109
- ADAPTER_ID = os.getenv("ADAPTER_ID", "")
 
110
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
111
 
112
  TOK = None
@@ -271,6 +375,7 @@ def diag():
271
  "model_error": _MODEL_ERROR,
272
  "versions": {"python": sys.version, "torch": torch_v, "transformers": tf_v, "peft": peft_v},
273
  "preload": PRELOAD_MODEL,
 
274
  })
275
 
276
  @app.get("/debug/session/set")
@@ -316,7 +421,7 @@ def register_get():
316
  def register_post():
317
  email = (request.form.get("email") or "").strip().lower()
318
  pwd = (request.form.get("password") or "")
319
- if not email or not pwd:
320
  flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
321
  with SessionLocal() as s:
322
  if s.query(User).filter_by(email=email).first():
@@ -372,7 +477,7 @@ def api_translate():
372
  if not text:
373
  return jsonify({"ok": False, "error": "Empty text"}), 400
374
  try:
375
- clean = prenorm(text)
376
  mt = f"[FAKE] {clean}" if FALLBACK_TRANSLATE else translate_with_model(clean, max_new_tokens=max_new)
377
  with SessionLocal() as s:
378
  s.add(Translation(user_id=session["uid"], src=text, mt=mt))
 
3
  # Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback,
4
  # strip BOM JSON, sanitize adapter_config, endpoint /history + homepage responsive.
5
 
6
+ import os, re, json, codecs, pathlib, logging, threading, traceback, inspect, unicodedata
7
  from datetime import datetime, timezone, timedelta
8
  from functools import wraps
9
  from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
 
93
  return fn(*args, **kwargs)
94
  return _wrap
95
 
96
# ---------- Prenorm (enhanced, controllable via ENV) ----------
# ENV:
#   PRENORM_LEVEL = off | basic | aggressive   (default: basic)
#   PRENORM_DEBUG = 1 to log an internal trace (the API response is unchanged)
# Surrounding whitespace in the env value is ignored so " basic " still works.
PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").strip().lower()
PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"

# Any run of whitespace.
WS_RE = re.compile(r"\s+")
# One consonant repeated three or more times in a row (e.g. "mannntap").
ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)
# Anything that is not a word character, whitespace, or the allowed punctuation
# (, . ; : ? ! % ( ) - em-dash /); underscores are matched explicitly because
# \w would otherwise keep them.  The em dash is spelled \u2014 so the pattern
# cannot be corrupted by a wrong file encoding.
PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\u2014/]|_")
# The same sentence punctuation mark repeated ("!!!", "??").
MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
# A hyphen, em dash or slash together with any whitespace around it.
DASH_SPACES = re.compile(r"\s*([-\u2014/])\s*")
108
+
109
# Lower-cased tokens that are returned untouched by the elongation reducer.
WHITELIST_KEEP_ELONG = {"papua", "koteka", "wamena", "sarmi", "sorong"}

# Token-level lookup: lower-cased slang / abbreviation -> standard Indonesian.
# Keys are matched against whole tokens, not substrings.
PAPUA_MAP = {
    # first / second person
    "sa": "saya", "sy": "saya", "beta": "saya",
    "ko": "kamu", "kau": "kamu",
    # plural pronouns
    "dong": "mereka", "kam": "kalian", "kamong": "kalian",
    "kitong": "kita", "kitorang": "kita", "torang": "kita",
    # negation
    "tra": "tidak", "tr": "tidak", "trada": "tidak ada", "son": "tidak", "ndak": "tidak", "tid": "tidak",
    # modality / aspect
    "mo": "mau", "su": "sudah", "uda": "sudah",
    # time
    "skarang": "sekarang", "td": "tadi", "tar": "nanti", "tarlah": "nanti",
    # motion
    "pigi": "pergi", "pi": "pergi",
    # prepositions / conjunctions
    "ma": "sama", "deng": "dengan", "dgn": "dengan",
    # misc abbreviations
    "kira2": "kira-kira", "bgmn": "bagaimana", "gmn": "bagaimana",
}

# Pronoun (slang or already standard) -> standard pronoun; used when rewriting
# "<pronoun> pu <X>" possessive constructs.
PRON_MAP = {
    "sa": "saya", "saya": "saya",
    "ko": "kamu", "kamu": "kamu",
    "dia": "dia", "dong": "mereka",
    "kam": "kalian", "kalian": "kalian",
    "kitong": "kita", "kitorang": "kita", "kita": "kita", "torang": "kita",
}
131
+
132
+ def _normalize_unicode(text: str) -> str:
133
+ return unicodedata.normalize("NFKC", text)
134
+
135
def _strip_emoji_and_noise(text: str) -> str:
    """Remove non-text symbols and tidy punctuation (aggressive level only).

    Steps, in order:
      1. every character matched by ``PUNC_RE`` is replaced by a space;
      2. runs of the same punctuation mark (``MULTI_PUNC``) collapse to one;
      3. hyphens, em dashes and slashes (``DASH_SPACES``) are padded with a
         single space on each side.

    Whitespace is not collapsed here; the caller does that afterwards.
    """
    # NOTE(review): step 3 also splits reduplicated words ("anak-anak" ->
    # "anak - anak"); confirm that is what the model expects.
    without_symbols = PUNC_RE.sub(" ", text)
    single_punct = MULTI_PUNC.sub(r"\1", without_symbols)
    return DASH_SPACES.sub(r" \1 ", single_punct)
140
+
141
def _normalize_ws(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = WS_RE.sub(" ", text)
    return collapsed.strip()
143
+
144
def _reduce_elongation(token: str) -> str:
    """Shorten consonants repeated 3+ times in *token* down to two.

    Tokens whose lower-cased form is in ``WHITELIST_KEEP_ELONG`` are returned
    unchanged. Vowels are never touched (``ELONG_RE`` lists consonants only).
    """
    if token.lower() in WHITELIST_KEEP_ELONG:
        return token
    return ELONG_RE.sub(r"\1\1", token)
149
+
150
def _apply_papua_map(token: str) -> str:
    """Replace one token by its ``PAPUA_MAP`` entry, if it has one.

    The lookup is case-insensitive. The whole token is tried first (so keys
    such as "kira2" keep working); if that misses, leading and trailing
    non-alphanumeric characters are set aside, the bare word is looked up, and
    the punctuation is re-attached. Without the second step a token like
    "sa," or "pigi?" would never be normalized, because the caller splits on
    whitespace only.

    Returns the token unchanged when neither lookup hits.
    """
    direct = PAPUA_MAP.get(token.lower())
    if direct is not None:
        return direct

    # Find the span of the bare word inside surrounding punctuation.
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    if start == 0 and end == len(token):
        return token  # nothing to peel off; the direct lookup already failed

    mapped = PAPUA_MAP.get(token[start:end].lower())
    if mapped is None:
        return token
    return f"{token[:start]}{mapped}{token[end:]}"
153
+
154
+ def _handle_pu_constructs(text: str) -> str:
155
+ # "<pronoun> pu <X>" -> "punya <pronoun_std> <X>"
156
+ def repl(m):
157
+ pron = m.group(1).lower()
158
+ rest = m.group(2).strip()
159
+ pron_std = PRON_MAP.get(pron, pron)
160
+ return f"punya {pron_std} {rest}"
161
+ return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
162
+ repl, text, flags=re.IGNORECASE)
163
+
164
def _token_level_ops(text: str, aggressive: bool) -> str:
    """Apply per-token normalization and re-join the tokens with single spaces.

    Each whitespace-separated token is first passed through
    ``_reduce_elongation`` (only when *aggressive* is true) and then through
    ``_apply_papua_map``.
    """
    return " ".join(
        _apply_papua_map(_reduce_elongation(tok) if aggressive else tok)
        for tok in text.split()
    )
172
+
173
def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
    """Pre-normalize Papuan-Malay input text before translation.

    level:
      - "off":        return the input untouched
      - "basic":      unicode normalization + whitespace + slang map +
                      'pu' constructs
      - "aggressive": basic + reduction of repeated letters + removal of
                      non-text symbols
    Any other value behaves like "basic" (only "off" and "aggressive" are
    tested for explicitly).

    Returns the normalized string, or ``(string, trace)`` when *return_trace*
    is true. ``trace`` is a dict holding the level and, unless the level is
    "off", the text after every pipeline stage.
    """
    trace = {"level": level}
    if level == "off":
        return (inp, trace) if return_trace else inp

    aggressive = level == "aggressive"

    unicode_norm = _normalize_unicode(inp)
    noise_free = _strip_emoji_and_noise(unicode_norm) if aggressive else unicode_norm
    ws_norm = _normalize_ws(noise_free)
    # Possessives are rewritten before the token map so the pronoun is still
    # in its original form when the construct is matched.
    pu_done = _handle_pu_constructs(ws_norm)
    token_done = _token_level_ops(pu_done, aggressive=aggressive)
    final = _normalize_ws(token_done)

    if not return_trace:
        return final

    trace.update({
        "original": inp, "unicode_norm": unicode_norm,
        "strip_noise": noise_free if aggressive else "(skip)",
        "ws_norm_1": ws_norm, "pu_constructs": pu_done,
        "token_ops": token_done, "final": final,
    })
    return final, trace
201
+
202
# Public wrapper used by the endpoints (same signature as before).
def prenorm(text: str) -> str:
    """Normalize *text* with the level configured in ``PRENORM_LEVEL``.

    When ``PRENORM_DEBUG`` is set, the stage-by-stage trace is written to the
    module logger at INFO level; the returned string is the same either way.
    """
    if not PRENORM_DEBUG:
        return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)

    out, tr = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
    # ensure_ascii=False keeps non-ASCII input readable in the log line.
    log.info("[PRENORM][%s] %s -> %s | trace=%s", PRENORM_LEVEL, text, out, json.dumps(tr, ensure_ascii=False))
    return out
209
 
210
  # ---------- Model (lazy) + strip BOM + sanitize adapter ----------
211
+ # Default diarahkan ke repo kamu di Hugging Face.
212
+ BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
213
+ ADAPTER_ID = os.getenv("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
214
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
215
 
216
  TOK = None
 
375
  "model_error": _MODEL_ERROR,
376
  "versions": {"python": sys.version, "torch": torch_v, "transformers": tf_v, "peft": peft_v},
377
  "preload": PRELOAD_MODEL,
378
+ "prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
379
  })
380
 
381
  @app.get("/debug/session/set")
 
421
  def register_post():
422
  email = (request.form.get("email") or "").strip().lower()
423
  pwd = (request.form.get("password") or "")
424
+ if not email or not pwd:
425
  flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
426
  with SessionLocal() as s:
427
  if s.query(User).filter_by(email=email).first():
 
477
  if not text:
478
  return jsonify({"ok": False, "error": "Empty text"}), 400
479
  try:
480
+ clean = prenorm(text) # <- tetap sama namanya, sekarang lebih pintar
481
  mt = f"[FAKE] {clean}" if FALLBACK_TRANSLATE else translate_with_model(clean, max_new_tokens=max_new)
482
  with SessionLocal() as s:
483
  s.add(Translation(user_id=session["uid"], src=text, mt=mt))