amosnbn commited on
Commit
7e72f13
·
1 Parent(s): 415e208
Files changed (1) hide show
  1. app.py +19 -67
app.py CHANGED
@@ -16,10 +16,8 @@ log = logging.getLogger("papua-app")
16
 
17
  # ---------- Flask ----------
18
  app = Flask(__name__, template_folder="frontend", static_folder="static")
19
- # trust HF proxy (HTTPS/host), penting untuk cookie Secure & redirect
20
  app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1)
21
 
22
- # session config (HF Spaces iframe-friendly)
23
  app.config.update(
24
  SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
25
  SESSION_COOKIE_NAME="hfspace_session",
@@ -95,15 +93,12 @@ def login_required(fn):
95
  return fn(*args, **kwargs)
96
  return _wrap
97
 
98
- # ---------- Prenorm (enhanced, controllable via ENV) ----------
99
- # ENV:
100
- # PRENORM_LEVEL = off | basic | aggressive (default: basic)
101
- # PRENORM_DEBUG = 1 to log internal trace (response API tidak berubah)
102
  PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
103
  PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
104
 
105
  WS_RE = re.compile(r"\s+")
106
- ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE) # konsonan >=3x
107
  PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\—/]|_")
108
  MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
109
  DASH_SPACES= re.compile(r"\s*([-—/])\s*")
@@ -122,7 +117,6 @@ PAPUA_MAP = {
122
  "ma": "sama", "deng": "dengan", "dgn": "dengan",
123
  "kira2": "kira-kira", "bgmn": "bagaimana", "gmn": "bagaimana",
124
  }
125
-
126
  PRON_MAP = {
127
  "sa": "saya", "saya": "saya",
128
  "ko": "kamu", "kamu": "kamu",
@@ -133,28 +127,21 @@ PRON_MAP = {
133
 
134
  def _normalize_unicode(text: str) -> str:
135
  return unicodedata.normalize("NFKC", text)
136
-
137
  def _strip_emoji_and_noise(text: str) -> str:
138
  text = PUNC_RE.sub(" ", text)
139
  text = MULTI_PUNC.sub(r"\1", text)
140
  text = DASH_SPACES.sub(r" \1 ", text)
141
  return text
142
-
143
  def _normalize_ws(text: str) -> str:
144
  return WS_RE.sub(" ", text).strip()
145
-
146
  def _reduce_elongation(token: str) -> str:
147
  base = token.lower()
148
- if base in WHITELIST_KEEP_ELONG:
149
- return token
150
  return ELONG_RE.sub(r"\1\1", token)
151
-
152
  def _apply_papua_map(token: str) -> str:
153
  low = token.lower()
154
  return PAPUA_MAP.get(low, token)
155
-
156
  def _handle_pu_constructs(text: str) -> str:
157
- # "<pronoun> pu <X>" -> "punya <pronoun_std> <X>"
158
  def repl(m):
159
  pron = m.group(1).lower()
160
  rest = m.group(2).strip()
@@ -162,7 +149,6 @@ def _handle_pu_constructs(text: str) -> str:
162
  return f"punya {pron_std} {rest}"
163
  return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
164
  repl, text, flags=re.IGNORECASE)
165
-
166
  def _token_level_ops(text: str, aggressive: bool) -> str:
167
  tokens = text.split()
168
  out = []
@@ -173,16 +159,9 @@ def _token_level_ops(text: str, aggressive: bool) -> str:
173
  return " ".join(out)
174
 
175
  def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
176
- """
177
- level:
178
- - "off": tanpa perubahan
179
- - "basic": unicode + ws + map slang + 'pu' constructs
180
- - "aggressive": basic + reduksi huruf berulang + pembersihan simbol non-teks
181
- """
182
  trace = {"level": level}
183
  if level == "off":
184
  return (inp, trace) if return_trace else inp
185
-
186
  s0 = inp
187
  s1 = _normalize_unicode(s0)
188
  s2 = _strip_emoji_and_noise(s1) if level == "aggressive" else s1
@@ -190,18 +169,14 @@ def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
190
  s4 = _handle_pu_constructs(s3)
191
  s5 = _token_level_ops(s4, aggressive=(level == "aggressive"))
192
  s6 = _normalize_ws(s5)
193
-
194
  if return_trace:
195
- trace.update({
196
- "original": s0, "unicode_norm": s1,
197
- "strip_noise": s2 if level == "aggressive" else "(skip)",
198
- "ws_norm_1": s3, "pu_constructs": s4,
199
- "token_ops": s5, "final": s6,
200
- })
201
  return s6, trace
202
  return s6
203
 
204
- # Wrapper publik yang dipakai endpoint
205
  def prenorm(text: str) -> str:
206
  if PRENORM_DEBUG:
207
  out, tr = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
@@ -209,8 +184,7 @@ def prenorm(text: str) -> str:
209
  return out
210
  return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)
211
 
212
- # ---------- Model (lazy) + strip BOM + sanitize adapter ----------
213
- # Default diarahkan ke repo kamu di Hugging Face.
214
  BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
215
  ADAPTER_ID = os.getenv("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
216
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
@@ -271,27 +245,15 @@ def _sanitize_adapter_config(adapter_dir: str):
271
  json.dump(cleaned, f, ensure_ascii=False, indent=2)
272
 
273
  def _load_model():
274
- """Download ke /tmp, strip BOM, sanitize adapter_config, lalu load."""
275
  global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
276
  try:
277
  log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
278
-
279
- base_dir = snapshot_download(
280
- repo_id=BASE_MODEL_ID,
281
- local_dir="/tmp/hf_base",
282
- local_dir_use_symlinks=False,
283
- allow_patterns=None,
284
- )
285
  _strip_bom_in_dir(base_dir)
286
 
287
  adapter_dir = None
288
  if ADAPTER_ID:
289
- adapter_dir = snapshot_download(
290
- repo_id=ADAPTER_ID,
291
- local_dir="/tmp/hf_adapter",
292
- local_dir_use_symlinks=False,
293
- allow_patterns=None,
294
- )
295
  _strip_bom_in_dir(adapter_dir)
296
  _sanitize_adapter_config(adapter_dir)
297
 
@@ -301,7 +263,6 @@ def _load_model():
301
 
302
  TOK = AutoTokenizer.from_pretrained(base_dir)
303
  base = AutoModelForSeq2SeqLM.from_pretrained(base_dir)
304
-
305
  MODEL = PeftModel.from_pretrained(base, adapter_dir) if adapter_dir else base
306
  MODEL.eval().to(DEVICE)
307
 
@@ -339,10 +300,8 @@ def translate_with_model(text: str, max_new_tokens: int = 48) -> str:
339
  return tok.decode(out[0], skip_special_tokens=True)
340
 
341
  def _preload_thread():
342
- try:
343
- _load_model()
344
- except Exception:
345
- pass
346
 
347
  if PRELOAD_MODEL:
348
  threading.Thread(target=_preload_thread, daemon=True).start()
@@ -382,17 +341,6 @@ def diag():
382
  "prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
383
  })
384
 
385
- @app.get("/debug/session/set")
386
- def dbg_set():
387
- session.permanent = True
388
- session["uid"] = "test-user"
389
- session["email"] = "test@example.com"
390
- return {"ok": True, "set": True}
391
-
392
- @app.get("/debug/session/get")
393
- def dbg_get():
394
- return {"uid": session.get("uid"), "email": session.get("email")}
395
-
396
  # ---------- Auth & Pages ----------
397
  @app.get("/health")
398
  @app.get("/ping")
@@ -401,6 +349,8 @@ def health():
401
 
402
  @app.get("/login")
403
  def login_get():
 
 
404
  return render_template("login.html")
405
 
406
  @app.post("/login")
@@ -424,6 +374,8 @@ def login_post():
424
 
425
  @app.get("/register")
426
  def register_get():
 
 
427
  return render_template("register.html")
428
 
429
  @app.post("/register")
@@ -440,10 +392,10 @@ def register_post():
440
  return redirect(url_for("register_get"))
441
  u = User(email=email); set_password(u, pwd)
442
  s.add(u); s.commit()
443
- session.permanent = True
444
- session["uid"], session["email"] = u.id, u.email
445
 
446
- return redirect(url_for("index"))
 
 
447
 
448
  @app.get("/logout")
449
  def logout():
 
16
 
17
  # ---------- Flask ----------
18
  app = Flask(__name__, template_folder="frontend", static_folder="static")
 
19
  app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1)
20
 
 
21
  app.config.update(
22
  SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
23
  SESSION_COOKIE_NAME="hfspace_session",
 
93
  return fn(*args, **kwargs)
94
  return _wrap
95
 
96
+ # ---------- Prenorm ----------
 
 
 
97
  PRENORM_LEVEL = os.getenv("PRENORM_LEVEL", "basic").lower()
98
  PRENORM_DEBUG = os.getenv("PRENORM_DEBUG", "0") == "1"
99
 
100
  WS_RE = re.compile(r"\s+")
101
+ ELONG_RE = re.compile(r"([bcdfghjklmnpqrstvwxyz])\1{2,}", flags=re.IGNORECASE)
102
  PUNC_RE = re.compile(r"[^\w\s,.;:?!%()\-\—/]|_")
103
  MULTI_PUNC = re.compile(r"([,.;:?!])\1+")
104
  DASH_SPACES= re.compile(r"\s*([-—/])\s*")
 
117
  "ma": "sama", "deng": "dengan", "dgn": "dengan",
118
  "kira2": "kira-kira", "bgmn": "bagaimana", "gmn": "bagaimana",
119
  }
 
120
  PRON_MAP = {
121
  "sa": "saya", "saya": "saya",
122
  "ko": "kamu", "kamu": "kamu",
 
127
 
128
  def _normalize_unicode(text: str) -> str:
129
  return unicodedata.normalize("NFKC", text)
 
130
  def _strip_emoji_and_noise(text: str) -> str:
131
  text = PUNC_RE.sub(" ", text)
132
  text = MULTI_PUNC.sub(r"\1", text)
133
  text = DASH_SPACES.sub(r" \1 ", text)
134
  return text
 
135
  def _normalize_ws(text: str) -> str:
136
  return WS_RE.sub(" ", text).strip()
 
137
  def _reduce_elongation(token: str) -> str:
138
  base = token.lower()
139
+ if base in WHITELIST_KEEP_ELONG: return token
 
140
  return ELONG_RE.sub(r"\1\1", token)
 
141
  def _apply_papua_map(token: str) -> str:
142
  low = token.lower()
143
  return PAPUA_MAP.get(low, token)
 
144
  def _handle_pu_constructs(text: str) -> str:
 
145
  def repl(m):
146
  pron = m.group(1).lower()
147
  rest = m.group(2).strip()
 
149
  return f"punya {pron_std} {rest}"
150
  return re.sub(r"\b(sa|saya|ko|kamu|dia|dong|kam|kalian|kitong|kitorang|kita|torang)\s*pu\s+([^.,;:!?]+)",
151
  repl, text, flags=re.IGNORECASE)
 
152
  def _token_level_ops(text: str, aggressive: bool) -> str:
153
  tokens = text.split()
154
  out = []
 
159
  return " ".join(out)
160
 
161
  def papua_prenorm(inp: str, level: str = "basic", return_trace: bool = False):
 
 
 
 
 
 
162
  trace = {"level": level}
163
  if level == "off":
164
  return (inp, trace) if return_trace else inp
 
165
  s0 = inp
166
  s1 = _normalize_unicode(s0)
167
  s2 = _strip_emoji_and_noise(s1) if level == "aggressive" else s1
 
169
  s4 = _handle_pu_constructs(s3)
170
  s5 = _token_level_ops(s4, aggressive=(level == "aggressive"))
171
  s6 = _normalize_ws(s5)
 
172
  if return_trace:
173
+ trace.update({"original": s0, "unicode_norm": s1,
174
+ "strip_noise": s2 if level == "aggressive" else "(skip)",
175
+ "ws_norm_1": s3, "pu_constructs": s4,
176
+ "token_ops": s5, "final": s6})
 
 
177
  return s6, trace
178
  return s6
179
 
 
180
  def prenorm(text: str) -> str:
181
  if PRENORM_DEBUG:
182
  out, tr = papua_prenorm(text, level=PRENORM_LEVEL, return_trace=True)
 
184
  return out
185
  return papua_prenorm(text, level=PRENORM_LEVEL, return_trace=False)
186
 
187
+ # ---------- Model (lazy) ----------
 
188
  BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "amosnbn/cendol-mt5-base-inst")
189
  ADAPTER_ID = os.getenv("ADAPTER_ID", "amosnbn/papua-lora-ckpt-168")
190
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
 
245
  json.dump(cleaned, f, ensure_ascii=False, indent=2)
246
 
247
  def _load_model():
 
248
  global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
249
  try:
250
  log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
251
+ base_dir = snapshot_download(repo_id=BASE_MODEL_ID, local_dir="/tmp/hf_base", local_dir_use_symlinks=False)
 
 
 
 
 
 
252
  _strip_bom_in_dir(base_dir)
253
 
254
  adapter_dir = None
255
  if ADAPTER_ID:
256
+ adapter_dir = snapshot_download(repo_id=ADAPTER_ID, local_dir="/tmp/hf_adapter", local_dir_use_symlinks=False)
 
 
 
 
 
257
  _strip_bom_in_dir(adapter_dir)
258
  _sanitize_adapter_config(adapter_dir)
259
 
 
263
 
264
  TOK = AutoTokenizer.from_pretrained(base_dir)
265
  base = AutoModelForSeq2SeqLM.from_pretrained(base_dir)
 
266
  MODEL = PeftModel.from_pretrained(base, adapter_dir) if adapter_dir else base
267
  MODEL.eval().to(DEVICE)
268
 
 
300
  return tok.decode(out[0], skip_special_tokens=True)
301
 
302
  def _preload_thread():
303
+ try: _load_model()
304
+ except Exception: pass
 
 
305
 
306
  if PRELOAD_MODEL:
307
  threading.Thread(target=_preload_thread, daemon=True).start()
 
341
  "prenorm": {"level": PRENORM_LEVEL, "debug": PRENORM_DEBUG},
342
  })
343
 
 
 
 
 
 
 
 
 
 
 
 
344
  # ---------- Auth & Pages ----------
345
  @app.get("/health")
346
  @app.get("/ping")
 
349
 
350
  @app.get("/login")
351
  def login_get():
352
+ if session.get("uid"): # sudah login → ke home
353
+ return redirect(url_for("index"))
354
  return render_template("login.html")
355
 
356
  @app.post("/login")
 
374
 
375
  @app.get("/register")
376
  def register_get():
377
+ if session.get("uid"): # sudah login → ke home
378
+ return redirect(url_for("index"))
379
  return render_template("register.html")
380
 
381
  @app.post("/register")
 
392
  return redirect(url_for("register_get"))
393
  u = User(email=email); set_password(u, pwd)
394
  s.add(u); s.commit()
 
 
395
 
396
+ # BEDA DI SINI: tidak auto-login. Wajib login manual.
397
+ flash("Registrasi berhasil. Silakan login.", "success")
398
+ return redirect(url_for("login_get"))
399
 
400
  @app.get("/logout")
401
  def logout():