vithacocf commited on
Commit
57cc5e9
verified
1 Parent(s): db22021

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -1
app.py CHANGED
@@ -22,7 +22,81 @@ try:
22
  RESAMPLE = Image.Resampling.LANCZOS # Pillow >= 10
23
  except AttributeError:
24
  RESAMPLE = Image.LANCZOS # Pillow < 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ================== HELPERS ==================
27
  import fitz # PyMuPDF
28
 
@@ -248,6 +322,41 @@ def preview_process(file):
248
  print(f"Preview error: {e}")
249
  return []
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  # -------- Internal (Gemini) - Base (single pass, no thinking) --------
252
  def run_process_internal_base(file_bytes, filename, mime, question, model_choice,
253
  temperature, top_p):
@@ -302,6 +411,57 @@ def run_process_internal_base(file_bytes, filename, mime, question, model_choice
302
  genai.delete_file(uploaded.name)
303
  except Exception:
304
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  # -------- External API --------
307
  def run_process_external(file_bytes, filename, mime, question, api_url,
@@ -366,7 +526,7 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
366
  temperature=temperature, top_p=top_p
367
  )
368
 
369
- return run_process_internal_base(
370
  file_bytes=file_bytes, filename=filename, mime=mime,
371
  question=question, model_choice=model_choice,
372
  temperature=temperature, top_p=top_p
 
22
  RESAMPLE = Image.Resampling.LANCZOS # Pillow >= 10
23
  except AttributeError:
24
  RESAMPLE = Image.LANCZOS # Pillow < 10
25
+ PROMPT_FREIGHT_JSON = """
26
+ Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
27
+ {
28
+ "shipping_line": "...",
29
+ "shipping_line_code": "...",
30
+ "shipping_line_reason": "Why this carrier is chosen?",
31
+ "fee_type": "Air Freight",
32
+ "valid_from": ...,
33
+ "valid_to": ...,
34
+ "charges": [
35
+ {
36
+ "frequency": "...",
37
+ "package_type": "...",
38
+ "aircraft_type": "...",
39
+ "direction": "Export or Import or null",
40
+ "origin": "...",
41
+ "destination": "...",
42
+ "charge_name": "...",
43
+ "charge_code": "...",
44
+ "charge_code_reason": "...",
45
+ "cargo_type": "...",
46
+ "currency": "...",
47
+ "transit": "...",
48
+ "transit_time": "...",
49
+ "weight_breaks": {
50
+ "min": ...,
51
+ "-45kg": ...,
52
+ "45kg": ...,
53
+ "100kg": ...,
54
+ "300kg": ...,
55
+ "500kg": ...,
56
+ "1000kg": ...,
57
+ "other": {
58
+ key: value
59
+ }
60
+ },
61
+ "remark": "..."
62
+ }
63
+ ],
64
+ "local_charges": [
65
+ {
66
+ "charge_name": "...",
67
+ "charge_code": "...",
68
+ "unit": "...",
69
+ "amount": ...,
70
+ "remark": "..."
71
+ }
72
+ ]
73
+ }
74
 
75
+ ### Date rules
76
+ - valid_from format:
77
+ - `DD/MM/YYYY` (if full date)
78
+ - `01/MM/YYYY` (if month+year only)
79
+ - `01/01/YYYY` (if year only)
80
+ - `UFN` if missing
81
+ - valid_to:
82
+ - exact `DD/MM/YYYY` if present
83
+ - else `UFN`
84
+
85
+ STRICT RULES:
86
+ - ONLY return a single JSON object as specified above.
87
+ - All rates must exactly match the corresponding weight break columns (45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
88
+ - If the table shows "RQ" or similar, set value as "RQST".
89
+ - Group same-price destinations into one record separated by "/".
90
+ - Always use IATA code for origin and destination.
91
+ - Flight number (e.g. ZH118) is not charge code.
92
+ - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
93
+ - If local charges exist, list them.
94
+ - If validity missing, set null.
95
+ - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
96
+ - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
97
+ - Replace commas in remarks with semicolons.
98
+ - Only return JSON.
99
+ """
100
  # ================== HELPERS ==================
101
  import fitz # PyMuPDF
102
 
 
322
  print(f"Preview error: {e}")
323
  return []
324
 
325
+ def _merge_freight_objects(objs: list[dict]) -> dict | None:
326
+ if not objs: return None
327
+ base = {}
328
+ for k in ["shipping_line","shipping_line_code","shipping_line_reason","fee_type","valid_from","valid_to"]:
329
+ for o in objs:
330
+ if isinstance(o, dict) and o.get(k):
331
+ base[k] = o[k]
332
+ break
333
+ base.setdefault(k, None)
334
+
335
+ seen = set()
336
+ merged_charges, merged_local = [], []
337
+ def norm(v): return v.replace(",", ";") if isinstance(v, str) else v
338
+
339
+ for o in objs:
340
+ for c in (o.get("charges") or []):
341
+ wb = json.dumps(c.get("weight_breaks", {}), sort_keys=True, ensure_ascii=False)
342
+ key = (c.get("origin"), c.get("destination"), c.get("charge_name"), c.get("charge_code"), c.get("currency"), wb)
343
+ if key in seen: continue
344
+ c["remark"] = norm(c.get("remark"))
345
+ merged_charges.append(c)
346
+ seen.add(key)
347
+ for lc in (o.get("local_charges") or []):
348
+ lc["remark"] = norm(lc.get("remark"))
349
+ merged_local.append(lc)
350
+
351
+ base["charges"] = merged_charges
352
+ base["local_charges"] = merged_local
353
+ return base
354
def _coerce_only_json(text: str) -> str:
    """Reduce a model reply to JSON text where possible.

    If ``_extract_json_from_message`` yields an object, it is re-serialised
    with ``json.dumps``. Otherwise the span from the first ``{`` to a ``}``
    that ends the text (ignoring trailing whitespace) is returned, and if no
    such span exists the stripped input is returned unchanged.
    """
    parsed, _ = _extract_json_from_message(text)
    if parsed is None:
        trailing = re.search(r"\{.*\}\s*$", text, flags=re.DOTALL)
        if trailing:
            return trailing.group(0)
        return text.strip()
    return json.dumps(parsed, ensure_ascii=False)
360
  # -------- Internal (Gemini) - Base (single pass, no thinking) --------
361
  def run_process_internal_base(file_bytes, filename, mime, question, model_choice,
362
  temperature, top_p):
 
411
  genai.delete_file(uploaded.name)
412
  except Exception:
413
  pass
414
+ # ================== MAIN OCR FUNCTION ==================
415
def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
    """Send a document to Gemini in page batches and merge the JSON replies.

    A PDF (detected by the ``%PDF`` magic bytes) is rendered to page images
    via ``pdf_to_images``; any other input is opened as a single image. Pages
    are uploaded ``batch_size`` at a time as PNG files, one ``generate_content``
    call is made per batch, and each reply is parsed as JSON. Remote uploads
    and local temp files are removed after each batch, even on failure.

    Args:
        file_bytes: Raw bytes of the uploaded document.
        filename: Not used by this function.
        mime: Not used by this function.
        question: Prompt text; when blank, ``PROMPT_FREIGHT_JSON`` is used.
        model_choice: Key looked up in ``INTERNAL_MODEL_MAP``; unknown keys
            fall back to ``"gemini-2.5-flash"``.
        temperature: Sampling temperature, converted with ``float``.
        top_p: Nucleus sampling value, converted with ``float``.
        batch_size: Pages per request; values below 1 are treated as 1.

    Returns:
        A ``(message, parsed)`` tuple. If at least one batch produced a JSON
        object, ``message`` is the pretty-printed merged object and ``parsed``
        is that object. Otherwise ``message`` is built from the raw replies
        via ``_pretty_message`` and ``parsed`` is whatever
        ``_extract_json_from_message`` recovers from them. When no API key is
        available the result is ``("ERROR: Missing GOOGLE_API_KEY.", None)``.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    if not api_key:
        return "ERROR: Missing GOOGLE_API_KEY.", None
    genai.configure(api_key=api_key)

    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config={"temperature": float(temperature), "top_p": float(top_p)},
    )

    if file_bytes[:4] == b"%PDF":
        pages = pdf_to_images(file_bytes)
    else:
        pages = [Image.open(io.BytesIO(file_bytes))]

    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    # range() rejects a zero step, and a negative step would skip every page.
    step = max(1, int(batch_size))
    all_json_results, all_text_results = [], []

    for batch_no, start in enumerate(range(0, len(pages), step), start=1):
        uploaded = []
        # Uploading happens inside the try so that a failure part-way through
        # a batch still deletes the files that were already uploaded.
        try:
            for im in pages[start:start + step]:
                tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
                # Close first: the path is reopened by name for writing below.
                tmp.close()
                try:
                    im.save(tmp.name)
                    up = genai.upload_file(path=tmp.name, mime_type="image/png")
                    # Track the handle before waiting so it is cleaned up even
                    # if the wait raises.
                    uploaded.append(up)
                    uploaded[-1] = _wait_file_active(up)
                finally:
                    # delete=False means nothing else removes the temp file.
                    try:
                        os.remove(tmp.name)
                    except OSError:
                        pass

            prompt = f"{user_prompt}\n(This is batch {batch_no})"
            resp = model.generate_content([prompt] + uploaded)
            text = _safe_text_from_gemini(resp)
            json_text = _coerce_only_json(text)
            try:
                parsed = json.loads(json_text)
            except (ValueError, TypeError):
                parsed = None

            # Only JSON objects can be merged; a top-level array contributes
            # whichever of its elements are objects.
            if isinstance(parsed, dict):
                candidates = [parsed]
            elif isinstance(parsed, list):
                candidates = [p for p in parsed if isinstance(p, dict)]
            else:
                candidates = []

            if candidates:
                all_json_results.extend(candidates)
            else:
                all_text_results.append(text)
        finally:
            for up in uploaded:
                # Best-effort remote cleanup; a failed delete must not mask
                # the batch result or an in-flight exception.
                try:
                    genai.delete_file(up.name)
                except Exception:
                    pass

    if all_json_results:
        merged_json = _merge_freight_objects(all_json_results)
        message = json.dumps(merged_json, ensure_ascii=False, indent=2)
        return message, merged_json

    combined_text = "\n\n".join(all_text_results)
    message = _pretty_message(combined_text)
    parsed_obj, _ = _extract_json_from_message(combined_text)
    return message, parsed_obj
465
 
466
  # -------- External API --------
467
  def run_process_external(file_bytes, filename, mime, question, api_url,
 
526
  temperature=temperature, top_p=top_p
527
  )
528
 
529
+ return run_process_internal_base_v2(
530
  file_bytes=file_bytes, filename=filename, mime=mime,
531
  question=question, model_choice=model_choice,
532
  temperature=temperature, top_p=top_p