from __future__ import annotations

import os, io, re, json, mimetypes, tempfile
from typing import Union, Tuple
from PIL import Image
import pandas as pd
import gradio as gr
import google.generativeai as genai
import pdfplumber
from concurrent.futures import ThreadPoolExecutor, as_completed
import fitz  # PyMuPDF
# ================== CONFIG ==================
# API keys are read from the environment (comma-separated) rather than being
# hardcoded; never commit real keys to source control. The variable name
# GEMINI_API_KEYS is a convention chosen here, not a library requirement.
DEFAULT_API_KEY = [
    k.strip() for k in os.environ.get("GEMINI_API_KEYS", "").split(",") if k.strip()
]
key_index = 0
INTERNAL_MODEL_MAP = {
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Gemini 2.5 Pro": "gemini-2.5-pro",
}
EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
PROMPT_FREIGHT_HEADER_JSON = """Please extract all header information and any notes outside the rate table in the document.
Answer in Vietnamese, concisely and clearly, formatted as a list.
In particular, identify and normalize the validity dates (valid from / to) from the document text, following these date-format rules exactly:
- DD/MM/YYYY, 01/MM/YYYY, 01/01/YYYY, or UFN if no clear information is available.
- If the carrier detail rows have no validity date, take it from the header.
"""
| PROMPT_FREIGHT_JSON = """ | |
| Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure: | |
| { | |
| "shipping_line": "...", | |
| "shipping_line_code": "...", | |
| "shipping_line_reason": "Why this carrier is chosen?", | |
| "fee_type": "Air Freight", | |
| "valid_from": ..., | |
| "valid_to": ..., | |
| "charges": [ | |
| { | |
| "frequency": "...", | |
| "package_type": "...", | |
| "base_package_type": "...", | |
| "aircraft_type": "...", | |
| "direction": "Export or Import or null", | |
| "origin": "...", | |
| "destination": "...", | |
| "charge_name": "...", | |
| "charge_code": "...", | |
| "charge_code_reason": "...", | |
| "cargo_type": "...", | |
| "currency": "...", | |
| "transit": "...", | |
| "transit_time": "...", | |
| "additional_cost": ..., | |
| "weight_breaks": { | |
| "M": ..., | |
| "N": ..., | |
| "+45kg": ..., | |
| "+100kg": ..., | |
| "+300kg": ..., | |
| "+500kg": ..., | |
| "+1000kg": ..., | |
| "other": { key: value }, | |
| "weight_breaks_reason": "Why chosen weight_breaks?" | |
| }, | |
| "remark": "..." | |
| } | |
| ], | |
| "local_charges": [ | |
| { | |
| "charge_name": "...", | |
| "charge_code": "...", | |
| "unit": "...", | |
| "amount": ..., | |
| "remark": "..." | |
| } | |
| ] | |
| } | |
| ============================================================ | |
| ### DATE RULES | |
| ============================================================ | |
| - **valid_from** format: | |
| - DD/MM/YYYY (if full date) | |
| - 01/MM/YYYY (if month + year only) | |
| - 01/01/YYYY (if year only) | |
| - UFN if missing | |
| - **valid_to**: | |
| - exact DD/MM/YYYY if present | |
| - else: UFN | |
| ============================================================ | |
| ### STRICT DATA RULES | |
| ============================================================ | |
| - ONLY return a single JSON object. | |
| - All rates must match the weight break columns (M, N, +45kg, etc.). | |
| - Use `null` if value is missing. | |
| - "RQ" or similar → set as `"RQST"`. | |
| - Group destinations with same rate using "/". | |
| - Use IATA codes for `origin` and `destination`. | |
| - Ignore flight numbers like "ZH118" for charge_code. | |
| - Frequency format: | |
| - D[1-7] (e.g. D1, D2345, D1234567) | |
| - Local charges: must include if found. | |
| - Validity fields (`valid_from`, `valid_to`): use rules above. | |
| - Direction: Export if from Vietnam (SGN, HAN, DAD...), otherwise Import. | |
| - Provide plain English for `shipping_line_reason` and `charge_code_reason`. | |
| - Replace commas in remarks with semicolons. | |
| - RETURN ONLY JSON — no explanations. | |
| ============================================================ | |
| ### PACKAGE TYPE & SURCHARGE LOGIC | |
| ============================================================ | |
| - Always treat **Carton** as the base rate. | |
| - Generate derived **Pallet** (or SKID) surcharges if found in remarks/notes. | |
| ▶️ Rules: | |
| 1️⃣ **SKID shipment surcharge** | |
| If remark says: | |
| "SKID shipment: add 10 cents (apply for GEN & PER)" | |
| → Add surcharge line (+0.10 USD/kg) for Pallet GEN/PER. | |
| - Increase all weight breaks by that value. | |
| - Keep origin, destination, etc. unchanged. | |
| - Mention derivation in `remark`. | |
| 2️⃣ **Regional surcharge** | |
| E.g.: | |
| "For SKID shipment: EU +USD0.30/kg and rest +USD0.20/kg (exclude RGN, MAA)" | |
| → Generate 2 surcharge lines accordingly. | |
| 3️⃣ **Carton = Pallet** | |
| If remark says: | |
| "Carton = Pallet" | |
| → Copy Carton rates into Pallet. | |
| Set `additional_cost` = 0. | |
| 4️⃣ **As per remark** | |
| If remark says: | |
| "For specific route with package type: as per remark" | |
| → Parse to determine logic. | |
| ============================================================ | |
| ### DERIVED CHARGE GENERATION | |
| ============================================================ | |
| - Derived charges must be appended to `"charges"` array. | |
| - Must include: | |
| - `"package_type": "Pallet"` | |
| - `"base_package_type": "Carton"` | |
| - `"additional_cost"` = numeric surcharge | |
| - `"remark"` stating derivation | |
| - Other fields (origin, destination...) must match base record. | |
| - DO NOT remove the Carton base record. | |
| ============================================================ | |
| ### EXAMPLES | |
| ============================================================ | |
| Base: | |
| { | |
| "package_type": "Carton", | |
| "cargo_type": "GEN", | |
| "origin": "SGN", | |
| "destination": "NRT", | |
| "currency": "USD", | |
| "weight_breaks": { | |
| "+45kg": 6.05, | |
| "+100kg": 5.30, | |
| "+300kg": 4.80 | |
| }, | |
| "remark": "Carton base rate" | |
| } | |
| Derived (from SKID remark): | |
| { | |
| "package_type": "Pallet", | |
| "base_package_type": "Carton", | |
| "cargo_type": "GEN, PER", | |
| "currency": "USD", | |
| "origin": "SGN", | |
| "destination": "NRT", | |
| "additional_cost": 0.10, | |
| "weight_breaks": { | |
| "+45kg": 6.15, | |
| "+100kg": 5.40, | |
| "+300kg": 4.90 | |
| }, | |
| "remark": "Derived from Carton; SKID shipment: add 10 cents (apply for GEN & PER)" | |
| } | |
| """ | |
# ================== HELPERS ==================
def get_next_key():
    """Round-robin over the configured API keys."""
    global key_index
    if not DEFAULT_API_KEY:
        raise RuntimeError("No Gemini API key configured (set GEMINI_API_KEYS).")
    key = DEFAULT_API_KEY[key_index % len(DEFAULT_API_KEY)]
    key_index += 1
    return key
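
# Illustrative sketch (not invoked anywhere): get_next_key() cycles through the
# configured keys round-robin, so after len(DEFAULT_API_KEY) calls the sequence
# repeats from the beginning regardless of the current key_index offset.
def _example_key_rotation() -> None:
    n = len(DEFAULT_API_KEY)
    keys = [get_next_key() for _ in range(2 * n)]
    assert keys[:n] == keys[n:]  # the second cycle repeats the first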

def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render each PDF page to a PIL image at 200 DPI."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    for page in doc:
        pix = page.get_pixmap(dpi=200)  # render once per page, not three times
        images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return images
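
# Illustrative sketch (not invoked; "sample.pdf" is a hypothetical local file):
# renders a PDF and reports how many page images were produced.
def _example_pdf_to_images() -> None:
    with open("sample.pdf", "rb") as f:
        pages = pdf_to_images(f.read())
    print(f"Rendered {len(pages)} page(s); first page size: {pages[0].size}")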

def _read_file_bytes(upload: Union[str, os.PathLike, dict, object, None]) -> bytes:
    if upload is None:
        raise ValueError("No file uploaded.")
    if isinstance(upload, (str, os.PathLike)):
        with open(upload, "rb") as f:
            return f.read()
    if isinstance(upload, dict) and "path" in upload:
        with open(upload["path"], "rb") as f:
            return f.read()
    if hasattr(upload, "read"):
        return upload.read()
    raise TypeError(f"Unsupported file object: {type(upload)}")

def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
    filename = "upload.bin"
    if isinstance(file, (str, os.PathLike)):
        filename = os.path.basename(str(file))
    elif isinstance(file, dict):
        filename = os.path.basename(file.get("name") or file.get("path", filename))
    mime, _ = mimetypes.guess_type(filename)
    if not mime and file_bytes[:4] == b"%PDF":
        mime = "application/pdf"
    # Only force a .pdf extension when the content is actually a PDF;
    # appending it unconditionally would mislabel image uploads.
    if mime == "application/pdf" and not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    return filename, mime or "application/octet-stream"
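
# Illustrative sketch (not invoked): when the filename has no extension, the
# magic-byte fallback still recognises PDF content and normalises the name.
def _example_guess_mime() -> None:
    name, mime = _guess_name_and_mime("upload", b"%PDF-1.7 ...")
    assert mime == "application/pdf" and name.endswith(".pdf")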

def safe_parse_json(text: str):
    cleaned = re.sub(r"```json|```", "", text).strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse JSON: {e}")
        print("📄 Raw text:\n", cleaned[:300])
        return None
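
# Illustrative sketch (not invoked): Gemini often wraps its JSON answer in a
# ```json fence; safe_parse_json strips the fence before parsing.
def _example_safe_parse_json() -> None:
    raw = '```json\n{"fee_type": "Air Freight"}\n```'
    assert safe_parse_json(raw) == {"fee_type": "Air Freight"}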

def check_pdf_structure(file_bytes: bytes) -> int:
    """
    Classify the PDF layout:
    - 0: single page, single carrier
    - 1: multiple pages, single carrier
    - 2: multiple carriers
    - -1: undetermined (parsing error)
    """
    try:
        airline_pattern = re.compile(r"(.*?CARGO.*?RATE\s+EX\s+[A-Z]{3})", re.IGNORECASE)
        airline_headers = set()
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue
                for line in text.splitlines():
                    match = airline_pattern.search(line.strip())
                    if match:
                        airline_name = match.group(1).strip().upper()
                        airline_headers.add(airline_name)
            total_pages = len(pdf.pages)
        if len(airline_headers) > 1:
            return 2
        elif total_pages > 1:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"❌ PDF analysis error: {e}")
        return -1  # an int so callers can compare against the codes above

# ================== PDF CHECK & SPLIT ==================
def split_excel_by_airline_header(excel_path, sheet_name=0):
    df = pd.read_excel(excel_path, header=None, sheet_name=sheet_name)
    airline_chunks = {}
    # Match any "... CARGO ... RATE EX <IATA>" header, mirroring the pattern
    # in check_pdf_structure (not just HAN-origin files).
    pattern = re.compile(r".*CARGO.*RATE\s+EX\s+[A-Z]{3}", re.IGNORECASE)
    start_indices, airline_names = [], []
    for i, row in df.iterrows():
        line = " ".join(str(cell) for cell in row if pd.notnull(cell))
        if pattern.match(line):
            start_indices.append(i)
            airline_names.append(line.strip())
    start_indices.append(len(df))
    for i in range(len(airline_names)):
        chunk_df = df.iloc[start_indices[i]:start_indices[i + 1]].reset_index(drop=True)
        airline_chunks[airline_names[i]] = chunk_df
    return airline_chunks
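
# Illustrative sketch (not invoked; "rates.xlsx" is a hypothetical local file):
# each detected "... CARGO ... RATE EX ..." header starts a new carrier chunk.
def _example_split_excel() -> None:
    chunks = split_excel_by_airline_header("rates.xlsx")
    for airline, chunk in chunks.items():
        print(f"{airline}: {len(chunk)} rows")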

def export_pdf_to_excel(pdf_path: str, excel_output_path: str):
    all_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            for table in tables:
                df = pd.DataFrame(table)
                df["__page__"] = page_num
                all_data.append(df)
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        final_df.to_excel(excel_output_path, index=False)
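
# Illustrative sketch (not invoked; both paths are hypothetical): the usual
# pairing of the two helpers above — dump every PDF table into one Excel
# sheet, then split that sheet back into per-carrier chunks.
def _example_pdf_to_chunks() -> None:
    export_pdf_to_excel("rates.pdf", "rates.xlsx")
    chunks = split_excel_by_airline_header("rates.xlsx")
    print(f"Found {len(chunks)} carrier section(s)")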

# ================== PARALLEL ==================
def send_to_gemini_for_json(df_chunk: pd.DataFrame, prompt: str, header: str) -> dict:
    print(f"Begin processing chunk ({len(df_chunk)} rows)")
    table_text = df_chunk.to_csv(index=False)
    full_prompt = (
        f"{prompt}\n\n"
        f"Below are the header and notes: {header}\n"
        f"Below is the table text (CSV):\n{table_text}\n"
        "Return the JSON."
    )
    result_text, _ = run_process_internal_base_v2(
        file_bytes=None,
        filename=None,
        mime=None,
        question=full_prompt,
        model_choice="Gemini 2.5 Flash",
        temperature=0.4,
        top_p=1.0
    )
    return safe_parse_json(result_text)

def process_all_chunks_with_threadpool(chunks: dict[str, pd.DataFrame], prompt: str, header: str, max_workers: int = 5) -> list[dict]:
    all_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(send_to_gemini_for_json, chunk, prompt, header): airline
            for airline, chunk in chunks.items()
        }
        for future in as_completed(futures):
            airline = futures[future]
            try:
                result = future.result()
                if result:
                    all_results.extend(result if isinstance(result, list) else [result])
            except Exception as e:
                print(f"❌ Error with {airline}: {e}")
    return all_results
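
# Illustrative sketch (not invoked; makes real Gemini calls when run, and the
# file path and header note are hypothetical): fans per-carrier chunks out to
# the model and collects the merged JSON records.
def _example_parallel_extraction() -> None:
    chunks = split_excel_by_airline_header("rates.xlsx")
    header_note = "Valid from 01/01/2025 to UFN"
    results = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header_note, max_workers=5)
    print(f"Collected {len(results)} record(s)")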

# ================== GEMINI BASE ==================
def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
    api_key = get_next_key()
    genai.configure(api_key=api_key)
    model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
    print(f"Using key: {api_key[:8]}…")  # never log the full key
    user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
    model = genai.GenerativeModel(model_name=model_name,
                                  generation_config={"temperature": float(temperature), "top_p": float(top_p)})
    if file_bytes is None:
        response = model.generate_content(user_prompt)
        return response.text, None
    pages = pdf_to_images(file_bytes)
    all_text_results = []
    for i in range(0, len(pages), batch_size):
        batch = pages[i:i + batch_size]
        uploaded = []
        for im in batch:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                im.save(tmp.name)
            up = genai.upload_file(path=tmp.name, mime_type="image/png")
            uploaded.append(genai.get_file(up.name))
            os.remove(tmp.name)  # clean up the temporary PNG after upload
        resp = model.generate_content([user_prompt] + uploaded)
        all_text_results.append(resp.text if hasattr(resp, "text") else "")
        for up in uploaded:
            try:
                genai.delete_file(up.name)
            except Exception:
                pass
    return "\n\n".join(all_text_results), None

# ================== MAIN ROUTER ==================
def run_process(file, question, model_choice, temperature, top_p, external_api_url):
    try:
        if file is None:
            return "ERROR: No file uploaded.", None
        file_bytes = _read_file_bytes(file)
        filename, mime = _guess_name_and_mime(file, file_bytes)
        # STEP 1️⃣: Check PDF structure
        if mime == "application/pdf" or file_bytes[:4] == b"%PDF":
            check_result = check_pdf_structure(file_bytes)
            all_dfs = []
            saved_header = None
            if check_result > 1:
                print("➡️ PDF contains multiple carriers → extract with pdfplumber first, then Gemini.")
                base_name = os.path.splitext(filename)[0]
                tmp_dir = tempfile.gettempdir()
                # Write the PDF to a temporary file for processing
                tmp_pdf_path = os.path.join(tmp_dir, f"{base_name}.pdf")
                with open(tmp_pdf_path, "wb") as f:
                    f.write(file_bytes)
                # Dump all tables to a temporary Excel file, then split by carrier
                excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
                export_pdf_to_excel(tmp_pdf_path, excel_path)
                chunks = split_excel_by_airline_header(excel_path)
                header, _ = run_process_internal_base_v2(
                    file_bytes=file_bytes,
                    filename=filename,
                    mime=mime,
                    question=PROMPT_FREIGHT_HEADER_JSON,
                    model_choice=model_choice,
                    temperature=temperature,
                    top_p=top_p
                )
                print(header)
                # Fan every carrier chunk out to the model in parallel, once
                result = process_all_chunks_with_threadpool(chunks, PROMPT_FREIGHT_JSON, header, 5)
                return json.dumps(result, ensure_ascii=False, indent=2), None
            else:
                with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
                    for page_idx, page in enumerate(pdf.pages, start=1):
                        print(f"📄 Processing page {page_idx}...")
                        table = page.extract_table({
                            "vertical_strategy": "lines",
                            "horizontal_strategy": "text",
                            "snap_tolerance": 3,
                            "intersection_tolerance": 5,
                        })
                        if not table or len(table) < 2:
                            print(f"⚠️ Page {page_idx}: no valid table detected.")
                            continue
                        header = table[0]
                        rows = table[1:]
                        # Remember the first header we see
                        if saved_header is None:
                            saved_header = header
                            print(f"✅ Page {page_idx}: saved first header: {saved_header}")
                        # If a later page has no clear header, reuse the previous one
                        # (header[0] can be None, so coerce before the substring test)
                        if len(header) < len(saved_header) or "REGION" not in str(header[0] or ""):
                            print(f"↩️ Page {page_idx}: no clear header, reusing the previous one.")
                            header = saved_header
                            rows = table
                        else:
                            saved_header = header  # update with the valid header
                        if len(rows) == 0:
                            print(f"⚠️ Page {page_idx}: no data rows under the header.")
                            continue
                        try:
                            df = pd.DataFrame(rows, columns=header)
                            all_dfs.append(df)
                            print(f"✅ Page {page_idx}: {len(df)} rows added.")
                        except Exception as e:
                            print(f"❌ Failed to build a DataFrame on page {page_idx}: {e}")
                if all_dfs:
                    final_df = pd.concat(all_dfs, ignore_index=True).dropna(how="all").reset_index(drop=True)
                    print(f"✅ Extracted {len(final_df)} rows from the PDF in total.")
                    # Write a temporary Excel copy for inspection
                    base_name = os.path.splitext(filename)[0]
                    tmp_dir = tempfile.gettempdir()
                    excel_path = os.path.join(tmp_dir, f"{base_name}.xlsx")
                    final_df.to_excel(excel_path, index=False)
                    # Convert the table to CSV text so Gemini can read it
                    table_text = final_df.to_csv(index=False)
                    print("✅ Building the prompt from the extracted CSV")
                    question = (
                        f"{PROMPT_FREIGHT_JSON}\n"
                        "Below is the table text extracted from the PDF (CSV format):\n"
                        f"{table_text}\n\n"
                        "Please convert this into valid JSON as per the schema."
                    )
                else:
                    print("⚠️ No valid table could be extracted with pdfplumber; falling back to the raw file.")
        result_text, _ = run_process_internal_base_v2(
            file_bytes=file_bytes, filename=filename, mime=mime,
            question=question, model_choice=model_choice,
            temperature=temperature, top_p=top_p
        )
        return result_text, None
    except Exception as e:
        return f"ERROR: {type(e).__name__}: {str(e)}", None

# ================== UI ==================
def main():
    with gr.Blocks(title="OCR Multi-Agent System") as demo:
        file = gr.File(label="Upload PDF/Image")
        question = gr.Textbox(label="Prompt", lines=2)
        model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                   value="Gemini 2.5 Flash", label="Model")
        temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
        external_api_url = gr.Textbox(label="External API URL", visible=False)
        output_text = gr.Code(label="Output", language="json")
        run_btn = gr.Button("🚀 Process")
        run_btn.click(
            run_process,
            inputs=[file, question, model_choice, temperature, top_p, external_api_url],
            outputs=[output_text, gr.State()]
        )
    return demo

demo = main()

if __name__ == "__main__":
    demo.launch()