Instructions to use pemix09/paperstack_document_data_retrieval with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TF-Keras
How to use pemix09/paperstack_document_data_retrieval with TF-Keras:
```python
# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
# See https://github.com/keras-team/tf-keras for more details.
from huggingface_hub import from_pretrained_keras

model = from_pretrained_keras("pemix09/paperstack_document_data_retrieval")
```
- Notebooks
- Google Colab
- Kaggle
| import os | |
| import json | |
| import pytesseract | |
| from pathlib import Path | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| from langchain_ollama import OllamaLLM | |
# --- CONFIGURATION ---
# Path to the tesseract executable. The default is the original hard-coded
# Homebrew path; set the TESSERACT_CMD environment variable to use a
# different installation without editing this file.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'/opt/homebrew/bin/tesseract'
)

# Input folder
INPUT_DIR = "scans"
HISTORY_FILE = "processed_real_scans_files.txt"  # File listing the scans already done
MODEL_NAME = "llama3"

# Language definitions: language code -> language name used in prompts
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian"
}

# LLM client shared by ask_llm_json and ask_llm_text.
llm = OllamaLLM(model=MODEL_NAME, temperature=0)

# NEW, CONSOLIDATED LIST OF TYPES (consistent with the new Enum)
ALLOWED_TYPES = [
    # Financial
    "taxDocument", "invoice", "receipt", "utilityBill", "bankStatement",
    "loanAgreement", "insurancePolicy",
    # Legal
    "notarialDeed", "courtDocument", "powerOfAttorney", "contract",
    # Personal
    "idCard", "passport", "birthCertificate", "marriageCertificate",
    "deathCertificate", "officialCertificate", "drivingLicense",
    "educationDocument", "cv",
    # Health
    "medicalDocument", "prescription", "referral", "vaccinationCard",
    "sanitaryBooklet",
    # Property
    "propertyDeed", "rentalAgreement", "vehicleDocument", "technicalInspection",
    # Other
    "documentScan", "application", "certificate", "other"
]
# --- HISTORY HANDLING (RESUME) ---
def load_history():
    """Return the set of entries recorded in the history file.

    Every non-blank line of ``HISTORY_FILE`` becomes one entry, with
    surrounding whitespace stripped. A set is used for fast membership
    tests. When the file does not exist an empty set is returned.
    """
    entries = set()
    if os.path.exists(HISTORY_FILE):
        with open(HISTORY_FILE, 'r', encoding='utf-8') as history:
            for raw_line in history:
                entry = raw_line.strip()
                if entry:
                    entries.add(entry)
    return entries
def mark_as_done(rel_path):
    """Append ``rel_path`` as one new line at the end of the history file."""
    with open(HISTORY_FILE, 'a', encoding='utf-8') as history:
        entry = f"{rel_path}\n"
        history.write(entry)
# --- OCR AND LLM ---
def perform_ocr(file_path):
    """Run Tesseract OCR (Polish + English) on a PDF or an image file.

    Args:
        file_path: Path object of the scan. Its ``suffix`` selects the PDF
            branch and its ``name`` is used in the error message.

    Returns:
        The recognised text. For a PDF, every page produced by
        ``convert_from_path`` is OCR'd and the page texts are concatenated
        in order. If anything fails, the error is printed and the text
        collected so far (possibly an empty string) is returned.
    """
    langs = 'pol+eng'
    # Collect per-page text and join once at the end; text gathered before a
    # failure is still returned, as in a best-effort run.
    chunks = []
    try:
        if file_path.suffix.lower() == ".pdf":
            for page in convert_from_path(file_path):
                chunks.append(pytesseract.image_to_string(page, lang=langs))
        else:
            # The context manager closes the image's file handle, which would
            # otherwise stay open for every scan processed in one run.
            with Image.open(file_path) as image:
                chunks.append(pytesseract.image_to_string(image, lang=langs))
    except Exception as e:
        print(f" [!] Błąd OCR: {file_path.name}: {e}")
    return "".join(chunks)
def ask_llm_json(prompt):
    """Send ``prompt`` to the LLM and parse the JSON object in its reply.

    Markdown code fences are removed from the reply, then everything from
    the first ``{`` to the last ``}`` is parsed, so text surrounding the
    object is tolerated.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the
        reply contains no ``{...}`` span, or the span is not valid JSON.
        The reason for a failure is printed instead of being dropped.
    """
    try:
        response = llm.invoke(prompt)
        clean = response.replace("```json", "").replace("```", "").strip()
        start, end = clean.find('{'), clean.rfind('}')
        if start == -1 or end < start:
            print(" [!] Błąd LLM: brak obiektu JSON w odpowiedzi.")
            return None
        return json.loads(clean[start:end + 1])
    except Exception as e:
        # Callers rely on None for any failure, so nothing is re-raised;
        # the cause is reported so a failed run can be diagnosed.
        print(f" [!] Błąd LLM (JSON): {e}")
        return None
def ask_llm_text(prompt):
    """Send ``prompt`` to the LLM and return its reply as plain text.

    Surrounding whitespace is stripped first, then any leading/trailing
    double quotes, then any leading/trailing single quotes.

    Returns:
        The cleaned reply, or the sentinel string ``"Translation Error"``
        when the call fails. The failure reason is printed.
    """
    try:
        response = llm.invoke(prompt)
        return response.strip().strip('"').strip("'")
    except Exception as e:
        # The sentinel return value is kept for existing callers; the cause
        # is reported rather than silently discarded.
        print(f" [!] Błąd LLM (tekst): {e}")
        return "Translation Error"
# --- PROCESSING LOGIC ---
def get_core_metadata(text, hinted_type=None):
    """Ask the LLM for the language-independent metadata of a document.

    Args:
        text: OCR text of the document; only the first 4000 characters are
            placed in the prompt.
        hinted_type: Optional folder name. It is passed to the model as a
            hint only when it is one of ``ALLOWED_TYPES``.

    Returns:
        Whatever ``ask_llm_json`` returns for the prompt. The prompt asks
        for the keys ``title_base``, ``summary_base``, ``category``,
        ``type`` and ``info``.
    """
    print(" 🧠 Analiza struktury dokumentu (Core Metadata)...")
    # If the folder suggests a type, pass it along as a hint.
    hint_str = (
        f"Strong Hint: The document is likely located in folder '{hinted_type}'."
        if hinted_type in ALLOWED_TYPES
        else ""
    )
    type_choices = ", ".join(ALLOWED_TYPES)
    excerpt = text[:4000]
    prompt = f"""
Analyze the following document text.
{hint_str}
Extract structured data.
RULES:
1. 'summary_base': Write a factual summary in ENGLISH (5 sentences).
2. 'title_base': Write a title in ENGLISH format: "[Specific Type] - [Entity] - [Date]".
(e.g., "Tax Document (PIT-11) - Employer Name - 2023")
3. 'category': Must be one of: financial, legal, personal, health, property, other.
4. 'type': Choose the BEST MATCH from this specific list: {type_choices}.
5. 'info': Specific details (e.g. "PIT-11", "Umowa o pracę", "Prąd").
Return ONLY JSON:
{{
"title_base": "...",
"summary_base": "...",
"category": "...",
"type": "...",
"info": "..."
}}
TEXT:
{excerpt}
"""
    return ask_llm_json(prompt)
def translate_section(text, target_lang, content_type="text"):
    """Ask the LLM to translate ``text`` into ``target_lang``.

    Args:
        text: The text to translate.
        target_lang: Name of the target language as written into the prompt.
        content_type: Word describing what is translated (e.g. a title or a
            summary); it is inserted into the prompt.

    Returns:
        The result of ``ask_llm_text`` for the translation prompt.
    """
    prompt_lines = [
        "",
        f"Translate the following {content_type} into {target_lang}.",
        "Output ONLY the translation. No explanations. No markdown.",
        "TEXT TO TRANSLATE:",
        f"{text}",
        "",
    ]
    prompt = "\n".join(prompt_lines)
    return ask_llm_text(prompt)
def save_file(root_folder, lang_code, sub_dir, filename, content):
    """Write ``content`` to ``root_folder/lang_code/sub_dir/filename``.

    Missing directories are created. ``content`` is converted with ``str``
    and written as UTF-8, replacing any existing file.
    """
    target_dir = Path(root_folder).joinpath(lang_code, sub_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename
    with target.open("w", encoding="utf-8") as out:
        out.write(str(content))
def save_meta(root_folder, sub_dir, filename, content):
    """Write ``content`` to ``root_folder/sub_dir/filename``.

    Missing directories are created. ``content`` is converted with ``str``
    and written as UTF-8, replacing any existing file.
    """
    target_dir = Path(root_folder).joinpath(sub_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / filename
    with target.open("w", encoding="utf-8") as out:
        out.write(str(content))
def process_file(file_path, input_root):
    """OCR one scan, extract its metadata with the LLM and write the outputs.

    Steps:
        1. OCR the file. An empty result is recorded in the history (no
           outputs are written) so the file is not retried.
        2. Save the raw OCR text under ``content/``.
        3. Ask the LLM for core metadata and save ``category``, ``type`` and
           ``info``. If the analysis fails, return without recording the
           file, so it is picked up again on the next run.
        4. For every language in ``TARGET_LANGUAGES`` save a title under
           ``titles/<code>/`` and a summary under ``summary/<code>/``.
           English uses the base texts directly; other languages are
           translated. A failed translation aborts the file without
           recording it, so the error sentinel is never stored as a label.
        5. Record the file in the history only after everything succeeded.

    Args:
        file_path: Path of the scan; must be located under ``input_root``.
        input_root: Root folder of the scans; outputs mirror the scan's
            sub-directory relative to it.
    """
    # Sentinel that ask_llm_text (reached through translate_section) returns
    # when the LLM call fails.
    translation_failed = "Translation Error"

    rel_path = file_path.relative_to(input_root)
    rel_path_str = str(rel_path)  # Key used in the history file
    base_filename = rel_path.stem + ".txt"
    sub_dir = rel_path.parent
    # A file directly in input_root has parent '.', whose name is the empty
    # string; treat that as "no hint".
    hinted_type = sub_dir.name or None

    # 1. OCR
    raw_text = perform_ocr(file_path)
    if not raw_text.strip():
        print(" ⚠️ Pusty OCR - oznaczam jako przetworzony (bez wyników).")
        mark_as_done(rel_path_str)
        return

    # Save the original (content) - this stays, because it is the input data
    save_meta("content", sub_dir, base_filename, raw_text)

    # 2. Basic analysis (core)
    core_data = get_core_metadata(raw_text, hinted_type)
    if not core_data:
        print(" ❌ Błąd analizy AI. Przerywam dla tego pliku.")
        return

    # Save language-independent data. `or` also replaces JSON null / empty
    # values, which would otherwise be written as the text "None" or "".
    save_meta("category", sub_dir, base_filename, core_data.get("category") or "other")
    save_meta("type", sub_dir, base_filename, core_data.get("type") or "other")
    save_meta("info", sub_dir, base_filename, core_data.get("info") or "none")
    base_title = core_data.get("title_base") or "Document"
    base_summary = core_data.get("summary_base") or "No summary."

    # 3. Translation loop (labels only)
    print(" 🌍 Rozpoczynam generowanie etykiet (tytuły/podsumowania)...")
    # (output root folder, English base text, content type named in the prompt)
    labels = (
        ("titles", base_title, "title"),
        ("summary", base_summary, "summary"),
    )
    for code, lang_name in TARGET_LANGUAGES.items():
        print(f" -> [{code.upper()}] {lang_name}...", end="", flush=True)
        for root_folder, base_text, content_type in labels:
            if code == "en":
                final_text = base_text
            else:
                final_text = translate_section(base_text, lang_name, content_type)
                if final_text == translation_failed:
                    # Not recorded in the history, so the next run retries
                    # this file and overwrites any labels written so far.
                    print(" ❌ Błąd tłumaczenia. Przerywam dla tego pliku.")
                    return
            save_file(root_folder, code, sub_dir, base_filename, final_text)
        # Full content translation was removed (saves time and tokens)
        print(" OK.")

    # Success! Only here is the file written to the history
    print(f"✅ Zakończono: {file_path.name}")
    mark_as_done(rel_path_str)
def main():
    """Process every supported scan under ``INPUT_DIR`` not yet in the history.

    Files with a .pdf, .jpg, .png or .jpeg suffix (case-insensitive) are
    collected recursively. Files already listed in the history are skipped.
    An error in one file is printed and the run continues with the next
    file; a keyboard interrupt stops the loop.
    """
    input_root = Path(INPUT_DIR)
    if not input_root.exists():
        print(f"Brak folderu wejściowego: {INPUT_DIR}")
        return

    # Load the history
    processed_files = load_history()
    print(f"📂 Załadowano historię: {len(processed_files)} plików już przetworzonych.")

    supported_suffixes = [".pdf", ".jpg", ".png", ".jpeg"]
    all_files = []
    for candidate in input_root.rglob("*"):
        if candidate.is_file() and candidate.suffix.lower() in supported_suffixes:
            all_files.append(candidate)
    print(f"🚀 Znaleziono łącznie {len(all_files)} plików do analizy.")

    for scan in all_files:
        rel_path_str = str(scan.relative_to(input_root))

        # Check against the history
        if rel_path_str in processed_files:
            print(f"⏩ Pomijam (już w historii): {rel_path_str}")
            continue

        print(f"\n📄 Przetwarzanie: {rel_path_str}")
        try:
            process_file(scan, input_root)
        except KeyboardInterrupt:
            print("\n🛑 Zatrzymano przez użytkownika. Postęp zapisany.")
            break
        except Exception as e:
            print(f"\n❌ Krytyczny błąd dla {rel_path_str}: {e}")


if __name__ == "__main__":
    main()