Instructions to use pemix09/paperstack_document_data_retrieval with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TF-Keras
How to use pemix09/paperstack_document_data_retrieval with TF-Keras:
# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
# See https://github.com/keras-team/tf-keras for more details.
from huggingface_hub import from_pretrained_keras

model = from_pretrained_keras("pemix09/paperstack_document_data_retrieval")
- Notebooks
- Google Colab
- Kaggle
| import os | |
| import json | |
| from pathlib import Path | |
| from langchain_ollama import OllamaLLM | |
# --- CONFIGURATION ---
# Directory scanned for input .txt files.
INPUT_DIR = "synthetic_content"
# Root directory under which every generated output file is written.
OUTPUT_ROOT = "synthetic_dataset"
# Text file listing the relative paths that were already processed.
HISTORY_FILE = "processed_synthetic_scans_contents.txt"
# Name of the model handed to the LLM client.
MODEL_NAME = "llama3"

# Language definitions: output folder code -> language name used in prompts.
TARGET_LANGUAGES = {
    "pl": "Polish",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "it": "Italian",
    "uk": "Ukrainian",
}
# Shared LLM client, created with a low temperature for repeatable output.
llm = OllamaLLM(
    model=MODEL_NAME,
    temperature=0,
)
# --- HISTORY HANDLING (RESUME) ---
def load_history(history_file=None):
    """Return the set of entries recorded in the history file.

    Args:
        history_file: Path of the history file to read. When omitted, the
            module-level ``HISTORY_FILE`` is used.

    Returns:
        A set holding every non-blank line of the file with surrounding
        whitespace stripped, or an empty set when the file does not exist.
    """
    if history_file is None:
        history_file = HISTORY_FILE
    try:
        # Open directly instead of checking for existence first: the file
        # could disappear between the check and the open.
        with open(history_file, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            return {entry for entry in stripped_lines if entry}
    except FileNotFoundError:
        # No history file yet, so nothing has been processed.
        return set()
def mark_as_done(rel_path):
    """Append ``rel_path`` as one new line at the end of ``HISTORY_FILE``."""
    with open(HISTORY_FILE, mode="a", encoding="utf-8") as history:
        history.write("{}\n".format(rel_path))
# --- LLM PROMPTS ---
def ask_llm_json(prompt):
    """Call the LLM in JSON mode and safely parse the result.

    Returns the value decoded from the model's reply, or ``None`` when the
    call fails or the reply is not valid JSON (a warning is printed).
    """
    parsed = None
    try:
        # format="json" is the key Ollama feature that forces valid JSON.
        raw_reply = llm.invoke(prompt, format="json")
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as err:
        print(f"\n ⚠️ Błąd składni JSON od AI: {err}")
    except Exception as err:
        print(f"\n ⚠️ Błąd komunikacji z LLM: {err}")
    return parsed
def ask_llm_text(prompt):
    """Call the LLM and return its reply as cleaned plain text.

    The reply is stripped of surrounding whitespace, then of surrounding
    double quotes, then of surrounding single quotes.

    Returns:
        The cleaned reply, or the literal string ``"Translation Error"`` when
        the call fails. The failure is reported on stdout instead of being
        swallowed silently, in the same format ``ask_llm_json`` uses.
    """
    try:
        response = llm.invoke(prompt)
        return response.strip().strip('"').strip("'")
    except Exception as e:
        # Keep the sentinel return value callers already rely on, but make
        # the failure visible so bad output can be traced to its cause.
        print(f"\n ⚠️ Błąd komunikacji z LLM: {e}")
        return "Translation Error"
def get_metadata(text, hinted_type):
    """Ask the LLM for document metadata and return the parsed JSON reply.

    Only the first 3500 characters of ``text`` are included in the prompt;
    ``hinted_type`` is passed along as a folder hint. The return value is
    whatever ``ask_llm_json`` returns for that prompt.
    """
    # The prompt carries explicit instructions about the JSON format.
    prompt_lines = [
        "",
        "Analyze this document text.",
        f"Folder hint: {hinted_type}",
        "Return ONLY a JSON object with these keys:",
        '- "title_base": Factual title in ENGLISH (format: "[Type] - [Entity] - [Date]")',
        '- "summary_base": Factual summary in ENGLISH (exactly 5 sentences)',
        '- "category": One of: financial, legal, personal, health, property, other',
        '- "info": Key details (e.g. document ID or service name)',
        "Ensure all quotes inside the text are properly escaped.",
        "TEXT:",
        f"{text[:3500]}",
        "",
    ]
    return ask_llm_json("\n".join(prompt_lines))
def translate_section(text, target_lang, content_type="text"):
    """Ask the LLM to translate ``text`` into ``target_lang``.

    ``content_type`` names the kind of content in the prompt (for example
    "title" or "summary"). The return value is whatever ``ask_llm_text``
    returns for that prompt.
    """
    prompt_lines = [
        "",
        f"Translate the following {content_type} into {target_lang}.",
        "Output ONLY the translation. No conversational text or markdown.",
        "TEXT TO TRANSLATE:",
        f"{text}",
        "",
    ]
    return ask_llm_text("\n".join(prompt_lines))
def save_output(root, kind, lang, subdir, filename, content):
    """Write ``str(content)`` to ``root/kind[/lang]/subdir/filename`` as UTF-8.

    The ``lang`` path component is included only when ``lang`` is truthy.
    Missing directories are created; an existing file is overwritten.
    """
    target_dir = Path(root) / kind
    if lang:
        target_dir = target_dir / lang
    target_dir = target_dir / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / filename).write_text(str(content), encoding="utf-8")
# --- MAIN PER-FILE LOGIC ---
# Closed set of categories the metadata prompt asks the model to choose from.
_ALLOWED_CATEGORIES = ("financial", "legal", "personal", "health", "property", "other")
# Sentinel that ask_llm_text returns when the LLM call fails.
_TRANSLATION_FAILED = "Translation Error"


def _meta_value(meta, key, default):
    """Return ``meta[key]``, or ``default`` when it is missing, null or blank."""
    value = meta.get(key)
    if value is None or (isinstance(value, str) and not value.strip()):
        return default
    return value


def process_file(file_path, input_root):
    """Generate and store all dataset outputs for one input text file.

    Reads the file, asks the LLM for metadata, writes the content, category,
    type and info outputs, then writes a title and a summary for every
    language in ``TARGET_LANGUAGES`` (English uses the base text; other
    languages are translated). The file is recorded in the history only when
    every translation succeeded, so a later run retries failed files.

    Args:
        file_path: ``Path`` of the input file; must lie under ``input_root``.
        input_root: ``Path`` of the input directory. The file's path relative
            to it determines the output sub-directory, and the name of its
            parent folder is used as the document type.

    Returns:
        None. Read errors and unusable metadata are reported on stdout and
        end processing of the file early.
    """
    rel_path = file_path.relative_to(input_root)
    base_filename = rel_path.name
    sub_dir = rel_path.parent
    doc_type = sub_dir.name

    # 1. Read the source text.
    try:
        raw_text = file_path.read_text(encoding="utf-8")
    except Exception as e:
        print(f" ❌ Błąd odczytu pliku: {e}")
        return

    # 2. Generate metadata (JSON).
    meta = get_metadata(raw_text, doc_type)
    if not meta or not isinstance(meta, dict):
        print(" ❌ Błąd AI: Nie udało się wygenerować poprawnego JSONa.")
        return

    # The model may answer with a null, differently-cased or unknown
    # category; fold all of those into the closed set the prompt defines.
    category = str(_meta_value(meta, "category", "other")).strip().lower()
    if category not in _ALLOWED_CATEGORIES:
        category = "other"

    # 3. Save the base data.
    save_output(OUTPUT_ROOT, "content", None, sub_dir, base_filename, raw_text)
    save_output(OUTPUT_ROOT, "category", None, sub_dir, base_filename, category)
    save_output(OUTPUT_ROOT, "type", None, sub_dir, base_filename, doc_type)
    save_output(OUTPUT_ROOT, "info", None, sub_dir, base_filename, _meta_value(meta, "info", "none"))

    # A key that is present but null or blank must fall back to the default
    # too, otherwise the literal text "None" would be sent for translation.
    base_title = _meta_value(meta, "title_base", "Document")
    base_summary = _meta_value(meta, "summary_base", "No summary available.")
    sections = (
        ("titles", base_title, "title"),
        ("summary", base_summary, "summary"),
    )

    # 4. Translations.
    print(f" 🌍 Tłumaczenie na {len(TARGET_LANGUAGES)} języków...", end="", flush=True)
    translation_failed = False
    for code, lang_name in TARGET_LANGUAGES.items():
        for kind, base_text, content_type in sections:
            if code == "en":
                # The base text is already English; no LLM call needed.
                localized = base_text
            else:
                localized = translate_section(base_text, lang_name, content_type)
                if localized == _TRANSLATION_FAILED:
                    translation_failed = True
            save_output(OUTPUT_ROOT, kind, code, sub_dir, base_filename, localized)
        print(".", end="", flush=True)

    if translation_failed:
        # Leave the file out of the history so the next run processes it
        # again instead of keeping the error sentinel in the dataset.
        print("\n ⚠️ Część tłumaczeń nie powiodła się – plik nie został oznaczony jako przetworzony.")
        return

    print(" OK")
    mark_as_done(str(rel_path))
def main():
    """Process every not-yet-handled ``*.txt`` file found under ``INPUT_DIR``.

    Files whose relative path is already listed in the history are skipped.
    An error while processing one file is reported and the loop moves on;
    a keyboard interrupt during processing stops the loop.
    """
    source_dir = Path(INPUT_DIR)
    if not source_dir.exists():
        print(f"❌ Brak folderu wejściowego: {INPUT_DIR}")
        return

    already_done = load_history()
    print(f"📂 Historia: {len(already_done)} plików już przetworzonych.")

    candidates = list(source_dir.rglob("*.txt"))
    print(f"🚀 Start: {len(candidates)} plików do analizy.")

    for candidate in candidates:
        rel_path = str(candidate.relative_to(source_dir))
        if rel_path not in already_done:
            print(f"📄 Przetwarzam: {rel_path}")
            try:
                process_file(candidate, source_dir)
            except KeyboardInterrupt:
                print("\n🛑 Przerwano ręcznie. Postęp zapisany.")
                break
            except Exception as err:
                print(f"\n❌ Błąd krytyczny przy pliku {rel_path}: {err}")


if __name__ == "__main__":
    main()