Upload 14 files

Browse files

Files changed (14) hide show

.gitattributes +1 -35
LICENSE +21 -0
README.md +49 -3
app.py +63 -0
config.yaml +10 -0
entity_tagger.py +12 -0
examples/demo_commands.txt +15 -0
examples/invoice_sample.pdf +6 -0
models/layoutlm_processor.py +8 -0
ocr_extractor.py +43 -0
pdf_loader.py +10 -0
requirements.txt +12 -0
summarize_doc.py +36 -0
utils.py +15 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text


1	+ * text=auto

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 hmnshudhmn24
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,3 +1,49 @@
----
-license: apache-2.0
----

+---
+language:
+  - en
+license: mit
+tags:
+  - document-question-answering
+  - ocr
+  - summarization
+  - document-ai
+pipeline_tag: document-question-answering
+model_name: docintel
+---
+# 🧾 DOCINTEL — Document AI (Donut-based)
+**DOCINTEL** extracts structured insights from scanned PDFs and images using **naver-clova-ix/donut-base** (Donut). It supports OCR fallback, entity extraction, and document summarization via Donut on page images.
+> ⚠️ Install system dependencies: `poppler` and `tesseract` for pdf2image and pytesseract respectively.
+## Quickstart
+1. Create venv & install dependencies:
+```bash
+python -m venv venv
+source venv/bin/activate      # Windows: venv\Scripts\activate
+pip install -r requirements.txt
+```
+2. Run API server:
+```bash
+uvicorn app:app --host 0.0.0.0 --port 8000
+```
+3. Upload a PDF and call endpoints (see examples/demo_commands.txt).
+## Files
+- `ocr_extractor.py` — PDF→images→OCR pipeline
+- `pdf_loader.py` — extract embedded text from PDFs
+- `entity_tagger.py` — regex-based entity extraction
+- `summarize_doc.py` — DONUT-based summarizer for page images
+- `app.py` — FastAPI server with upload/summary endpoints
+## Notes
+- Donut runs vision-encoder-decoder inference, which is slow on CPU; a GPU is recommended for acceptable speed.
+- For text-only PDFs consider using `extract_text_from_pdf` then a text summarizer instead of Donut.
+- This repo is a prototype/demo. Validate on your data before production use.
+## License
+MIT

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""FastAPI app for DOCINTEL: upload PDF, extract text/OCR, get entities, summarize."""
+import os, uuid, tempfile
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from utils import ensure_dir, load_config, save_json
+from ocr_extractor import extract_full_text, pdf_to_images
+from entity_tagger import extract_entities
+from summarize_doc import summarize_image, summarize_text
+# FastAPI application object; the endpoint decorators below register routes on it.
+app = FastAPI(title='DOCINTEL API')
+# Configuration is loaded once, at import time, through utils.load_config().
+cfg = load_config()
+# Directory that receives uploaded PDFs; './storage' is used when the key is absent.
+STORAGE = cfg.get('storage_dir', './storage')
+# Create the storage directory up front so upload_pdf can write into it.
+ensure_dir(STORAGE)
+# Request body carrying a single free-text question.
+# NOTE(review): no endpoint in this file references QARequest - confirm whether a QA route is planned.
+class QARequest(BaseModel):
+    question: str
+@app.post('/upload_pdf')
+async def upload_pdf(file: UploadFile = File(...)):
+    """Store an uploaded PDF under STORAGE and return its generated document id.
+
+    Returns a dict with 'doc_id' (a fresh UUID4 string), 'filename' (the
+    sanitised client filename) and 'path' (where the file was written).
+    Raises HTTPException(400) when the filename is missing or does not end in '.pdf'.
+    """
+    # The filename is client-controlled: keep only its last path component so
+    # values such as '../../x.pdf' or 'C:\\dir\\x.pdf' cannot escape STORAGE.
+    filename = os.path.basename((file.filename or '').replace('\\', '/'))
+    if not filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail='Only PDF files are allowed')
+    doc_id = str(uuid.uuid4())
+    # The '<doc_id>_' prefix is what the /doc/{doc_id}/... endpoints search for.
+    save_path = os.path.join(STORAGE, f"{doc_id}_{filename}")
+    with open(save_path, 'wb') as f:
+        f.write(await file.read())
+    return {'doc_id': doc_id, 'filename': filename, 'path': save_path}
+@app.get('/doc/{doc_id}/text')
+def get_text(doc_id: str):
+    """Return the extracted text of a stored document plus the number of OCR'd pages.
+
+    Raises HTTPException(404) when no file in STORAGE starts with '<doc_id>_'.
+    """
+    prefix = doc_id + '_'
+    # First directory entry carrying the document's prefix, or None.
+    stored_name = next((name for name in os.listdir(STORAGE) if name.startswith(prefix)), None)
+    if stored_name is None:
+        raise HTTPException(status_code=404, detail='Document not found')
+    full_text, page_texts = extract_full_text(os.path.join(STORAGE, stored_name))
+    return {'doc_id': doc_id, 'text': full_text, 'ocr_pages_count': len(page_texts)}
+@app.get('/doc/{doc_id}/entities')
+def get_entities(doc_id: str):
+    """Return the regex-extracted entities of a stored document as a JSON response.
+
+    Raises HTTPException(404) when no file in STORAGE starts with '<doc_id>_'.
+    """
+    prefix = doc_id + '_'
+    # First directory entry carrying the document's prefix, or None.
+    stored_name = next((name for name in os.listdir(STORAGE) if name.startswith(prefix)), None)
+    if stored_name is None:
+        raise HTTPException(status_code=404, detail='Document not found')
+    # Only the text is needed here; the per-page OCR list is discarded.
+    full_text, _ = extract_full_text(os.path.join(STORAGE, stored_name))
+    payload = {'doc_id': doc_id, 'entities': extract_entities(full_text)}
+    return JSONResponse(content=payload)
+@app.post('/doc/{doc_id}/summarize')
+def post_summarize(doc_id: str):
+    """Summarize a stored document and return {'doc_id', 'summary'}.
+
+    The PDF is rendered to page images and the first page is summarized with
+    summarize_image; when rendering yields no pages, the extracted text is
+    summarized with summarize_text instead.
+    Raises HTTPException(404) when no file in STORAGE starts with '<doc_id>_'.
+    """
+    files = [f for f in os.listdir(STORAGE) if f.startswith(doc_id+'_')]
+    if not files:
+        raise HTTPException(status_code=404, detail='Document not found')
+    path = os.path.join(STORAGE, files[0])
+    # TemporaryDirectory deletes the rendered page images when the block exits;
+    # a bare mkdtemp() would leave one directory of PNGs behind per request.
+    with tempfile.TemporaryDirectory() as page_dir:
+        pages = pdf_to_images(path, out_dir=page_dir)
+        if pages:
+            # use first page image
+            return {'doc_id': doc_id, 'summary': summarize_image(pages[0])}
+    text, _ = extract_full_text(path)
+    return {'doc_id': doc_id, 'summary': summarize_text(text)}

config.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+model:
+  name: "naver-clova-ix/donut-base"
+  task: "document-question-answering"
+ocr:
+  lang: "eng"
+  dpi: 300
+server:
+  host: "0.0.0.0"
+  port: 8000
+storage_dir: "./storage"

entity_tagger.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Simple regex-based entity extraction for demo purposes."""
+import re
+def extract_entities(text):
+    """Extract e-mail addresses, numeric dates and amounts from `text` with regexes.
+
+    Returns a dict with the keys 'emails', 'dates' and 'amounts'; each value is
+    a list of unique matches in order of first appearance.
+    """
+    emails = re.findall(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text)
+    dates = re.findall(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b", text)
+    # Amounts: optional leading '$', then either a grouped number (1,234 / 1.234)
+    # or a plain digit run, optional decimals, optional currency suffix.
+    # Look-arounds replace '\b' because '$' is not a word character: '\b\$'
+    # never matched a leading dollar sign, and '\d{1,3}...\b' rejected plain
+    # numbers of four or more digits such as '1500'.
+    amounts = re.findall(
+        r"(?<![\w$])\$?(?:\d{1,3}(?:[.,]\d{3})+|\d+)(?:[.,]\d+)?(?:\s?(?:USD|INR|EUR|Rs|\$))?(?!\w)",
+        text,
+    )
+    # dict.fromkeys removes duplicates while keeping first-seen order.
+    return {
+        'emails': list(dict.fromkeys(emails)),
+        'dates': list(dict.fromkeys(dates)),
+        'amounts': list(dict.fromkeys(a.strip() for a in amounts if a.strip())),
+    }

examples/demo_commands.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+# Example commands for DOCINTEL
+# 1) Start API server
+uvicorn app:app --host 0.0.0.0 --port 8000
+# 2) Upload PDF
+curl -X POST "http://127.0.0.1:8000/upload_pdf" -F "file=@examples/invoice_sample.pdf"
+# 3) Get extracted text
+curl "http://127.0.0.1:8000/doc/<DOC_ID>/text"
+# 4) Get entities
+curl "http://127.0.0.1:8000/doc/<DOC_ID>/entities"
+# 5) Summarize document
+curl -X POST "http://127.0.0.1:8000/doc/<DOC_ID>/summarize"

examples/invoice_sample.pdf ADDED Viewed

	@@ -0,0 +1,6 @@

+%PDF-1.4
+%\xe2\xe3\xcf\xd3
+1 0 obj<<>>endobj
+trailer
+<<>>
+%%EOF

models/layoutlm_processor.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""LayoutLM helper (optional) - provided for completeness but not used by default.
+"""
+from transformers import LayoutLMv3Processor, LayoutLMv3ForQuestionAnswering
+def load_layoutlm(model_name='microsoft/layoutlmv3-base'):
+    """Load the LayoutLMv3 processor and QA model for `model_name` as a (processor, model) tuple."""
+    return (
+        LayoutLMv3Processor.from_pretrained(model_name),
+        LayoutLMv3ForQuestionAnswering.from_pretrained(model_name),
+    )

ocr_extractor.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""OCR extraction using pdf2image + pytesseract for scanned pages."""
+from pdf2image import convert_from_path
+import pytesseract
+from PIL import Image
+from utils import load_config
+import os
+def pdf_to_images(pdf_path, dpi=None, out_dir=None):
+    """Render every page of a PDF to 'page_<n>.png' and return the saved paths in page order.
+
+    `dpi` falls back to the config value ocr.dpi (300 when unset). Images are
+    written into `out_dir`, which is created if needed, or into the current
+    directory when `out_dir` is not given.
+    """
+    settings = load_config()
+    if not dpi:
+        dpi = settings.get('ocr', {}).get('dpi', 300)
+    rendered = convert_from_path(pdf_path, dpi=dpi)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    target_dir = out_dir or '.'
+    saved = []
+    # Page files are numbered from 1.
+    for page_no, page_img in enumerate(rendered, start=1):
+        png_path = os.path.join(target_dir, f'page_{page_no}.png')
+        page_img.save(png_path, 'PNG')
+        saved.append(png_path)
+    return saved
+def ocr_image(path, lang=None):
+    """Run Tesseract OCR on the image at `path` and return the recognised text.
+
+    `lang` falls back to the config value ocr.lang ('eng' when unset); the
+    config file is only read when no language is passed.
+    """
+    if not lang:
+        lang = load_config().get('ocr', {}).get('lang', 'eng')
+    # The context manager closes the image file handle once OCR is done.
+    with Image.open(path) as img:
+        return pytesseract.image_to_string(img, lang=lang)
+def extract_full_text(pdf_path, do_ocr=True):
+    """Return (text, ocr_page_texts) for a PDF, preferring its embedded text layer.
+
+    When the embedded text is longer than 200 characters it is returned with
+    an empty OCR list. Otherwise every page is rendered and OCR'd, and the
+    page texts are joined with blank lines. With do_ocr=False the OCR
+    fallback is skipped and whatever embedded text was found is returned.
+    """
+    import tempfile
+    # Try embedded text first
+    try:
+        from pdf_loader import extract_text_from_pdf
+        txt = extract_text_from_pdf(pdf_path) or ''
+    except Exception:
+        # Deliberate best effort: any failure in the text-layer path (missing
+        # PyMuPDF, unreadable PDF) falls through to OCR instead of failing.
+        txt = ''
+    if len(txt) > 200 or not do_ocr:
+        return txt, []  # return text and empty ocr pages list
+    # fallback to OCR; a private temp dir keeps concurrent calls from
+    # overwriting each other's page images and is removed afterwards
+    with tempfile.TemporaryDirectory() as page_dir:
+        ocr_texts = [ocr_image(p) for p in pdf_to_images(pdf_path, out_dir=page_dir)]
+    return '\n\n'.join(ocr_texts), ocr_texts

pdf_loader.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""PDF text extraction using PyMuPDF (fitz) for embedded text layers."""
+import fitz
+def extract_text_from_pdf(pdf_path):
+    """Return the embedded text of every page of a PDF, pages separated by a blank line."""
+    # The context manager closes the document (and its file handle) on exit.
+    with fitz.open(pdf_path) as doc:
+        # A page without text contributes an empty string.
+        return '\n\n'.join(page.get_text('text') or '' for page in doc)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers>=4.30.0
+torch>=1.12.0
+pdf2image
+pytesseract
+Pillow
+PyMuPDF
+fastapi
+uvicorn[standard]
+python-multipart
+pyyaml
+requests
+tqdm

summarize_doc.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""Summarization using DONUT (naver-clova-ix/donut-base) via Hugging Face.
+This module uses Donut's processor and VisionEncoderDecoderModel for docVQA-style prompts.
+"""
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+from PIL import Image
+from utils import load_config
+# Lazily loaded Donut processor/model and the checkpoint name they were loaded from.
+_processor = None
+_model = None
+_loaded_name = None
+def _init(model_name):
+    """Return the cached (processor, model) pair for `model_name`, loading it on first use.
+
+    The pair is reloaded when a different `model_name` is requested, so the
+    cache never hands back a model other than the one asked for.
+    """
+    global _processor, _model, _loaded_name
+    if _processor is None or _model is None or _loaded_name != model_name:
+        # Load both parts before publishing them so a failed load cannot leave
+        # a processor and a model from different checkpoints in the cache.
+        processor = DonutProcessor.from_pretrained(model_name)
+        model = VisionEncoderDecoderModel.from_pretrained(model_name)
+        _processor, _model, _loaded_name = processor, model, model_name
+    return _processor, _model
+def summarize_image(image_path, model_name=None, max_length=250):
+    """Generate a summary of a page image with Donut using a docVQA-style prompt.
+
+    `model_name` falls back to the config value model.name; `max_length` is the
+    maximum number of newly generated tokens. Returns the decoded text.
+    """
+    cfg = load_config()
+    model_name = model_name or cfg.get('model', {}).get('name')
+    processor, model = _init(model_name)
+    task_prompt = '<s_docvqa><s_question>Summarize the document:</s_question>'
+    # Close the file handle as soon as the pixels are converted to RGB.
+    with Image.open(image_path) as src:
+        image = src.convert('RGB')
+    # Donut takes the image through the processor and the prompt as decoder
+    # input ids; passing the prompt positionally to the processor does not
+    # feed it to the decoder.
+    pixel_values = processor(images=image, return_tensors='pt').pixel_values
+    decoder_input_ids = processor.tokenizer(
+        task_prompt, add_special_tokens=False, return_tensors='pt'
+    ).input_ids
+    output = model.generate(
+        pixel_values,
+        decoder_input_ids=decoder_input_ids,
+        max_new_tokens=max_length,
+    )
+    # The generated sequence starts with the prompt tokens; decode only what follows.
+    generated = output[:, decoder_input_ids.shape[1]:]
+    decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]
+    return decoded
+def summarize_text(text, chunk_size=1000, model_name=None):
+    """Return a naive extractive summary: the first five non-blank lines joined by spaces.
+
+    `chunk_size` and `model_name` are accepted but not used. Returns '' when
+    `text` has no non-blank line. For text-heavy documents a real text
+    summarization pipeline would be the better tool.
+    """
+    kept = []
+    for raw in text.split('\n'):
+        stripped = raw.strip()
+        if stripped:
+            kept.append(stripped)
+    # Joining an empty list already yields '', so no separate empty check is needed.
+    return ' '.join(kept[:5])

utils.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from pathlib import Path
+import os, yaml, json
+def load_config(path='config.yaml'):
+    """Parse the YAML file at `path` and return its content.
+
+    Returns an empty dict for an empty file so callers can chain `.get(...)`.
+    Raises FileNotFoundError when `path` does not exist.
+    """
+    p = Path(path)
+    if not p.exists():
+        raise FileNotFoundError(f'Config not found: {path}')
+    # Read as UTF-8 explicitly instead of the platform default encoding;
+    # safe_load returns None for an empty document, hence the `or {}`.
+    return yaml.safe_load(p.read_text(encoding='utf-8')) or {}
+def ensure_dir(path):
+    """Create directory `path`, including missing parents; an existing directory is left as is."""
+    os.makedirs(path, exist_ok=True)
+def save_json(obj, path):
+    """Write `obj` to `path` as UTF-8 JSON indented by two spaces, creating the parent directory first."""
+    target = Path(path)
+    ensure_dir(target.parent)
+    payload = json.dumps(obj, indent=2)
+    target.write_text(payload, encoding='utf-8')