Spaces:

Rivalcoder
/

Rapid-Extractor

Sleeping

App Files Files Community

Rivalcoder committed on Sep 24

Commit

632c507

1 Parent(s): 760d875

Add Files

Browse files

Files changed (4) hide show

Dockerfile +31 -0
__pycache__/main.cpython-312.pyc +0 -0
main.py +117 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# Image for the text-extraction API: Python 3.12 (slim) plus the OCR system
+# packages, serving main:app with uvicorn on port 7860.
+FROM python:3.12-slim
+# DEBIAN_FRONTEND=noninteractive stops apt-get from prompting during the build;
+# PYTHONUNBUFFERED=1 makes Python write stdout/stderr without buffering.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1
+# Install Tesseract + all language packs + deps.
+# The apt package lists are removed in the same layer to keep the image small.
+# NOTE(review): main.py as committed uses PyMuPDF for PDFs and
+# opencv-python-headless for images; poppler-utils, libgl1 and libtesseract-dev
+# do not appear to be needed by it - confirm before trimming.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr-all \
+    libtesseract-dev \
+    libgl1 \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Set workdir
+WORKDIR /app
+# Copy requirements first (for Docker caching): the dependency layer below is
+# rebuilt only when requirements.txt changes, not on every source edit.
+COPY requirements.txt .
+# Install Python deps (--no-cache-dir keeps pip's download cache out of the image)
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+# Copy app (the whole build context, including main.py)
+COPY . .
+# Expose port (must match the --port passed to uvicorn below)
+EXPOSE 7860
+# Run API: bind to 0.0.0.0 so the server is reachable from outside the container
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

__pycache__/main.cpython-312.pyc ADDED Viewed

Binary file (6.25 kB). View file

main.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from fastapi import FastAPI, UploadFile, File, Form
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from PIL import Image
+import pytesseract
+import fitz
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import cv2
+import numpy as np
+import io
+app = FastAPI(title="Fast Parallel Text Extract API")
+executor = ThreadPoolExecutor(max_workers=8)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # allow all origins for testing
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ---------- Utils ----------
+def read_image_from_bytes(file_bytes: bytes):
+    arr = np.frombuffer(file_bytes, np.uint8)
+    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+    return img
+def resize_if_large(img, max_dim=2000):
+    h, w = img.shape[:2]
+    if max(h, w) > max_dim:
+        scale = max_dim / max(h, w)
+        img = cv2.resize(img, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA)
+    return img
+# ---------- Fast OCR ----------
+def fast_ocr(file_bytes: bytes, lang: str = "eng"):
+    img_bgr = read_image_from_bytes(file_bytes)
+    if img_bgr is None:
+        return ""
+    img_bgr = resize_if_large(img_bgr)
+    # Light preprocessing
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    pil_img = Image.fromarray(gray)
+    config = "--oem 3 --psm 6"  # balanced speed + accuracy
+    text = pytesseract.image_to_string(pil_img, config=config, lang=lang)
+    return text.strip()
+# ---------- Heavy OCR (fallback only) ----------
+def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
+    img_bgr = read_image_from_bytes(file_bytes)
+    if img_bgr is None:
+        return ""
+    # Denoise + threshold (slower but more robust)
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    gray = cv2.fastNlMeansDenoising(gray, None, h=10)
+    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    pil_img = Image.fromarray(gray)
+    config = "--oem 3 --psm 6"
+    text = pytesseract.image_to_string(pil_img, config=config, lang=lang)
+    return text.strip()
+# ---------- Image extraction ----------
+def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng"):
+    text = fast_ocr(file_bytes, lang)
+    if len(text) < 20:
+        text = heavy_ocr(file_bytes, lang)
+    return text
+# ---------- PDF extraction ----------
+def extract_text_from_pdf_bytes(file_bytes: bytes):
+    doc = fitz.open(stream=file_bytes, filetype="pdf")
+    texts = []
+    for page in doc:
+        try:
+            texts.append(page.get_text("text"))
+        except Exception:
+            texts.append("")
+    return "\n".join(texts)
+# ---------- Endpoints ----------
+@app.post("/extract-image")
+async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
+    """
+    Extract text from image.
+    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'
+    """
+    try:
+        raw = await file.read()
+        loop = asyncio.get_event_loop()
+        text = await loop.run_in_executor(executor, extract_text_from_image_bytes, raw, lang)
+        return JSONResponse({"text": text})
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)
+@app.post("/extract-pdf")
+async def extract_pdf(file: UploadFile = File(...)):
+    """
+    Extract text from PDF.
+    """
+    try:
+        raw = await file.read()
+        loop = asyncio.get_event_loop()
+        text = await loop.run_in_executor(executor, extract_text_from_pdf_bytes, raw)
+        return JSONResponse({"text": text})
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+uvicorn
+# Required by FastAPI to parse the multipart uploads (UploadFile / Form)
+# that both endpoints in main.py accept.
+python-multipart
+pytesseract
+pillow
+PyMuPDF
+opencv-python-headless
+numpy