Rivalcoder committed on
Commit
632c507
·
1 Parent(s): 760d875
Files changed (4) hide show
  1. Dockerfile +31 -0
  2. __pycache__/main.cpython-312.pyc +0 -0
  3. main.py +117 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the text-extraction API: Tesseract OCR plus the Python
# dependencies from requirements.txt, served by uvicorn on port 7860.
FROM python:3.12-slim

# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the build;
# PYTHONUNBUFFERED=1 makes Python write log output without buffering.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# Install Tesseract + all language packs + deps
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr-all \
        libtesseract-dev \
        libgl1 \
        poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Set workdir
WORKDIR /app

# Copy requirements first (for Docker caching)
COPY requirements.txt .

# Install Python deps
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy app
COPY . .

# Run the server as an unprivileged user instead of root; the app only needs
# read access to /app and binds a non-privileged port.
RUN useradd --create-home --uid 1000 appuser
USER appuser

# Expose port
EXPOSE 7860

# Run API
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/main.cpython-312.pyc ADDED
Binary file (6.25 kB). View file
 
main.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from PIL import Image
5
+ import pytesseract
6
+ import fitz
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ import asyncio
9
+ import cv2
10
+ import numpy as np
11
+ import io
12
+
13
app = FastAPI(title="Fast Parallel Text Extract API")

# Shared worker pool: the endpoints hand the blocking OCR / PDF work to it via
# run_in_executor so the event loop is not blocked while a file is processed.
executor = ThreadPoolExecutor(max_workers=8)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow all origins for testing
    # Credentials must stay disabled while origins are wildcarded: a wildcard
    # origin combined with credentials lets any site make credentialed
    # requests. The endpoints here use no cookies or auth headers.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
23
+
24
+
25
+
26
+ # ---------- Utils ----------
27
def read_image_from_bytes(file_bytes: bytes):
    """Decode the bytes of an encoded image file into an OpenCV image.

    Args:
        file_bytes: Raw contents of the uploaded image file.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``, or
        ``None`` when the input is empty or cannot be decoded.
    """
    # cv2.imdecode raises on an empty buffer rather than returning None, so
    # short-circuit here to give callers the same "not decodable" signal.
    if not file_bytes:
        return None
    arr = np.frombuffer(file_bytes, np.uint8)
    return cv2.imdecode(arr, cv2.IMREAD_COLOR)
31
+
32
def resize_if_large(img, max_dim=2000):
    """Downscale ``img`` so its longest side is at most ``max_dim`` pixels.

    The aspect ratio is preserved. Images already within the limit are
    returned unchanged (the same object, not a copy).

    Args:
        img: Image array whose first two ``shape`` entries are height, width.
        max_dim: Maximum allowed size of the longer side, in pixels.

    Returns:
        The original image, or a resized copy produced with
        ``cv2.INTER_AREA`` interpolation.

    Raises:
        ValueError: If ``max_dim`` is smaller than 1.
    """
    if max_dim < 1:
        raise ValueError("max_dim must be at least 1")

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= max_dim:
        return img

    scale = max_dim / longest
    # Clamp to 1 px: for very elongated images the shorter side would
    # otherwise truncate to 0, which cv2.resize rejects.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
38
+
39
+ # ---------- Fast OCR ----------
40
def fast_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the quick OCR pass on an encoded image.

    The image is decoded, downscaled if it is large, converted to grayscale
    and binarized with Otsu's threshold before being handed to Tesseract.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code(s) passed through to pytesseract.

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string when the image cannot be decoded.
    """
    decoded = read_image_from_bytes(file_bytes)
    if decoded is None:
        return ""

    # Light preprocessing: cap the size, then grayscale + Otsu binarization.
    grayscale = cv2.cvtColor(resize_if_large(decoded), cv2.COLOR_BGR2GRAY)
    binarized = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    tesseract_config = "--oem 3 --psm 6"  # balanced speed + accuracy
    raw_text = pytesseract.image_to_string(
        Image.fromarray(binarized), config=tesseract_config, lang=lang
    )
    return raw_text.strip()
55
+
56
+ # ---------- Heavy OCR (fallback only) ----------
57
def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the slower, more robust OCR pass on an encoded image.

    Unlike the fast pass, the image is not downscaled and is denoised with
    non-local means before Otsu binarization.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code(s) passed through to pytesseract.

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string when the image cannot be decoded.
    """
    decoded = read_image_from_bytes(file_bytes)
    if decoded is None:
        return ""

    # Denoise + threshold (slower but more robust)
    grayscale = cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(grayscale, None, h=10)
    binarized = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    tesseract_config = "--oem 3 --psm 6"
    raw_text = pytesseract.image_to_string(
        Image.fromarray(binarized), config=tesseract_config, lang=lang
    )
    return raw_text.strip()
71
+
72
+ # ---------- Image extraction ----------
73
def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng"):
    """Extract text from an image, retrying with the heavy pass if needed.

    The fast OCR pass runs first. When it yields fewer than 20 characters
    the heavy pass is tried as a fallback.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code(s) forwarded to both OCR passes.

    Returns:
        The fast result when it has at least 20 characters; otherwise the
        heavy result, falling back to the fast result if the heavy pass
        recognized nothing.
    """
    text = fast_ocr(file_bytes, lang)
    if len(text) >= 20:
        return text
    # Do not throw away a short but non-empty fast result when the heavy
    # pass comes back empty.
    return heavy_ocr(file_bytes, lang) or text
78
+
79
+ # ---------- PDF extraction ----------
80
def extract_text_from_pdf_bytes(file_bytes: bytes):
    """Extract the embedded text of every page of a PDF.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The text of all pages joined with newlines. A page whose extraction
        fails contributes an empty string.

    Raises:
        Whatever ``fitz.open`` raises when the bytes cannot be opened as a
        PDF; only per-page extraction errors are suppressed.
    """
    texts = []
    # The context manager closes the document even if iteration fails;
    # previously the document was never closed.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            try:
                texts.append(page.get_text("text"))
            except Exception:
                # Best effort: one unreadable page must not discard the rest.
                texts.append("")
    return "\n".join(texts)
89
+
90
+ # ---------- Endpoints ----------
91
+
92
@app.post("/extract-image")
async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
    """
    Extract text from image.
    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'

    Returns JSON {"text": ...} on success, or {"error": ...} with status 500
    when reading or processing the upload fails.
    """
    try:
        raw = await file.read()
        # Inside a coroutine the running loop is the right handle;
        # get_event_loop() is the legacy accessor.
        loop = asyncio.get_running_loop()
        # OCR is blocking, so run it on the shared thread pool.
        text = await loop.run_in_executor(
            executor, extract_text_from_image_bytes, raw, lang
        )
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
105
+
106
@app.post("/extract-pdf")
async def extract_pdf(file: UploadFile = File(...)):
    """
    Extract text from PDF.

    Returns JSON {"text": ...} on success, or {"error": ...} with status 500
    when reading or processing the upload fails.
    """
    try:
        raw = await file.read()
        # Inside a coroutine the running loop is the right handle;
        # get_event_loop() is the legacy accessor.
        loop = asyncio.get_running_loop()
        # PDF parsing is blocking, so run it on the shared thread pool.
        text = await loop.run_in_executor(
            executor, extract_text_from_pdf_bytes, raw
        )
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pytesseract
4
+ pillow
5
+ PyMuPDF
6
+ opencv-python-headless
7
+ numpy