Mbonea committed on
Commit
88b7f59
·
1 Parent(s): d0a7224
Files changed (3) hide show
  1. App/OCR/Tesseract.py +103 -0
  2. App/app.py +2 -1
  3. Dockerfile +2 -0
App/OCR/Tesseract.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import tempfile
4
+ import os
5
+ from fastapi.responses import JSONResponse
6
+ import pytesseract
7
+ from pytesseract import Output
8
+ from PIL import Image
9
+ import requests
10
+ from fastapi.routing import APIRouter
11
+ from io import BytesIO
12
+
13
+ tessaract_ocr_router = APIRouter(tags=["OCR"])
14
+
15
+
16
class HighlightRequest(BaseModel):
    """Request body accepted by the ``/highlight`` OCR endpoint."""

    # URL the image to be OCR'd is downloaded from.
    imageUrl: str
    # Words or phrases to locate in the recognised text.
    searchTerms: list[str]
19
+
20
+
21
# Upper bound (seconds) on the image download so a slow host cannot hang a request.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _find_term_highlights(words, lowered_texts, term):
    """Return one highlight per non-overlapping occurrence of *term*.

    Args:
        words: OCR word dicts, each with a ``"text"`` string and a ``"bbox"``
            dict holding ``x0``/``y0``/``x1``/``y1``.
        lowered_texts: ``words[i]["text"].lower()`` for every word, computed
            once by the caller so it is not rebuilt per term.
        term: The word or phrase to search for (matched case-insensitively,
            word by word).

    Returns:
        A list of ``{"text": term, "bbox": {...}}`` dicts. The box is the union
        of the boxes of every word in the matched phrase.
    """
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # which would otherwise yield empty tokens that can never match an OCR word.
    term_words = term.lower().split()
    term_len = len(term_words)
    if term_len == 0:
        return []

    found = []
    start = 0
    last_start = len(lowered_texts) - term_len
    while start <= last_start:
        if lowered_texts[start:start + term_len] != term_words:
            # Slide by one so a word that broke a partial match is still
            # considered as the start of a new match.
            start += 1
            continue

        boxes = [w["bbox"] for w in words[start:start + term_len]]
        found.append(
            {
                "text": term,
                "bbox": {
                    "x0": min(b["x0"] for b in boxes),
                    "y0": min(b["y0"] for b in boxes),
                    "x1": max(b["x1"] for b in boxes),
                    "y1": max(b["y1"] for b in boxes),
                },
            }
        )
        # Matches do not overlap: resume after the phrase just matched.
        start += term_len

    return found


@tessaract_ocr_router.post("/highlight")
async def highlight(request: HighlightRequest):
    """Download an image, OCR it, and return bounding boxes for search terms.

    Args:
        request: Body carrying ``imageUrl`` and a non-empty ``searchTerms`` list.

    Returns:
        A ``JSONResponse`` with the original ``searchTerms`` and a
        ``highlights`` list of ``{"text", "bbox"}`` entries.

    Raises:
        HTTPException: 400 when the input is missing or the image download
            does not return HTTP 200; 500 for any other processing failure.
    """
    image_url = request.imageUrl
    search_terms = request.searchTerms

    if not image_url or not search_terms:
        raise HTTPException(
            status_code=400, detail="imageUrl and searchTerms are required"
        )

    try:
        # Download the image.
        # NOTE(review): the URL is caller-supplied, so this fetch can reach
        # internal hosts (SSRF) - consider an allow-list. The call is also
        # synchronous inside an async handler.
        response = requests.get(image_url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        if response.status_code != 200:
            raise HTTPException(status_code=400, detail="Failed to download image")

        image = Image.open(BytesIO(response.content))

        # Run OCR; the DICT output holds parallel lists indexed per detected box.
        ocr_data = pytesseract.image_to_data(image, lang="eng", output_type=Output.DICT)
        words = [
            {
                "text": text,
                "bbox": {
                    "x0": ocr_data["left"][i],
                    "y0": ocr_data["top"][i],
                    "x1": ocr_data["left"][i] + ocr_data["width"][i],
                    "y1": ocr_data["top"][i] + ocr_data["height"][i],
                },
            }
            for i, text in enumerate(ocr_data["text"])
            if text.strip() != ""
        ]
        lowered_texts = [word["text"].lower() for word in words]

        # Search for each term.
        highlights = []
        for term in search_terms:
            highlights.extend(_find_term_highlights(words, lowered_texts, term))

        # Respond with highlights.
        return JSONResponse(
            content={"searchTerms": search_terms, "highlights": highlights}
        )

    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) from becoming a 500.
        raise
    except Exception as e:
        # Must be raised, not returned, for FastAPI to send an error response.
        raise HTTPException(
            status_code=500,
            detail=f"An error occurred while processing the image: {str(e)}",
        ) from e
App/app.py CHANGED
@@ -4,7 +4,7 @@ from fastapi.middleware.gzip import GZipMiddleware
4
 
5
  from .TTS.TTSRoutes import tts_router
6
  from .Embedding.EmbeddingRoutes import embeddigs_router
7
-
8
 
9
  from fastapi.middleware.cors import CORSMiddleware
10
 
@@ -46,5 +46,6 @@ async def landing_page():
46
 
47
 
48
  app.include_router(embeddigs_router)
 
49
  app.include_router(tts_router)
50
  # app.include_router(shader_router)
 
4
 
5
  from .TTS.TTSRoutes import tts_router
6
  from .Embedding.EmbeddingRoutes import embeddigs_router
7
+ from .OCR.Tesseract import tessaract_ocr_router
8
 
9
  from fastapi.middleware.cors import CORSMiddleware
10
 
 
46
 
47
 
48
  app.include_router(embeddigs_router)
49
+ app.include_router(tessaract_ocr_router)
50
  app.include_router(tts_router)
51
  # app.include_router(shader_router)
Dockerfile CHANGED
@@ -33,6 +33,8 @@ RUN pip install --no-cache-dir -r requirements.txt
33
  # Copy the application code
34
  USER admin
35
 
 
 
36
  COPY --chown=admin . /srv
37
 
38
  # Command to run the application
 
33
  # Copy the application code
34
  USER admin
35
 
36
# System packages need root; switch back to the unprivileged user afterwards.
USER root
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir pytesseract
USER admin
37
+
38
  COPY --chown=admin . /srv
39
 
40
  # Command to run the application