Witold Wydmański committed on
Commit
1506ae7
1 Parent(s): 02d986d

feat: replace pytesseract with tesserocr

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. app.py +11 -3
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -2,7 +2,7 @@ FROM python:3.10-slim
2
  WORKDIR /code
3
 
4
  # Install tesseract
5
- RUN apt-get update && apt-get install -y tesseract-ocr
6
 
7
  # Install python dependencies
8
  COPY requirements.txt .
 
2
  WORKDIR /code
3
 
4
  # Install tesseract
5
+ RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
6
 
7
  # Install python dependencies
8
  COPY requirements.txt .
app.py CHANGED
@@ -1,13 +1,19 @@
1
  import gradio as gr
2
  import tempfile
3
- import pytesseract
4
  import os
5
  import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
6
  from PIL import Image
7
  import logging
 
8
 
9
  logging.basicConfig(level=logging.INFO)
10
 
 
 
 
 
 
11
  def pdf_to_image(pdf_file, path, progress, max_pages):
12
  # Convert the PDF to a PNG image using pdf2image
13
  doc = fitz.open(pdf_file.name) # open document
@@ -25,6 +31,8 @@ def pdf_to_image(pdf_file, path, progress, max_pages):
25
  return fnames
26
 
27
  def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
 
 
28
  # Run OCR on the image using Tesseract
29
  with tempfile.TemporaryDirectory() as path:
30
  images = pdf_to_image(image, path, progress, max_pages)
@@ -33,7 +41,8 @@ def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
33
  with open(img, 'rb') as f:
34
  img = Image.open(f)
35
  img.load()
36
- text = pytesseract.image_to_string(img, lang=language)
 
37
  text_res.append(text)
38
 
39
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
@@ -43,7 +52,6 @@ def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
43
 
44
  if __name__ == "__main__":
45
  logging.info("Starting Tesseract OCR")
46
- os.environ["TESSDATA_PREFIX"] = "./tessdata"
47
  iface = gr.Interface(
48
  fn=tesseract_ocr,
49
  inputs=[
 
1
  import gradio as gr
2
  import tempfile
3
+ import tesserocr
4
  import os
5
  import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
6
  from PIL import Image
7
  import logging
8
+ from multiprocessing.pool import Pool
9
 
10
  logging.basicConfig(level=logging.INFO)
11
 
12
+ APIs = {
13
+ "pol": tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
14
+ "eng": tesserocr.PyTessBaseAPI(),
15
+ }
16
+
17
  def pdf_to_image(pdf_file, path, progress, max_pages):
18
  # Convert the PDF to a PNG image using pdf2image
19
  doc = fitz.open(pdf_file.name) # open document
 
31
  return fnames
32
 
33
  def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
34
+ api = APIs[language]
35
+
36
  # Run OCR on the image using Tesseract
37
  with tempfile.TemporaryDirectory() as path:
38
  images = pdf_to_image(image, path, progress, max_pages)
 
41
  with open(img, 'rb') as f:
42
  img = Image.open(f)
43
  img.load()
44
+ api.SetImage(img)
45
+ text = api.GetUTF8Text()
46
  text_res.append(text)
47
 
48
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
 
52
 
53
  if __name__ == "__main__":
54
  logging.info("Starting Tesseract OCR")
 
55
  iface = gr.Interface(
56
  fn=tesseract_ocr,
57
  inputs=[
requirements.txt CHANGED
@@ -2,4 +2,5 @@ pytesseract
2
  pymupdf
3
  gradio
4
  pillow
5
- tqdm
 
 
2
  pymupdf
3
  gradio
4
  pillow
5
+ tqdm
6
+ tesserocr