Witold Wydmański committed on
Commit
02d986d
1 Parent(s): 6b6b8dc

feat: add tessdata

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +14 -6
  3. tessdata/pol.traineddata +3 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ tessdata/pol.traineddata filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -8,29 +8,32 @@ import logging
8
 
9
  logging.basicConfig(level=logging.INFO)
10
 
11
- def pdf_to_image(pdf_file, path, progress):
12
  # Convert the PDF to a PNG image using pdf2image
13
  doc = fitz.open(pdf_file.name) # open document
14
  fnames = []
15
  idx = 1
16
- for page in progress.tqdm(doc, desc="Converting PDF to image"):
 
17
  pix = page.get_pixmap()
18
  output = f"{path}/page-{idx}.png"
19
  pix.save(output)
20
  fnames.append(output)
21
  idx += 1
 
 
22
  return fnames
23
 
24
- def tesseract_ocr(image, progress=gr.Progress()):
25
  # Run OCR on the image using Tesseract
26
  with tempfile.TemporaryDirectory() as path:
27
- images = pdf_to_image(image, path, progress)
28
  text_res = []
29
  for img in progress.tqdm(images, desc="Running OCR"):
30
  with open(img, 'rb') as f:
31
  img = Image.open(f)
32
  img.load()
33
- text = pytesseract.image_to_string(img)
34
  text_res.append(text)
35
 
36
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
@@ -40,9 +43,14 @@ def tesseract_ocr(image, progress=gr.Progress()):
40
 
41
  if __name__ == "__main__":
42
  logging.info("Starting Tesseract OCR")
 
43
  iface = gr.Interface(
44
  fn=tesseract_ocr,
45
- inputs=[gr.File(label="PDF file")],
 
 
 
 
46
  outputs=gr.File(label="Text file", type="file"),
47
  title="PDF to Text Converter",
48
  description="Converts a PDF file to text using Tesseract OCR."
 
8
 
9
  logging.basicConfig(level=logging.INFO)
10
 
11
+ def pdf_to_image(pdf_file, path, progress, max_pages):
12
  # Convert the PDF to a PNG image using pdf2image
13
  doc = fitz.open(pdf_file.name) # open document
14
  fnames = []
15
  idx = 1
16
+ total = len(doc) if max_pages == 0 else max_pages
17
+ for page in progress.tqdm(doc, desc="Converting PDF to image", total=total):
18
  pix = page.get_pixmap()
19
  output = f"{path}/page-{idx}.png"
20
  pix.save(output)
21
  fnames.append(output)
22
  idx += 1
23
+ if max_pages > 0 and idx > max_pages:
24
+ break
25
  return fnames
26
 
27
+ def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
28
  # Run OCR on the image using Tesseract
29
  with tempfile.TemporaryDirectory() as path:
30
+ images = pdf_to_image(image, path, progress, max_pages)
31
  text_res = []
32
  for img in progress.tqdm(images, desc="Running OCR"):
33
  with open(img, 'rb') as f:
34
  img = Image.open(f)
35
  img.load()
36
+ text = pytesseract.image_to_string(img, lang=language)
37
  text_res.append(text)
38
 
39
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
 
43
 
44
  if __name__ == "__main__":
45
  logging.info("Starting Tesseract OCR")
46
+ os.environ["TESSDATA_PREFIX"] = "./tessdata"
47
  iface = gr.Interface(
48
  fn=tesseract_ocr,
49
+ inputs=[
50
+ gr.File(label="PDF file"),
51
+ gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
52
+ gr.Number(label="Number of pages", value=0)
53
+ ],
54
  outputs=gr.File(label="Text file", type="file"),
55
  title="PDF to Text Converter",
56
  description="Converts a PDF file to text using Tesseract OCR."
tessdata/pol.traineddata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b5a77c4e865ccaa79984879457df8aea7b6b0caabd9a5860733d485c913634
3
+ size 25941386