BhagatSurya committed on
Commit
26072cc
1 Parent(s): 5a9a58b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -18
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import tempfile
3
  import re
4
- from PyPDF2 import PdfReader, PdfFileReader
5
  import os
6
  import spacy
7
  import pytesseract
@@ -12,6 +11,9 @@ from pdf2image.exceptions import (
12
  PDFPageCountError,
13
  PDFSyntaxError
14
  )
 
 
 
15
 
16
  def clean_text(text):
17
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
@@ -25,30 +27,29 @@ def image_to_latex(image):
25
  result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
26
  return result.stdout
27
 
28
-
29
  def pdf_to_text(file):
30
- with open(file.name, 'rb') as f:
31
- reader = PdfReader(f)
32
- full_text = ''
33
- for i, page in enumerate(reader.pages):
34
- page_text = page.extract_text()
35
- if page_text is None:
36
- images = pdf2image.convert_from_path(file.name, first_page=i+1, last_page=i+2)
37
- for image in images:
38
- page_text = image_to_latex(image)
39
- page_text = clean_text(page_text)
40
- if len(page_text.split()) > 5:
41
- page_number = i + 1
42
- page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
43
- full_text += page_text + "\n\n"
 
 
44
  base_name = os.path.splitext(os.path.basename(file.name))[0]
45
  output_file_name = base_name + ".txt"
46
  with open(output_file_name, 'w') as f:
47
  f.write(full_text)
48
  return output_file_name, page_number
49
 
50
-
51
-
52
  iface = gr.Interface(fn=pdf_to_text,
53
  inputs=gr.inputs.File(label="Your PDF"),
54
  outputs=gr.outputs.File(label="Download TXT"),
 
1
  import gradio as gr
2
  import tempfile
3
  import re
 
4
  import os
5
  import spacy
6
  import pytesseract
 
11
  PDFPageCountError,
12
  PDFSyntaxError
13
  )
14
+ import fitz # PyMuPDF
15
+ from PIL import Image
16
+ import io
17
 
18
  def clean_text(text):
19
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
 
27
  result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
28
  return result.stdout
29
 
 
def pdf_to_text(file):
    """Extract the text of a PDF into a ``.txt`` file.

    For every page, the embedded text layer is read and the output of
    ``image_to_latex`` for each image embedded on that page is appended to
    it. The combined text is passed through ``clean_text``; pages left with
    five words or fewer are dropped, and every kept page is prefixed with a
    ``## Metadata: Page Number N`` header line.

    The result is written to ``<pdf base name>.txt``, a relative path, so it
    lands in the current working directory.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF to read
            (this function is wired to a Gradio file input).

    Returns:
        A ``(output_file_name, page_number)`` tuple. ``page_number`` is the
        1-based number of the last page that was kept, or ``0`` when no page
        passed the word-count filter.
    """
    # Initialised up front so the final ``return`` cannot hit an unbound
    # local when every page is filtered out (or the PDF has no pages).
    page_number = 0
    kept_pages = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(file.name) as doc:
        for i, page in enumerate(doc):
            # snake_case API: the camelCase aliases (getText, getImageList)
            # are deprecated and absent from newer PyMuPDF releases; this
            # block already relies on the snake_case ``extract_image``.
            parts = [page.get_text()]
            for image_info in page.get_images():
                xref = image_info[0]  # first item of each entry is the xref
                base_image = doc.extract_image(xref)
                picture = Image.open(io.BytesIO(base_image["image"]))
                parts.append(image_to_latex(picture))

            page_text = clean_text("".join(parts))
            if len(page_text.split()) > 5:
                page_number = i + 1
                kept_pages.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    full_text = "".join(kept_pages)
    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8: PDF text routinely contains characters that a
    # platform's default locale encoding cannot represent.
    with open(output_file_name, 'w', encoding="utf-8") as f:
        f.write(full_text)
    return output_file_name, page_number
52
 
 
 
53
  iface = gr.Interface(fn=pdf_to_text,
54
  inputs=gr.inputs.File(label="Your PDF"),
55
  outputs=gr.outputs.File(label="Download TXT"),