BhagatSurya committed on
Commit
2342442
1 Parent(s): 079df7d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tempfile
3
+ import re
4
+ from PyPDF2 import PdfReader, PdfFileReader
5
+ import os
6
+ import spacy
7
+ import pytesseract
8
+ import pdf2image
9
+ import subprocess
10
+ from pdf2image.exceptions import (
11
+ PDFInfoNotInstalledError,
12
+ PDFPageCountError,
13
+ PDFSyntaxError
14
+ )
15
+
16
def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim it.

    Newlines, tabs and repeated spaces are all treated alike, so the result
    is a single line with single-space separators.

    Args:
        text: The raw string to normalise. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The normalised string, without leading or trailing whitespace.
    """
    if not text:
        return ""
    # ``\s+`` already matches newlines, so one substitution is enough.
    # No language model is needed for this; loading one on every call only
    # cost time and memory.
    return re.sub(r"\s+", " ", text).strip()
21
+
22
def image_to_latex(image):
    """Run the ``pix2tex`` command-line tool on *image* and return its output.

    Args:
        image: An object exposing ``save(path)`` that writes the picture to
            the given path (``pdf_to_text`` passes the page images produced
            by ``pdf2image``).

    Returns:
        The text ``pix2tex`` wrote to standard output, unmodified.

    Raises:
        FileNotFoundError: If the ``pix2tex`` executable is not on ``PATH``.
    """
    # A private temporary directory gives each call its own file, so
    # concurrent requests cannot overwrite one another's image, and the
    # file is removed when the block exits, even if an error occurs.
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout
27
+
28
def pdf_to_text(file):
    """Extract the text of a PDF into a ``.txt`` file and return its path.

    Each page's text is taken from the PDF's text layer. When a page yields
    no text, the page is rasterised with ``pdf2image`` and passed to
    ``image_to_latex`` instead. Pages whose cleaned text has five words or
    fewer are skipped; every kept page is prefixed with its 1-based page
    number.

    Args:
        file: Either a path (``str`` / ``os.PathLike``) to the PDF or an
            object whose ``name`` attribute holds that path.

    Returns:
        The name of the text file that was written. It is created in the
        current working directory and named after the PDF with a ``.txt``
        extension.
    """
    # Accept a plain path as well as an upload wrapper exposing ``.name``.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = os.fspath(file)
    else:
        pdf_path = file.name

    sections = []
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        for page_number, page in enumerate(reader.pages, start=1):
            # Normalise a missing result to "" so the emptiness test below
            # covers both ``None`` and an empty/whitespace-only string.
            page_text = page.extract_text() or ''
            if not page_text.strip():
                # first_page/last_page are both inclusive: rasterise only
                # this page, not the following one as well.
                images = pdf2image.convert_from_path(
                    pdf_path, first_page=page_number, last_page=page_number
                )
                # Keep the output for every image instead of letting a
                # later one overwrite an earlier one.
                page_text = '\n'.join(image_to_latex(image) for image in images)
            page_text = clean_text(page_text)
            if len(page_text.split()) > 5:
                # Adding the page number as part of the page text
                sections.append(f"Page Number: {page_number}\n{page_text}\n\n")

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII characters are written correctly
    # regardless of the platform's default encoding.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(''.join(sections))
    return output_file_name
47
+
48
# Web UI: upload one PDF, get back the extracted text as a downloadable file.
# ``gr.File`` is used for both sides because the ``gr.inputs`` / ``gr.outputs``
# namespaces are deprecated and absent from current Gradio releases.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=gr.File(label="Your PDF"),
    outputs=gr.File(label="Download TXT"),
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()