BhagatSurya committed on
Commit
e1bd857
1 Parent(s): f772a38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -58
app.py CHANGED
@@ -1,67 +1,53 @@
1
  import gradio as gr
2
- import nltk
3
- import PyPDF2
4
- from nltk.corpus import stopwords
5
- from nltk.tokenize import word_tokenize
6
- import os
7
  import tempfile
8
-
9
- nltk.download('punkt')
10
- nltk.download('stopwords')
11
-
12
- # Preparing stop words
13
- stop_words = set(stopwords.words('english'))
 
 
 
 
 
 
14
 
def clean_text(text):
    """Return ``text`` with English stop words removed.

    The input is tokenized with NLTK's ``word_tokenize``; every token that
    appears in the module-level ``stop_words`` set is dropped, and the
    remaining tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
26
-
def pdf_to_text(file):
    """Convert an uploaded PDF into a cleaned plain-text file.

    Every page is extracted with PyPDF2, passed through ``clean_text`` and
    followed by a dashed page-break marker.

    Args:
        file: Upload object exposing the PDF's path as ``file.name``.

    Returns:
        Path of a newly created ``.txt`` temporary file holding the text.
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    page_break = "\n\n----------\n\n"

    # ``with`` releases the PDF handle even when a page fails to parse.
    with open(file.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the handle is still open; the chunks are joined
        # once afterwards instead of growing a string page by page.
        chunks = [
            clean_text(page.extract_text()) + page_break
            for page in pdf_reader.pages
        ]

    full_text = "".join(chunks)

    # Write through the temporary file's own handle so it is closed exactly
    # once (the file used to be reopened by name while the first handle
    # stayed open). UTF-8 keeps non-ASCII PDF text writable on any locale.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix=".txt", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(full_text)

    return temp_file.name
60
-
# Gradio front end: one uploaded PDF in, one downloadable text file out.
pdf_input = gr.inputs.File(type="file", label="Your PDF")
txt_output = gr.outputs.File()  # the result is served as a file download
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=pdf_input,
    outputs=txt_output,
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)

# share=True asks Gradio for a public share link in addition to the local one.
iface.launch(share=True)
 
1
  import gradio as gr
 
 
 
 
 
2
  import tempfile
3
+ import re
4
+ from PyPDF2 import PdfReader, PdfFileReader
5
+ import os
6
+ import spacy
7
+ import pytesseract
8
+ import pdf2image
9
+ import subprocess
10
+ from pdf2image.exceptions import (
11
+ PDFInfoNotInstalledError,
12
+ PDFPageCountError,
13
+ PDFSyntaxError
14
+ )
15
 
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Args:
        text: Raw page text. ``None`` or an empty string yields ``""``.

    Returns:
        The text with newlines, tabs and repeated spaces replaced by single
        spaces, and with leading/trailing whitespace removed.
    """
    if not text:
        return ""
    # No spaCy pipeline is needed for whitespace normalisation, so none is
    # loaded here: loading a model on every call is slow and raises when the
    # model is not installed. A single ``\s+`` pass also covers newline runs.
    return re.sub(r'\s+', ' ', text).strip()
21
+
def image_to_latex(image):
    """Run the ``pix2tex`` command-line tool on an image and return its output.

    Args:
        image: Object with a ``save(path)`` method (presumably a PIL image
            produced by pdf2image -- confirm against the caller).

    Returns:
        The text ``pix2tex`` wrote to standard output. The exit status is
        not checked, so a failed run returns whatever was printed (possibly
        an empty string).
    """
    # Use a private scratch directory per call: a fixed /tmp path would let
    # concurrent requests overwrite each other's image, and the directory is
    # removed again as soon as the tool has finished.
    with tempfile.TemporaryDirectory() as scratch_dir:
        image_path = os.path.join(scratch_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout
 
def pdf_to_text(file):
    """Extract the text of an uploaded PDF into a ``.txt`` file.

    Each page is cleaned with ``clean_text``; pages with more than five
    words are written under a ``## Metadata: Page Number N`` header line.
    Pages for which PyPDF2 returns no text are rendered to images and
    passed through ``image_to_latex`` instead.

    Args:
        file: Upload object exposing the PDF's path as ``file.name``.

    Returns:
        Path of the written text file. It is named after the PDF
        (``<basename>.txt``) and placed in a fresh temporary directory.
    """
    page_chunks = []
    with open(file.name, 'rb') as f:
        reader = PdfReader(f)
        # Pages are read while the handle is still open.
        for i, page in enumerate(reader.pages):
            page_number = i + 1
            page_text = page.extract_text()
            # NOTE(review): recent PyPDF2 releases appear to return "" rather
            # than None for image-only pages -- confirm whether this fallback
            # should also trigger on empty text.
            if page_text is None:
                # pdf2image page numbers are 1-based and inclusive, so
                # first == last renders exactly this page (``i + 2`` also
                # pulled in the following page).
                images = pdf2image.convert_from_path(
                    file.name, first_page=page_number, last_page=page_number
                )
                # Keep the output of every image, not just the last one.
                page_text = "\n".join(
                    image_to_latex(image) for image in images
                )
            page_text = clean_text(page_text)
            # NOTE(review): the diff view lost indentation; this assumes
            # pages with five or fewer words are skipped entirely -- confirm.
            if len(page_text.split()) > 5:
                page_chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )
    full_text = "".join(page_chunks)

    base_name = os.path.splitext(os.path.basename(file.name))[0]
    # A per-call directory keeps the friendly file name while preventing
    # two uploads with the same name from overwriting each other.
    output_dir = tempfile.mkdtemp()
    output_file_name = os.path.join(output_dir, base_name + ".txt")
    # UTF-8 keeps non-ASCII PDF text writable regardless of the locale.
    with open(output_file_name, 'w', encoding="utf-8") as f:
        f.write(full_text)
    return output_file_name
46
 
 
 
47
 
# Gradio front end: one uploaded PDF in, one downloadable text file out.
pdf_input = gr.inputs.File(label="Your PDF")
txt_output = gr.outputs.File(label="Download TXT")
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=pdf_input,
    outputs=txt_output,
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()