Spaces:

BhagatSurya
/

convet_pdf_to_txt

Runtime error

App Files Community

BhagatSurya committed on Jun 20, 2023

Commit

e1bd857

•

1 Parent(s): f772a38

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -58

app.py CHANGED Viewed

@@ -1,67 +1,53 @@
 import gradio as gr
-import nltk
-import PyPDF2
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-import os
 import tempfile
-nltk.download('punkt')
-nltk.download('stopwords')
-# Preparing stop words
-stop_words = set(stopwords.words('english'))
 def clean_text(text):
-    # Tokenizing the text
-    word_tokens = word_tokenize(text)
-    # Removing stop words
-    filtered_text = [w for w in word_tokens if not w in stop_words]
-    # Joining words
-    filtered_text = " ".join(filtered_text)
-    return filtered_text
 def pdf_to_text(file):
-    # Open the PDF file
-    pdf_file = open(file.name, 'rb')
-    # Create PDF reader object
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    full_text = ""
-    # Read each page
-    for i in range(len(pdf_reader.pages)):
-        # Get the page
-        page = pdf_reader.pages[i]
-        # Extract text from the page
-        text = page.extract_text()
-        # Clean the text
-        text = clean_text(text)
-        # Add the text to the full text
-        full_text += text + "\n\n----------\n\n"  # Adding page break
-    # Write the full_text to a temp file
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
-    with open(temp_file.name, 'w') as f:
         f.write(full_text)
-    # Close the PDF file
-    pdf_file.close()
-    # Return the file path
-    return temp_file.name
-iface = gr.Interface(fn=pdf_to_text,
-                     inputs=gr.inputs.File(type="file", label="Your PDF"),
-                     outputs=gr.outputs.File(),  # Changing the output to File type
-                     title="PDF to TXT",
                      description="Convert your PDF files to clean text")
-iface.launch(share=True)

 import gradio as gr
 import tempfile
+import re
+from PyPDF2 import PdfReader, PdfFileReader
+import os
+import spacy
+import pytesseract
+import pdf2image
+import subprocess
+from pdf2image.exceptions import (
+    PDFInfoNotInstalledError,
+    PDFPageCountError,
+    PDFSyntaxError
+)
 def clean_text(text):
+    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
+    text = re.sub(r'\n+', '\n', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+def image_to_latex(image):
+    image_path = "/tmp/equation.png"  # Modify as needed
+    image.save(image_path)
+    result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
+    return result.stdout
 def pdf_to_text(file):
+    with open(file.name, 'rb') as f:
+        reader = PdfReader(f)
+        full_text = ''
+        for i, page in enumerate(reader.pages):
+            page_text = page.extract_text()
+            if page_text is None:
+                images = pdf2image.convert_from_path(file.name, first_page=i+1, last_page=i+2)
+                for image in images:
+                    page_text = image_to_latex(image)
+            page_text = clean_text(page_text)
+            if len(page_text.split()) > 5:
+                page_text = "## Metadata: Page Number " + str(i+1) + "\n" + page_text
+                full_text += page_text + "\n\n"
+    base_name = os.path.splitext(os.path.basename(file.name))[0]
+    output_file_name = base_name + ".txt"
+    with open(output_file_name, 'w') as f:
         f.write(full_text)
+    return output_file_name
+iface = gr.Interface(fn=pdf_to_text,
+                     inputs=gr.inputs.File(label="Your PDF"),
+                     outputs=gr.outputs.File(label="Download TXT"),
+                     title="PDF to TXT",
                      description="Convert your PDF files to clean text")
+iface.launch()