Spaces:

neerajkalyank
/

toshiba_2.O

Sleeping

neerajkalyank committed on Nov 12

Commit

a72b612

•

1 Parent(s): ea3f04a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,31 +1,21 @@
 import gradio as gr
-import pytesseract
 import pandas as pd
-from io import BytesIO
-import fitz  # PyMuPDF
 import re
-from PIL import Image
 import tempfile
-# Explicitly set the Tesseract path
-pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 def extract_data_from_pdf(pdf_file):
-    # Open the PDF file using the path provided by gr.File
-    doc = fitz.open(pdf_file.name)
     text_data = []
-    # Process each page in the PDF using Tesseract OCR
-    for page_num in range(doc.page_count):
-        page = doc[page_num]
-        pix = page.get_pixmap()  # Render page to a Pixmap image
-        # Convert Pixmap to PIL Image
-        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        # Use Tesseract to extract text from the image
-        text = pytesseract.image_to_string(image)
-        text_data.append(text)
     # Initialize list for parsed data
     data = []

 import gradio as gr
+import pdfplumber
 import pandas as pd
 import re
+from io import BytesIO
 import tempfile
 def extract_data_from_pdf(pdf_file):
+    # Initialize list to hold text from each page
     text_data = []
+    # Open the PDF file with pdfplumber
+    with pdfplumber.open(pdf_file) as pdf:
+        for page in pdf.pages:
+            # Extract text from each page
+            text = page.extract_text()
+            if text:
+                text_data.append(text)
     # Initialize list for parsed data
     data = []