neerajkalyank committed on
Commit
a72b612
1 Parent(s): ea3f04a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -20
app.py CHANGED
@@ -1,31 +1,21 @@
1
  import gradio as gr
2
- import pytesseract
3
  import pandas as pd
4
- from io import BytesIO
5
- import fitz # PyMuPDF
6
  import re
7
- from PIL import Image
8
  import tempfile
9
 
10
- # Explicitly set the Tesseract path
11
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
12
-
13
  def extract_data_from_pdf(pdf_file):
14
- # Open the PDF file using the path provided by gr.File
15
- doc = fitz.open(pdf_file.name)
16
  text_data = []
17
 
18
- # Process each page in the PDF using Tesseract OCR
19
- for page_num in range(doc.page_count):
20
- page = doc[page_num]
21
- pix = page.get_pixmap() # Render page to a Pixmap image
22
-
23
- # Convert Pixmap to PIL Image
24
- image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
25
-
26
- # Use Tesseract to extract text from the image
27
- text = pytesseract.image_to_string(image)
28
- text_data.append(text)
29
 
30
  # Initialize list for parsed data
31
  data = []
 
1
  import gradio as gr
2
+ import pdfplumber
3
  import pandas as pd
 
 
4
  import re
5
+ from io import BytesIO
6
  import tempfile
7
 
 
 
 
8
  def extract_data_from_pdf(pdf_file):
9
+ # Initialize list to hold text from each page
 
10
  text_data = []
11
 
12
+ # Open the PDF file with pdfplumber
13
+ with pdfplumber.open(pdf_file) as pdf:
14
+ for page in pdf.pages:
15
+ # Extract text from each page
16
+ text = page.extract_text()
17
+ if text:
18
+ text_data.append(text)
 
 
 
 
19
 
20
  # Initialize list for parsed data
21
  data = []