bainskarman committed on
Commit
a5c070f
·
verified ·
1 Parent(s): 50ee7cd

Update convert.py

Browse files
Files changed (1) hide show
  1. convert.py +8 -16
convert.py CHANGED
@@ -1,5 +1,4 @@
1
- import fitz
2
- from io import BytesIO
3
  import streamlit as st
4
 
5
  def ExtractPDFText(pdf):
@@ -7,21 +6,14 @@ def ExtractPDFText(pdf):
7
  pdf_bytes = pdf.read()
8
 
9
  try:
10
- pdf_document = fitz.open("dummy.pdf", pdf_bytes)
11
-
12
- # Iterate through pages and extract text
13
- for page_number in range(pdf_document.page_count):
14
- page = pdf_document[page_number]
15
- text = page.get_text()
16
- content += text
17
 
18
  except Exception as e:
19
  st.error(f"Error extracting text from PDF: {e}")
20
-
21
- finally:
22
- if "pdf_document" in locals():
23
- pdf_document.close()
24
-
25
- return content
26
-
27
 
 
 
1
+ import pdfplumber
 
2
  import streamlit as st
3
 
4
  def ExtractPDFText(pdf):
 
6
  pdf_bytes = pdf.read()
7
 
8
  try:
9
+ # Using pdfplumber to read the PDF bytes
10
+ with pdfplumber.open(BytesIO(pdf_bytes)) as pdf_document:
11
+ # Iterate through pages and extract text
12
+ for page in pdf_document.pages:
13
+ text = page.extract_text()
14
+ content += text if text else ""
 
15
 
16
  except Exception as e:
17
  st.error(f"Error extracting text from PDF: {e}")
 
 
 
 
 
 
 
18
 
19
+ return content