# pdf_reader_try / app.py
# kxx-kkk's picture
# Upload app.py
# 24e2eef verified
import streamlit as st
import tempfile
import pytesseract
import PyPDF2
from pdf2image import convert_from_path
from PIL import Image
def extract_text(file_path):
    """Extract text from a PDF and display it in the Streamlit app.

    Two passes are made over the document. First, PyPDF2 pulls the text
    returned by ``extract_text()`` for every page. Second, pdf2image
    converts the PDF pages to images and each image is run through
    Tesseract OCR. The OCR output is appended after the PyPDF2 output and
    the combined string is written to the page.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The combined text: PyPDF2 text for all pages followed by the OCR
        text for all pages. Callers that ignore the return value (such as
        a Streamlit ``on_click`` callback) see the same behavior as before.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file handle is still open; iterate the pages
        # directly instead of indexing by page number.
        embedded_text = "".join(
            page.extract_text() for page in pdf_reader.pages
        )

    # Convert PDF pages to images, then OCR each one in page order.
    page_images = convert_from_path(file_path)
    ocr_text = "".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )

    text = embedded_text + ocr_text
    st.write("plus")
    st.write(text)  # Display the combined embedded + OCR text
    return text
def main():
    """Render the upload UI and wire the "Extract Text" button.

    Shows a title and a PDF file uploader. Once a file is uploaded, its
    bytes are saved to a named temporary file and a button is shown whose
    ``on_click`` callback is ``extract_text`` with that file's path.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # delete=False keeps the file on disk after the ``with`` block closes
    # it, so the path is still valid when the button callback opens it.
    # NOTE: nothing in this function removes the file afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        # getvalue() returns the whole upload regardless of the buffer's
        # current read position, unlike read().
        temp_file.write(uploaded_file.getvalue())

    # The file is flushed and closed before the button is created.
    st.success("File successfully uploaded. Click below to extract text.")
    st.button("Extract Text", on_click=extract_text, args=(temp_file.name,))
# Script entry point: build the Streamlit page when this file is run directly.
if __name__ == "__main__":
    main()