Spaces:
Sleeping
Sleeping
File size: 1,473 Bytes
fb4537e 1d31e91 fb4537e 1d31e91 ba24d1a fb4537e 9108bfe fb4537e 1d31e91 0f776bc 5e91093 1d31e91 fb4537e 1d31e91 79de481 381c06c 020d5d0 18df6af 9108bfe 79de481 020d5d0 0f776bc 24e2eef d3bbe1d 24e2eef 020d5d0 0f776bc d3bbe1d 79de481 18df6af fb4537e 1d31e91 fb4537e 1d31e91 8855996 fb4537e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import streamlit as st
import tempfile
import pytesseract
import PyPDF2
from pdf2image import convert_from_path
from PIL import Image
def extract_text(file_path):
    """Extract text from a PDF and display it in the Streamlit app.

    Two passes are made over the document: the embedded text layer is read
    with PyPDF2, then every page is rasterised with pdf2image and run through
    Tesseract OCR. The OCR output is appended after the text-layer output, so
    content present in both may appear twice.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The combined text (text layer followed by OCR output). The return
        value is ignored when this function is used as a Streamlit
        ``on_click`` callback.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` keeps the join safe if a page yields no text at all.
        layer_parts = [page.extract_text() or "" for page in pdf_reader.pages]

    # convert_from_path works from the path, not from the handle above, so
    # the file does not need to stay open during the (slow) OCR pass.
    page_images = convert_from_path(file_path)
    ocr_parts = [pytesseract.image_to_string(image) for image in page_images]

    text = "".join(layer_parts) + "".join(ocr_parts)

    # NOTE(review): "plus" looks like a leftover debug marker; it is kept
    # because it is part of the currently visible output.
    st.write("plus")
    st.write(text)
    return text
def main():
    """Render the upload UI and wire the extract button to ``extract_text``.

    The uploaded PDF is copied to a temporary file so that path-based tools
    can read it; the path is handed to ``extract_text`` through the button's
    ``on_click`` callback.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # delete=False is deliberate: the button callback runs on a later script
    # rerun, after this ``with`` block has closed the file.
    # NOTE(review): nothing here removes the temp file afterwards, and a new
    # one is written on every rerun while a file is uploaded.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        # getvalue() returns the whole buffer regardless of the current read
        # position, unlike read(), which can come back empty if the stream
        # was already consumed.
        temp_file.write(uploaded_file.getvalue())

    st.success("File successfully uploaded. Click below to extract text.")
    st.button("Extract Text", on_click=extract_text, args=(temp_file.name,))
# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()