# Spaces: Sleeping  (hosting-page status text captured with this file; not program code)
import os
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
def extract_text(file_path):
    """Extract text from a PDF two ways, display the combined result, and return it.

    The PDF is read once with PyPDF2 to collect each page's text, and then
    every page is converted to an image and passed through Tesseract OCR.
    The page text (all pages) is followed by the OCR text (all pages); the
    combined string is written to the Streamlit page.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The combined text that was displayed.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe should a page yield None instead of a
        # string (e.g. a page without extractable text).
        embedded_parts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Convert PDF pages to images, then OCR each image.
    page_images = convert_from_path(file_path)
    ocr_parts = [pytesseract.image_to_string(image) for image in page_images]

    text = "".join(embedded_parts) + "".join(ocr_parts)
    st.write("plus")
    st.write(text)  # Display the extracted text (embedded text + OCR text)
    return text
def _extract_uploaded_pdf(pdf_bytes):
    """Write *pdf_bytes* to a temporary PDF, run extract_text on it, then delete it."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(pdf_bytes)
    # The file is closed (and therefore flushed) before it is reopened by
    # path; delete=False is what lets it outlive the `with` block.
    try:
        extract_text(temp_file.name)
    finally:
        # Always remove the temporary copy, even if extraction fails.
        os.unlink(temp_file.name)


def main():
    """Render the upload UI and wire the "Extract Text" button to the extractor.

    The temporary copy of the upload is created only when the button is
    clicked and is removed once extraction finishes, so script reruns do not
    accumulate temporary files.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.success("File successfully uploaded. Click below to extract text.")
    st.button(
        "Extract Text",
        on_click=_extract_uploaded_pdf,
        # getvalue() returns the whole buffer regardless of the current read
        # position, unlike read().
        args=(uploaded_file.getvalue(),),
    )
# Run the app only when this file is the entry script, not when it is imported.
if __name__ == "__main__":
    main()