File size: 1,473 Bytes
fb4537e
1d31e91
fb4537e
1d31e91
ba24d1a
fb4537e
9108bfe
fb4537e
 
1d31e91
0f776bc
 
5e91093
1d31e91
 
 
fb4537e
1d31e91
79de481
381c06c
020d5d0
18df6af
9108bfe
 
79de481
020d5d0
0f776bc
24e2eef
 
d3bbe1d
24e2eef
 
020d5d0
0f776bc
d3bbe1d
79de481
18df6af
fb4537e
1d31e91
 
fb4537e
1d31e91
 
 
 
8855996
fb4537e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image



def extract_text(file_path):
    """Extract text from a PDF and display it in the Streamlit app.

    Two passes are made over the file: the embedded text layer is read with
    PyPDF2, then every page is rendered to an image with pdf2image and run
    through ``pytesseract.image_to_string``. The text-layer output followed
    by the OCR output is written to the page with ``st.write``.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The combined text (text layer first, then OCR output) as one string.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Defensive ``or ""``: if a page yields None instead of a string,
        # plain concatenation would raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Render each PDF page to an image and OCR it.
    image_text = "".join(
        pytesseract.image_to_string(image)
        for image in convert_from_path(file_path)
    )

    combined = text + image_text
    st.write("plus")
    st.write(combined)  # Display text-layer output followed by OCR output
    return combined

def main():
    """Render the upload UI and run text extraction when requested.

    Shows a PDF uploader; once a file is present, shows a success message
    and an "Extract Text" button. When the button is pressed, the upload is
    written to a temporary ``.pdf`` file, passed to ``extract_text``, and
    the temporary file is removed afterwards.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.success("File successfully uploaded. Click below to extract text.")
    if not st.button("Extract Text"):
        return

    # Write the temp file only when extraction is requested, so script
    # reruns triggered by other interactions do not pile up temp files.
    # getvalue() returns the full upload regardless of the buffer's current
    # read position, unlike read().
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_path = temp_file.name

    # The file is closed (and so flushed) before it is reopened by path.
    try:
        extract_text(temp_path)
    finally:
        os.unlink(temp_path)


# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()