Spaces:

arjun-mavonic
/

scanned-pdf-text-extractor

Running

App Files Files Community

scanned-pdf-text-extractor / app.py

arjun-mavonic

chore(app): removed unused import

056c9af unverified 4 months ago

raw

history blame contribute delete

No virus

3 kB

	import os
	from docx import Document
	import streamlit as st
	from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text

	# Name of the scratch folder that receives the page images of a PDF.
	extracted_images_dir = "extracted_images"

	# Folder containing this script; the working folders below live next to it.
	current_dir = os.path.dirname(os.path.abspath(__file__))

	# uploads/ stores the uploaded PDFs, converted_docx/ stores the generated
	# Word documents.
	uploads_dir, converted_docx_dir = (
	    os.path.join(current_dir, _folder) for _folder in ("uploads", "converted_docx")
	)

	# Make sure both working folders exist before anything reads from them.
	for _required_dir in (uploads_dir, converted_docx_dir):
	    os.makedirs(_required_dir, exist_ok=True)

	# Create a file uploader component
	uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

	# Check if a file was uploaded
	if uploaded_file is not None:
	    # The file name is supplied by the client, so keep only its final path
	    # component. Without this, a name containing directory parts (for example
	    # "../app.py") would be written outside the uploads directory.
	    safe_name = os.path.basename(uploaded_file.name)

	    # Save the uploaded file to the uploads directory
	    with open(os.path.join(uploads_dir, safe_name), "wb") as upload_handle:
	        upload_handle.write(uploaded_file.getbuffer())
	    st.success("File uploaded successfully!")
	else:
	    st.info("Please upload a PDF file.")

	# Get a list of all PDF files in the uploads directory.
	# The extension is compared case-insensitively so that a file saved as
	# "REPORT.PDF" is listed too, and the names are sorted because os.listdir
	# returns entries in arbitrary order.
	pdf_files = sorted(
	    name for name in os.listdir(uploads_dir) if name.lower().endswith(".pdf")
	)

	# Create a column layout
	col1, col2 = st.columns(2)

	# Show one checkbox per PDF file in col1 and collect the ticked file names
	with col1:
	    selected_files = []
	    for pdf_name in pdf_files:
	        if st.checkbox(pdf_name):
	            selected_files.append(pdf_name)

	# Offer the conversion button only when at least one file is selected, and
	# run the conversion when that button is pressed.
	if selected_files and st.button("Convert"):
	    # Use one absolute location for the scratch images in every step, so the
	    # folder that is emptied, the folder the images are written to and the
	    # folder the text is read from are the same even when the process's
	    # working directory is not the script directory.
	    image_folder = os.path.join(current_dir, extracted_images_dir)

	    for file in selected_files:
	        pdf_path = os.path.join(uploads_dir, file)
	        # splitext removes only the trailing extension; str.replace would also
	        # strip ".pdf" occurring in the middle of the file name.
	        docx_name = f"{os.path.splitext(file)[0]}.docx"
	        output_docx = os.path.join(converted_docx_dir, docx_name)

	        # Empty the scratch folder for every PDF so that images left over
	        # from a previously converted file do not end up in this document.
	        print(f"Creating or emptying the {extracted_images_dir} directory")
	        create_or_empty_dir(image_folder)

	        # Convert the selected PDF file to images
	        print(f"Converting {file} to images in {extracted_images_dir}")
	        convert_pdf_to_images(pdf_path, image_folder)

	        # Create a Word document with text extracted from images
	        print(
	            f"Creating {docx_name} with text extracted from images in the {extracted_images_dir}"
	        )
	        create_docx_with_text(image_folder, output_docx)

	    st.success("Conversion completed successfully!")

	# Show documents from the converted_docx folder in col2
	with col2:
	    # Sorted because os.listdir returns entries in arbitrary order; this keeps
	    # the download buttons in a stable order between reruns.
	    docx_files = sorted(
	        name for name in os.listdir(converted_docx_dir) if name.endswith(".docx")
	    )
	    for docx_name in docx_files:
	        # Read the document inside a context manager so the file handle is
	        # closed right away instead of being left open on every rerun.
	        with open(os.path.join(converted_docx_dir, docx_name), "rb") as docx_handle:
	            docx_bytes = docx_handle.read()

	        st.download_button(
	            f"Download {docx_name}",
	            docx_bytes,
	            file_name=docx_name,
	            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        )