Spaces:

huggingchat
/

document-parser

Running on Zero

App Files Files Community

document-parser / app.py

Liam Dyer

feat: pdf and plain text support

6c400a9 unverified 2 months ago

raw

history blame

No virus

2.45 kB

	import gradio as gr
	import spaces
	import subprocess
	import os
	import string
	import random
	from pypdf import PdfReader
	import ocrmypdf


	def random_word(length):
	letters = string.ascii_lowercase
	return "".join(random.choice(letters) for _ in range(length))


	def convert_pdf(input_file):
	reader = PdfReader(input_file)
	metadata = extract_metadata_from_pdf(reader)
	text = extract_text_from_pdf(reader)

	# Check if there are any images
	image_count = 0
	for page in reader.pages:
	image_count += len(page.images)

	# If there are images and not much content, perform OCR on the document
	if image_count > 0 and len(text) < 1000:
	out_pdf_file = input_file.replace(".pdf", "_ocr.pdf")
	ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)

	# Re-extract text
	text = extract_text_from_pdf(PdfReader(input_file))

	# Delete the OCR file
	os.remove(out_pdf_file)

	return text, metadata


	def extract_text_from_pdf(reader):
	full_text = ""
	for idx, page in enumerate(reader.pages):
	text = page.extract_text()
	if len(text) > 0:
	full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"

	return full_text.strip()


	def extract_metadata_from_pdf(reader):
	return {
	"author": reader.metadata.author,
	"creator": reader.metadata.creator,
	"producer": reader.metadata.producer,
	"subject": reader.metadata.subject,
	"title": reader.metadata.title,
	}


	def convert_pandoc(input_file):
	# Convert the file to markdown with pandoc
	output_file = f"{random_word(16)}.md"
	result = subprocess.call(f"pandoc {input_file} -t markdown -o {output_file}")

	# Read the file and delete
	with open(output_file, "r") as f:
	markdown = f.read()
	os.remove(output_file)

	return markdown


	@spaces.GPU
	def convert(input_file):
	plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
	# Already a plain text file that wouldn't benefit from pandoc so return the content
	if any(input_file.endswith(ft) for ft in plain_text_filetypes):
	with open(input_file, "r") as f:
	return f.read()

	if input_file.endswith(".pdf"):
	return convert_pdf(input_file)

	return convert_pandoc(input_file)


	gr.Interface(
	convert,
	inputs=gr.File(label="Upload File", type="filepath"),
	outputs=gr.Text(label="Markdown"),
	).launch()