Spaces:

teacher-bot
/

tbot-space

Sleeping

tbot-space / pdf-textbook-to-json.py

PDF to json converter

a24a077 10 months ago

1.14 kB

	#
	# Converts a PDF textbook into a LangChain Document serialized as JSON.
	# Reads   "tbot-dataset/" + name + ".pdf"
	# Outputs "tbot-dataset/" + name + ".json"
	#
	import PyPDF2
	from langchain.schema import Document
	import json

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF as one string.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The text extracted from each page, in page order, with a newline
	        appended after every page's text.
	    """
	    with open(pdf_path, "rb") as file:
	        reader = PyPDF2.PdfReader(file)
	        # Build the result with a single join instead of growing a string
	        # with += once per page. The join is fully consumed before the
	        # file is closed on leaving the with-block.
	        # NOTE(review): `or ""` guards against extract_text() returning
	        # None for a page with no text; confirm whether the installed
	        # PyPDF2 release can actually do that.
	        return "".join(
	            (page.extract_text() or "") + "\n" for page in reader.pages
	        )

	def create_langchain_document(page_content, metadata=None):
	    """Wrap text and optional metadata in a LangChain ``Document``.

	    Args:
	        page_content: Text to store as the document body.
	        metadata: Optional dict of metadata. When omitted (or None) an
	            empty dict is used instead.

	    Returns:
	        The ``Document`` built from the given content and metadata.
	    """
	    # NOTE(review): Document types ``metadata`` as a dict, so forwarding the
	    # None default is expected to fail validation; confirm against the
	    # pinned LangChain version. A fresh dict is created per call so no
	    # state is shared between documents.
	    if metadata is None:
	        metadata = {}
	    return Document(page_content=page_content, metadata=metadata)

	def save_document_to_json(document, json_path):
	    """Serialize a document to a JSON file.

	    Args:
	        document: Object exposing a ``dict()`` method whose result is
	            JSON-serializable.
	        json_path: Path of the file to write; opened in "w" mode, so any
	            existing content is replaced.
	    """
	    # Pin the encoding so the written file does not depend on the
	    # platform's default locale encoding.
	    with open(json_path, "w", encoding="utf-8") as json_file:
	        json.dump(document.dict(), json_file)

	# Input PDF and output JSON share one base name inside the dataset folder
	name = "PFC ebook"
	pdf_path, json_path = (
	    "tbot-dataset/" + name + suffix for suffix in (".pdf", ".json")
	)

	# Pull the text of every page out of the PDF
	extracted_text = extract_text_from_pdf(pdf_path)

	# Wrap the text in a LangChain document, recording which file it came from
	metadata = dict(source=pdf_path)
	langchain_document = create_langchain_document(
	    page_content=extracted_text,
	    metadata=metadata,
	)

	# Write the document to disk as JSON and report where it went
	save_document_to_json(document=langchain_document, json_path=json_path)

	print(f"Document saved to {json_path}")