# tbot-space / pdf-textbook-to-json.py
# (file-viewer header: ajayvaria · "PDF to json converter" · a24a077 · 1.14 kB)
#
# Reads "tbot-dataset/" + name + ".pdf"
# Outputs "tbot-dataset/" + name + ".json"
#
import PyPDF2
from langchain.schema import Document
import json
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are processed in order and each page's text is followed by a
    newline.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The concatenated page texts, or an empty string when the reader
        reports no pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with ``+=`` per page.
        # The join runs inside the ``with`` block so every page is extracted
        # while the underlying file is still open.
        # ``or ""`` treats a page whose extraction result is None as empty
        # rather than failing with a TypeError on ``None + "\n"``.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
def create_langchain_document(page_content, metadata=None):
    """Wrap text and optional metadata in a LangChain ``Document``.

    Args:
        page_content: Text stored as the document's ``page_content``.
        metadata: Mapping stored as the document's ``metadata``. When it is
            omitted (``None``), an empty dict is used instead.

    Returns:
        The newly constructed ``Document``.
    """
    # ``Document`` declares ``metadata`` as a dict (see the LangChain API
    # reference), so the ``None`` default must not be forwarded verbatim;
    # a fresh empty dict is passed in its place.
    if metadata is None:
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)
def save_document_to_json(document, json_path):
    """Serialize a document to a JSON file.

    Args:
        document: Object whose ``dict()`` method returns the JSON-serializable
            data to store.
        json_path: Path of the file to write; it is opened in text write
            mode, so existing contents are replaced.
    """
    with open(json_path, 'w') as sink:
        # The target is opened first, then the document is converted and
        # streamed into it.
        payload = document.dict()
        json.dump(payload, sink)
# Input PDF and output JSON share one stem and differ only in suffix
name = "PFC ebook"
dataset_stem = "tbot-dataset/" + name
pdf_path = dataset_stem + ".pdf"
json_path = dataset_stem + ".json"

# Pull the text out of the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Wrap the text in a LangChain document; the metadata records the PDF path
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(
    page_content=extracted_text,
    metadata=metadata,
)

# Write the document to disk as JSON and report where it went
save_document_to_json(document=langchain_document, json_path=json_path)
print(f"Document saved to {json_path}")