Spaces:
Sleeping
Sleeping
#
# Converts a PDF into a LangChain Document and stores it as JSON.
# Reads "tbot-dataset/" + name + ".pdf"
# Outputs "tbot-dataset/" + name + ".json"
#
import PyPDF2 | |
from langchain.schema import Document | |
import json | |
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        A single string in which each page's extracted text is followed by a
        newline. A PDF with no pages yields an empty string.

    Raises:
        OSError: If the file at ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with ``+=`` for every page.
        # A falsy extraction result is treated as empty text so that a page
        # without extractable text cannot break the concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
def create_langchain_document(page_content, metadata=None):
    """Build a LangChain ``Document`` from text and optional metadata.

    Args:
        page_content: Text to store as the document's ``page_content``.
        metadata: Optional metadata mapping. When omitted (``None``), an
            empty dict is passed to ``Document`` instead.

    Returns:
        The newly constructed ``Document``.
    """
    # NOTE(review): Document's ``metadata`` field is presumably dict-typed and
    # rejects ``None`` -- confirm against the installed langchain version.
    if metadata is None:
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)
def save_document_to_json(document, json_path):
    """Write the dictionary form of ``document`` to ``json_path`` as JSON.

    Args:
        document: Object exposing a ``dict()`` method whose result is
            JSON-serializable.
        json_path: Destination file path; the file is opened in write mode,
            so existing content is replaced.
    """
    with open(json_path, mode="w") as out_file:
        # The file is opened before ``dict()`` is called, matching the
        # original order of operations.
        payload = document.dict()
        json.dump(payload, out_file)
# The input PDF and the output JSON share one base name in the dataset folder.
name = "PFC ebook"
pdf_path = f"tbot-dataset/{name}.pdf"
json_path = f"tbot-dataset/{name}.json"

# Pull the text of every page out of the PDF.
extracted_text = extract_text_from_pdf(pdf_path)

# Wrap the text in a LangChain document that records the source PDF path.
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(
    page_content=extracted_text,
    metadata=metadata,
)

# Persist the document as JSON and report where it was written.
save_document_to_json(document=langchain_document, json_path=json_path)
print(f"Document saved to {json_path}")