import os
from uuid import uuid4

from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
from loguru import logger

from .base import Document

# Load variables from a .env file before the API key is read below.
load_dotenv()

# set up parser
parser = LlamaParse(
    api_key=os.getenv("LLAMA_PARSE_API_KEY"),
    result_type="markdown",  # "markdown" and "text" are available
)


def convert_pdf_to_text(filepaths: list[str]) -> Document:
    """Parse the given files and merge their text into a single Document.

    Files with a ``.pdf`` extension are routed through the module-level
    ``parser``; everything the reader returns is joined with single spaces
    into one text body.

    Args:
        filepaths: Paths of the files to load. Must not be empty.

    Returns:
        A ``Document`` with a freshly generated ``document_id``, the combined
        text, and ``metadata["filename"]`` set to the base name of the FIRST
        path in ``filepaths`` (later paths are not recorded in the metadata).

    Raises:
        ValueError: If ``filepaths`` is empty.
        Exception: Any error raised while loading/parsing is logged and
            re-raised unchanged.
    """
    try:
        if not filepaths:
            # Fail early with a clear message instead of reaching
            # ``filepaths[0]`` (or the reader) with nothing to process.
            raise ValueError("filepaths must contain at least one path")

        file_extractor = {".pdf": parser}
        # use SimpleDirectoryReader to parse our file
        documents = SimpleDirectoryReader(
            input_files=filepaths, file_extractor=file_extractor
        ).load_data()
        # Report the real count; the reader may return several documents
        # (and several input files may have been passed in).
        logger.info(f"Converted {len(documents)} documents")

        return Document(
            document_id=uuid4(),
            text=" ".join(document.text for document in documents),
            # basename handles the platform's path separators, unlike a
            # hard-coded split on "/".
            metadata={"filename": os.path.basename(filepaths[0])},
        )
    except Exception as e:
        logger.error(f"Error converting PDF to text: {e}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise