from llama_parse import LlamaParse
from dotenv import load_dotenv
import os
import streamlit as st

# Load variables from a .env file (if present) into the process environment
# so the API key lookup below can see them.
load_dotenv()

# LlamaParse API key, taken from the LLAMA_PARSE environment variable.
# This is None when the variable is not set.
LLAMA_PARSE = os.environ.get("LLAMA_PARSE")

# Module-level parser instance shared by the helpers in this file.
parser = LlamaParse(
    api_key=LLAMA_PARSE,
    result_type="text",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files are passed, split into `num_workers` API calls
    verbose=True,
    language="en",  # optional; the default is "en"
)

@st.cache_data
def extract_text(pdf_path) -> str:
    """Parse a document with the module-level LlamaParse parser and return its text.

    The function is wrapped in ``st.cache_data``, so Streamlit stores the
    returned string and reuses it for later calls with the same argument.
    NOTE(review): the cache is keyed on the argument value, not on the file
    contents, so a file that changes on disk under the same path would
    presumably still return the previously cached text -- confirm this is
    acceptable for callers.

    Args:
        pdf_path: Passed unchanged to ``parser.load_data``. Presumably a
            filesystem path to a PDF (verify against callers).

    Returns:
        The ``text`` of every parsed document joined with newlines, with
        leading and trailing whitespace removed. An empty string when the
        parser returns no documents.
    """
    documents = parser.load_data(pdf_path)
    # Join once instead of growing a string with ``+=`` inside a loop.
    # ``strip()`` trims whitespace at both ends of the combined text (not
    # only a trailing newline), which matches the previous behavior.
    return "\n".join(document.text for document in documents).strip()

# combined_text = extract_text("/app/Non_form_pdfs/chapter-17-web-designing2.pdf")
# print(combined_text)