# DocQA / non_form_llama_parse.py
# Author: likhithv
# Commit: initial commit (cc9f92c)
# Size: 783 bytes
from llama_parse import LlamaParse
from dotenv import load_dotenv
import os
import streamlit as st
# Run python-dotenv's loader first so the key lookup below can see any
# variables it adds to the process environment.
load_dotenv()

# API key handed to LlamaParse; None when LLAMA_PARSE is not set.
LLAMA_PARSE = os.environ.get('LLAMA_PARSE')

# Module-level parser instance, created once at import time.
parser = LlamaParse(
    api_key=LLAMA_PARSE,
    result_type="text",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",  # optionally you can define a language, default=en
)
@st.cache_data
def extract_text(pdf_path) -> str:
    """Parse ``pdf_path`` with the module-level parser and return its text.

    The input is handed to ``parser.load_data``; the ``text`` of every
    returned document is joined with newlines into a single string.

    Args:
        pdf_path: Whatever ``parser.load_data`` accepts; the usage example
            in this file passes a single filesystem path string.

    Returns:
        The concatenated document text with leading and trailing
        whitespace removed (``str.strip()`` strips all surrounding
        whitespace, not only a final newline). An empty string when no
        documents are returned.

    Note:
        The function is wrapped in ``st.cache_data``.
        NOTE(review): the cache is presumably keyed on the ``pdf_path``
        value, so a file modified in place under the same path may yield
        a stale result -- confirm against the Streamlit caching docs.
    """
    documents = parser.load_data(pdf_path)
    # Join once instead of growing a string with `+=` in a loop; the
    # result is the same because strip() drops the trailing newline the
    # original loop appended.
    return "\n".join(document.text for document in documents).strip()
# combined_text = extract_text("/app/Non_form_pdfs/chapter-17-web-designing2.pdf")
# print(combined_text)