Spaces:
Runtime error
Runtime error
Added Llama Module
#2
by
chandanzeon
- opened
app.py
CHANGED
|
@@ -19,12 +19,17 @@ if uploaded_file:
|
|
| 19 |
|
| 20 |
try:
|
| 21 |
with st.spinner("Processing document..."):
|
| 22 |
-
docspaddle, docsdocling = process_docs(doc_path)
|
| 23 |
if os.path.exists("./Tested_Docs"):
|
| 24 |
shutil.rmtree('./Tested_Docs')
|
| 25 |
except Exception as e:
|
| 26 |
st.warning(e)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
st.markdown("### Extracted Text by Docling-OCR :")
|
| 29 |
for page_number, txt in docsdocling.items():
|
| 30 |
st.markdown(f"#### Page {page_number}")
|
|
@@ -33,4 +38,4 @@ if uploaded_file:
|
|
| 33 |
st.markdown("### Extracted Text by Paddle-OCR :")
|
| 34 |
for page_number, txt in enumerate(docspaddle):
|
| 35 |
st.markdown(f"#### Page {page_number+1}")
|
| 36 |
-
st.text(txt)
|
|
|
|
| 19 |
|
| 20 |
try:
|
| 21 |
with st.spinner("Processing document..."):
|
| 22 |
+
docsllama, docspaddle, docsdocling = process_docs(doc_path)
|
| 23 |
if os.path.exists("./Tested_Docs"):
|
| 24 |
shutil.rmtree('./Tested_Docs')
|
| 25 |
except Exception as e:
|
| 26 |
st.warning(e)
|
| 27 |
|
| 28 |
+
st.markdown("### Extracted Text by Llama-Parser :")
|
| 29 |
+
for page_number, txt in enumerate(docsllama):
|
| 30 |
+
st.markdown(f"#### Page {page_number+1}")
|
| 31 |
+
st.text(txt)
|
| 32 |
+
|
| 33 |
st.markdown("### Extracted Text by Docling-OCR :")
|
| 34 |
for page_number, txt in docsdocling.items():
|
| 35 |
st.markdown(f"#### Page {page_number}")
|
|
|
|
| 38 |
st.markdown("### Extracted Text by Paddle-OCR :")
|
| 39 |
for page_number, txt in enumerate(docspaddle):
|
| 40 |
st.markdown(f"#### Page {page_number+1}")
|
| 41 |
+
st.text(txt)
|
helper.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
from docling.document_converter import DocumentConverter
|
| 2 |
from paddleocr import PaddleOCR
|
|
|
|
| 3 |
from pdf2image import convert_from_path
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def process_text(res):
|
| 7 |
page_texts = {}
|
|
@@ -73,6 +76,17 @@ def process_docs(doc_path):
|
|
| 73 |
query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
|
| 74 |
"""
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
## Paddle OCR
|
| 77 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
|
| 78 |
images_pdf = convert_from_path(doc_path, 300)
|
|
@@ -90,4 +104,4 @@ def process_docs(doc_path):
|
|
| 90 |
docs3 = process_text(res)
|
| 91 |
docs3 = process_tables(res,docs3)
|
| 92 |
|
| 93 |
-
return docs2,docs3
|
|
|
|
| 1 |
from docling.document_converter import DocumentConverter
|
| 2 |
from paddleocr import PaddleOCR
|
| 3 |
+
from llama_parse import LlamaParse
|
| 4 |
from pdf2image import convert_from_path
|
| 5 |
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
llama_key = os.getenv('LLAMA_INDEX_API_KEY')
|
| 8 |
|
| 9 |
def process_text(res):
|
| 10 |
page_texts = {}
|
|
|
|
| 76 |
query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
|
| 77 |
"""
|
| 78 |
|
| 79 |
+
## LLama Parser
|
| 80 |
+
parser = LlamaParse(
|
| 81 |
+
api_key=llama_key,
|
| 82 |
+
result_type='markdown',
|
| 83 |
+
verbose=True,
|
| 84 |
+
language='en',
|
| 85 |
+
num_workers=2
|
| 86 |
+
)
|
| 87 |
+
documents = parser.load_data(doc_path)
|
| 88 |
+
docs = [doc.text for doc in documents]
|
| 89 |
+
|
| 90 |
## Paddle OCR
|
| 91 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
|
| 92 |
images_pdf = convert_from_path(doc_path, 300)
|
|
|
|
| 104 |
docs3 = process_text(res)
|
| 105 |
docs3 = process_tables(res,docs3)
|
| 106 |
|
| 107 |
+
return docs, docs2, docs3
|