Spaces:
Running
Running
Update document_generator_v2.py
Browse files- document_generator_v2.py +40 -0
document_generator_v2.py
CHANGED
@@ -172,6 +172,7 @@ import psycopg2
|
|
172 |
from datetime import datetime
|
173 |
import base64
|
174 |
from fastapi import Form
|
|
|
175 |
|
176 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
177 |
logger = logging.getLogger(__name__)
|
@@ -448,6 +449,45 @@ class MarkdownConverter:
|
|
448 |
markdown += "</div>"
|
449 |
return markdown
|
450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
router = APIRouter()
|
452 |
|
453 |
class JsonDocumentResponse(BaseModel):
|
|
|
172 |
from datetime import datetime
|
173 |
import base64
|
174 |
from fastapi import Form
|
175 |
+
from llama_parse import LlamaParse
|
176 |
|
177 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
178 |
logger = logging.getLogger(__name__)
|
|
|
449 |
markdown += "</div>"
|
450 |
return markdown
|
451 |
|
452 |
+
async def load_documents(documents: List[UploadFile]) -> List[str]:
    """
    Load and parse uploaded documents using LlamaParse.

    Each upload is written into a private temporary directory, the saved
    paths are handed to LlamaParse, and the ``text`` of every parsed
    document is returned. The temporary directory (and everything in it)
    is removed when the function exits, whether or not parsing succeeded.

    Args:
        documents (List[UploadFile]): List of uploaded document files.

    Returns:
        List[str]: The ``text`` of each document returned by the parser.
            An empty input list yields an empty list without contacting
            the parser.
    """
    # Function-scope import so this block does not depend on edits to the
    # module's import section.
    import tempfile

    if not documents:
        return []

    parser = LlamaParse(
        api_key=os.getenv("LLAMA_PARSE_API_KEY"),
        result_type="markdown",
        num_workers=4,
        verbose=True,
        language="en",
    )

    # A per-call private directory avoids collisions between concurrent
    # requests in the shared /tmp, and its context manager cleans up even
    # when reading, writing, or parsing fails part-way through.
    with tempfile.TemporaryDirectory(prefix="llama_parse_") as temp_dir:
        temp_files = []
        for index, doc in enumerate(documents):
            # The filename is client-controlled: keep only its final path
            # component so values such as "../../etc/passwd" cannot escape
            # the temporary directory. Backslashes are normalised first so
            # Windows-style paths are reduced to their last component too.
            raw_name = (doc.filename or "").replace("\\", "/")
            safe_name = os.path.basename(raw_name)
            if safe_name in ("", ".", ".."):
                safe_name = "upload"

            # One sub-directory per upload keeps the original file name
            # (including its extension) while still allowing two uploads
            # with the same name in a single batch.
            doc_dir = os.path.join(temp_dir, str(index))
            os.mkdir(doc_dir)
            temp_file_path = os.path.join(doc_dir, safe_name)

            content = await doc.read()
            with open(temp_file_path, "wb") as buffer:
                buffer.write(content)
            temp_files.append(temp_file_path)

        # Use LlamaParse to extract content
        parsed_documents = await parser.aload_data(temp_files)
        return [parsed.text for parsed in parsed_documents]
|
488 |
+
|
489 |
+
|
490 |
+
|
491 |
router = APIRouter()
|
492 |
|
493 |
class JsonDocumentResponse(BaseModel):
|