Update main.py
Browse files
main.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import io
|
2 |
import asyncio
|
3 |
import os
|
|
|
|
|
4 |
from pathlib import Path
|
5 |
import aiohttp
|
6 |
from PyPDF2 import PdfReader, PdfWriter
|
@@ -31,7 +33,6 @@ async def call_pdfscraper(session, file_contents, pdf_name, processTables):
|
|
31 |
|
32 |
return response, pdf_name
|
33 |
|
34 |
-
|
35 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
36 |
chunk_list = os.listdir(file_path)
|
37 |
chunk_byte_list = [
|
@@ -49,7 +50,6 @@ async def execute_pdfscraper_async(file_path: str, processTables: str):
|
|
49 |
|
50 |
return response_list
|
51 |
|
52 |
-
|
53 |
def collect_pdfscraper_response(scrape_response_list):
|
54 |
content_list = []
|
55 |
tables_dict = {}
|
@@ -71,12 +71,14 @@ def collect_pdfscraper_response(scrape_response_list):
|
|
71 |
|
72 |
return content_str, tables_dict
|
73 |
|
74 |
-
|
75 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
76 |
file_bytes = io.BytesIO(file_contents)
|
77 |
reader = PdfReader(file_bytes)
|
78 |
total_pages = len(reader.pages)
|
79 |
-
|
|
|
|
|
|
|
80 |
os.makedirs(output_dir, exist_ok=True)
|
81 |
|
82 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
@@ -96,21 +98,32 @@ def split_pdf(file_contents, file_name, pages_per_chunk):
|
|
96 |
|
97 |
return str(output_dir)
|
98 |
|
99 |
-
|
100 |
@app.post("/process-pdf/")
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
    """Split an uploaded PDF into chunks, scrape each chunk, and return the merged result.

    Args:
        pdf_file: The uploaded PDF; its bytes and filename are handed to ``split_pdf``.
        pages_per_chunk: Number of pages per chunk (form field, defaults to 2).
        processTables: Forwarded unchanged to ``execute_pdfscraper_async``
            (form field, defaults to the string "True").

    Returns:
        A JSONResponse with the keys "content" and "tables", filled from the
        two values returned by ``collect_pdfscraper_response``.
    """
    raw_pdf = await pdf_file.read()

    # split_pdf returns the directory (as a string) holding the chunk files.
    chunk_folder = split_pdf(raw_pdf, pdf_file.filename, pages_per_chunk)

    scraped = await execute_pdfscraper_async(chunk_folder, processTables)
    merged_text, merged_tables = collect_pdfscraper_response(scraped)

    payload = {"content": merged_text, "tables": merged_tables}
    return JSONResponse(content=payload)
|
111 |
|
112 |
-
|
113 |
-
# Starting point for running the FastAPI app
|
114 |
# if __name__ == "__main__":
|
115 |
# import uvicorn
|
116 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
1 |
import io
|
2 |
import asyncio
|
3 |
import os
|
4 |
+
import uuid
|
5 |
+
import logging
|
6 |
from pathlib import Path
|
7 |
import aiohttp
|
8 |
from PyPDF2 import PdfReader, PdfWriter
|
|
|
33 |
|
34 |
return response, pdf_name
|
35 |
|
|
|
36 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
37 |
chunk_list = os.listdir(file_path)
|
38 |
chunk_byte_list = [
|
|
|
50 |
|
51 |
return response_list
|
52 |
|
|
|
53 |
def collect_pdfscraper_response(scrape_response_list):
|
54 |
content_list = []
|
55 |
tables_dict = {}
|
|
|
71 |
|
72 |
return content_str, tables_dict
|
73 |
|
|
|
74 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
75 |
file_bytes = io.BytesIO(file_contents)
|
76 |
reader = PdfReader(file_bytes)
|
77 |
total_pages = len(reader.pages)
|
78 |
+
|
79 |
+
# Generate a unique directory for each request to avoid conflicts
|
80 |
+
unique_dir = str(uuid.uuid4())
|
81 |
+
output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
|
82 |
os.makedirs(output_dir, exist_ok=True)
|
83 |
|
84 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
|
|
98 |
|
99 |
return str(output_dir)
|
100 |
|
|
|
101 |
@app.post("/process-pdf/")
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
    """Split an uploaded PDF into chunks, scrape each chunk, and return the merged result.

    The per-request chunk directory created by ``split_pdf`` is always removed
    afterwards, including when scraping or result collection fails.

    Args:
        pdf_file: The uploaded PDF; its bytes and base filename are handed to
            ``split_pdf``.
        pages_per_chunk: Number of pages per chunk (form field, defaults to 2).
        processTables: Forwarded unchanged to ``execute_pdfscraper_async``
            (form field, defaults to the string "True").

    Returns:
        A JSONResponse with the keys "content" and "tables", filled from the
        two values returned by ``collect_pdfscraper_response``.
    """
    # Local import: shutil is not among the imports visible at the top of the
    # file, so bring it into scope here to keep the cleanup from raising NameError.
    import shutil

    # Read the PDF file
    file_contents = await pdf_file.read()

    # The filename is client-supplied and split_pdf derives its output
    # directory from the name's parent, so drop any directory components to
    # keep the chunks under the working directory.
    safe_name = Path(pdf_file.filename).name

    # Split the PDF into chunks
    chunks_dir = split_pdf(file_contents, safe_name, pages_per_chunk)

    try:
        # Asynchronously process the PDF chunks
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)

        # Collect the results
        content, table_string = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Best-effort cleanup: a failure to delete must not mask the response
        # (or the original exception), so it is only logged.
        try:
            shutil.rmtree(chunks_dir)
        except FileNotFoundError:
            # Directory is already gone; nothing to clean up.
            pass
        except OSError as e:
            logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": table_string})
|
124 |
|
125 |
+
# If you want to run this locally, uncomment the lines below.
|
|
|
126 |
# if __name__ == "__main__":
|
127 |
# import uvicorn
|
128 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
129 |
+
# Alternatively, start the server from a shell with: uvicorn main:app --workers 2
|