import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()


async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper cloud function and return its JSON response."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status != 200:
            # Treat any non-200 response as an empty scrape result for this chunk.
            return {}, pdf_name
        response = await resp.json()

    return response, pdf_name


async def execute_pdfscraper_async(chunks_dir: str, processTables: str):
    """Scrape every chunk in chunks_dir concurrently, returning responses in page order."""
    # Sort chunk files numerically (<stem>_1.pdf, <stem>_2.pdf, ...) so the scraped
    # content is reassembled in the original page order; os.listdir gives no ordering.
    chunk_list = sorted(
        os.listdir(chunks_dir),
        key=lambda name: int(Path(name).stem.rsplit("_", 1)[-1]),
    )
    chunk_byte_list = [
        (Path(chunks_dir, file).read_bytes(), file) for file in chunk_list
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, file_bytes, file_name, processTables)
            for file_bytes, file_name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)

    # call_pdfscraper returns (response, pdf_name); keep only the response bodies.
    return [response for response, _ in responses]


def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one text corpus and a numbered dict of tables."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})

        content_list.append(content)
        try:
            # Re-key tables with a running counter so keys stay unique across chunks.
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw was not a dict (e.g. missing or null); skip tables for this chunk.
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict


def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split the uploaded PDF into chunks of pages_per_chunk pages.

    Returns the path of a uniquely named directory containing the chunk files.
    """
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    # Write chunks into a unique directory so concurrent requests do not collide.
    unique_dir = str(uuid.uuid4())
    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
    os.makedirs(output_dir, exist_ok=True)

    # Ceiling division: the last chunk may hold fewer than pages_per_chunk pages.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)


@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    """Split the uploaded PDF, scrape the chunks concurrently, and return text plus tables."""
    file_contents = await pdf_file.read()

    # Split the PDF into small chunks so they can be scraped in parallel.
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)

    scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
    content, tables_dict = collect_pdfscraper_response(scrape_response_list)

    # Clean up the temporary chunk directory; a failed delete should not fail the request.
    if os.path.exists(chunks_dir):
        try:
            shutil.rmtree(chunks_dir)
        except Exception as e:
            logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": tables_dict})
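

# A minimal sketch for running the service locally and calling the endpoint.
# It assumes the `uvicorn` and `requests` packages are installed and that this
# module is importable as `main`; the file name "report.pdf" is only an example.
#
# Example client call (run in a separate process once the server is up):
#
#     import requests
#
#     with open("report.pdf", "rb") as f:
#         resp = requests.post(
#             "http://localhost:8000/process-pdf/",
#             files={"pdf_file": ("report.pdf", f, "application/pdf")},
#             data={"pages_per_chunk": 2, "processTables": "True"},
#         )
#     print(resp.json()["content"])

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)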