Arafath10 committed
Commit da2debc · verified · 1 Parent(s): 1d9b15e

Update main.py

Files changed (1)
  1. main.py +21 -8
main.py CHANGED
@@ -1,6 +1,8 @@
 import io
 import asyncio
 import os
+import uuid
+import logging
 from pathlib import Path
 import aiohttp
 from PyPDF2 import PdfReader, PdfWriter
@@ -31,7 +33,6 @@ async def call_pdfscraper(session, file_contents, pdf_name, processTables):
 
     return response, pdf_name
 
-
 async def execute_pdfscraper_async(file_path: str, processTables: str):
     chunk_list = os.listdir(file_path)
     chunk_byte_list = [
@@ -49,7 +50,6 @@ async def execute_pdfscraper_async(file_path: str, processTables: str):
 
     return response_list
 
-
 def collect_pdfscraper_response(scrape_response_list):
     content_list = []
     tables_dict = {}
@@ -71,12 +71,14 @@ def collect_pdfscraper_response(scrape_response_list):
 
     return content_str, tables_dict
 
-
 def split_pdf(file_contents, file_name, pages_per_chunk):
     file_bytes = io.BytesIO(file_contents)
     reader = PdfReader(file_bytes)
     total_pages = len(reader.pages)
-    output_dir = Path(file_name).parent / "chunks"
+
+    # Generate a unique directory for each request to avoid conflicts
+    unique_dir = str(uuid.uuid4())
+    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
     os.makedirs(output_dir, exist_ok=True)
 
     num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
@@ -96,21 +98,32 @@ def split_pdf(file_contents, file_name, pages_per_chunk):
 
     return str(output_dir)
 
-
 @app.post("/process-pdf/")
 async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
+    # Read the PDF file
     file_contents = await pdf_file.read()
 
+    # Split the PDF into chunks
     chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
+
+    # Asynchronously process the PDF chunks
     scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
+
+    # Collect the results
     content, table_string = collect_pdfscraper_response(scrape_response_list)
 
-    shutil.rmtree(chunks_dir)  # Clean up chunks after processing
+    # Ensure the directory exists before attempting to delete it
+    if os.path.exists(chunks_dir):
+        try:
+            shutil.rmtree(chunks_dir)  # Clean up chunks after processing
+        except Exception as e:
+            # Log any errors during cleanup
+            logging.error(f"Error deleting directory {chunks_dir}: {e}")
 
     return JSONResponse(content={"content": content, "tables": table_string})
 
-
-# Starting point for running the FastAPI app
+# If you want to run this locally, uncomment the lines below.
 # if __name__ == "__main__":
 #     import uvicorn
 #     uvicorn.run(app, host="0.0.0.0", port=8000)
+# uvicorn main:app --workers 2
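
The key change here is concurrency safety. The new trailing comment suggests serving the app with uvicorn main:app --workers 2, and with several workers two requests could previously race on the shared chunks/ directory, one request's cleanup deleting another request's chunk files mid-flight. The uuid-suffixed directory gives every request its own workspace. The sketch below restates that pattern in isolation; run_in_unique_dir and process_chunks are hypothetical names for illustration, not functions from main.py.

# Minimal sketch of the commit's pattern: one uuid-named directory per
# request, removed when the work is done. process_chunks is a hypothetical
# stand-in for the split/scrape steps in main.py.
import logging
import shutil
import uuid
from pathlib import Path

def run_in_unique_dir(base: Path) -> None:
    # A fresh directory per request, so concurrent workers never collide
    # even when two uploads share the same filename.
    chunk_dir = base / f"chunks_{uuid.uuid4()}"
    chunk_dir.mkdir(parents=True, exist_ok=True)
    try:
        process_chunks(chunk_dir)
    finally:
        # Guarded cleanup, as in the commit; running it in `finally`
        # additionally removes the directory if processing raises.
        if chunk_dir.exists():
            try:
                shutil.rmtree(chunk_dir)
            except OSError as e:
                logging.error(f"Error deleting directory {chunk_dir}: {e}")

def process_chunks(chunk_dir: Path) -> None:
    # Placeholder so the sketch runs as-is.
    (chunk_dir / "chunk_0.pdf").write_bytes(b"%PDF-1.4")

if __name__ == "__main__":
    run_in_unique_dir(Path("."))

One difference worth noting: the endpoint in this commit runs shutil.rmtree inline rather than in a finally block, so a chunks_<uuid> directory is left behind whenever split_pdf or the scraper raises; moving the cleanup into finally (or using tempfile.TemporaryDirectory) would close that leak. Clients are unaffected either way: they still POST multipart form data to /process-pdf/ with the file and the optional pages_per_chunk and processTables fields.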