vasilee committed on
Commit
63069dd
·
1 Parent(s): b68df3b

extract text from pdf

Browse files
Files changed (4) hide show
  1. .gitignore +3 -1
  2. app.py +98 -2
  3. requirements.txt +4 -2
  4. test_pdf_endpoint.py +84 -0
.gitignore CHANGED
@@ -1 +1,3 @@
1
- .venv/
 
 
 
1
+ .venv
2
+ __pycache__
3
+ test.pdf
app.py CHANGED
@@ -1,7 +1,103 @@
1
- from fastapi import FastAPI
 
 
 
 
2
 
3
  app = FastAPI()
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  @app.get("/")
6
  def greet_json():
7
- return {"Hello": "World!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import JSONResponse
3
+ import pypdfium2 as pdfium
4
+ import base64
5
+ import re
6
 
7
  app = FastAPI()
8
 
9
def _extract_page_text(page) -> str:
    """Return the full text of one pypdfium2 page, closing its text page."""
    textpage = page.get_textpage()
    try:
        return textpage.get_text_range()
    finally:
        # Release the native text-page handle before the page itself is
        # closed by the caller (children are closed before their parent).
        textpage.close()


def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract the text of every page of a PDF using pypdfium2.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages in document order, joined with a single
        newline between pages. A document without pages yields "".

    Raises:
        Any exception raised by pypdfium2 while opening or reading the
        document is propagated unchanged; callers decide how to report it.
    """
    pdf_file = pdfium.PdfDocument(pdf_bytes)
    text_parts = []

    try:
        for page in pdf_file:
            try:
                text_parts.append(_extract_page_text(page))
            finally:
                # Close each page as soon as it has been read so native
                # handles do not accumulate while walking a long document.
                page.close()
    finally:
        pdf_file.close()

    return "\n".join(text_parts)
25
+
26
@app.get("/")
def greet_json():
    """Serve the fixed greeting payload at the application root."""
    greeting = {"Hello": "World!"}
    return greeting
29
+
30
@app.post("/extract-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Extract text from a PDF uploaded as multipart form data.

    Args:
        file: The uploaded file; its name must end in ".pdf"
            (case-insensitive).

    Returns:
        On success, a dict with the uploaded "filename" and the extracted
        "text". A 400 JSON error when the upload is not named like a PDF,
        and a 500 JSON error when text extraction fails.
    """
    # ``UploadFile.filename`` is typed as optional; treat a missing name as
    # "not a PDF" rather than crashing on ``None.lower()``.
    upload_name = file.filename or ""
    if not upload_name.lower().endswith('.pdf'):
        return JSONResponse(
            status_code=400,
            content={"error": "Only PDF files are supported"}
        )

    # Read the whole upload into memory; the extractor works on bytes.
    content = await file.read()

    try:
        extracted_text = extract_text_from_pdf(content)
    except Exception as e:
        # Endpoint boundary: report any extraction failure as a JSON error
        # instead of letting it escape as an unformatted server error.
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to extract text: {str(e)}"}
        )

    return {
        "filename": file.filename,
        "text": extracted_text
    }
58
+
59
@app.post("/extract-text-base64")
async def extract_pdf_text_base64(data: dict):
    """Extract text from a PDF supplied as a base64 string in a JSON body.

    Args:
        data: The JSON request body. Must contain "file", either a plain
            base64 string or a data URL such as
            "data:application/pdf;base64,...". May contain "filename",
            which is echoed back (defaults to "unknown.pdf").

    Returns:
        On success, a dict with "filename" and the extracted "text".
        A 400 JSON error when "file" is missing, is not a string, is a
        data URL without a base64 section, or is not decodable base64.
        A 500 JSON error when the decoded bytes cannot be processed.
    """
    if 'file' not in data:
        return JSONResponse(
            status_code=400,
            content={"error": "Missing 'file' field in request body"}
        )

    base64_string = data['file']
    if not isinstance(base64_string, str):
        # A number, null, list, etc. is a malformed request, not a server
        # fault, so reject it before calling any string methods on it.
        return JSONResponse(
            status_code=400,
            content={"error": "'file' must be a base64 encoded string"}
        )

    filename = data.get('filename', 'unknown.pdf')

    # Handle data URL format (e.g., "data:application/pdf;base64,...").
    if base64_string.startswith('data:'):
        # partition keeps everything after the marker, including any line
        # breaks inside a wrapped payload.
        _, marker, payload = base64_string.partition('base64,')
        if not marker:
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid data URL format"}
            )
        base64_string = payload

    try:
        pdf_bytes = base64.b64decode(base64_string)
    except ValueError as e:
        # binascii.Error (bad padding/length) is a ValueError subclass, and
        # non-ASCII input raises ValueError directly; both are client errors.
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid base64 data: {e}"}
        )

    try:
        extracted_text = extract_text_from_pdf(pdf_bytes)
    except Exception as e:
        # Endpoint boundary: report any extraction failure as a JSON error.
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to process base64 PDF: {str(e)}"}
        )

    return {
        "filename": filename,
        "text": extracted_text
    }
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
- fastapi
2
- uvicorn[standard]
 
 
 
1
+ fastapi[all]
2
+ aiohttp
3
+ uvicorn[standard]
4
+ pypdfium2
test_pdf_endpoint.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import sys
4
+ import base64
5
+
6
async def _post_and_report(url, **post_kwargs):
    """POST to *url* and print the status code plus a summary of the reply.

    ``post_kwargs`` are forwarded unchanged to ``session.post`` (for example
    ``data=...`` for a form upload or ``json=...`` for a JSON body). Nothing
    is returned; all outcomes are reported on stdout.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, **post_kwargs) as response:
                print(f"Status code: {response.status}")
                try:
                    result = await response.json()
                except (aiohttp.ContentTypeError, ValueError):
                    # The server did answer, just not with JSON; show the
                    # raw body instead of blaming the connection.
                    print(f"Error: {await response.text()}")
                    return

                if response.status == 200:
                    print(f"Successfully extracted text from {result['filename']}")
                    print(f"Text length: {len(result['text'])} characters")
                    # Print first 500 characters of extracted text
                    print(f"First 500 characters: {result['text'][:500]}")
                else:
                    print(f"Error: {result}")
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        print(f"Error connecting to server: {e}")
        print("Make sure the FastAPI server is running on port 8000")
    except Exception as e:
        # Keep the script best-effort: report and continue with the next
        # check rather than aborting on an unexpected response shape.
        print(f"Unexpected error while testing {url}: {e}")


async def test_pdf_extraction():
    """Send the PDF named on the command line to both extraction endpoints.

    Reads ``sys.argv[1]`` as the path of a PDF file, then posts it to the
    multipart upload endpoint and, base64-encoded, to the JSON endpoint of a
    server expected at http://localhost:8000, printing each outcome.
    """
    # Check if filename is provided as argument
    if len(sys.argv) < 2:
        print("Usage: python test_pdf_endpoint.py <pdf_filename>")
        return

    pdf_filename = sys.argv[1]

    # Read the PDF file
    try:
        with open(pdf_filename, 'rb') as f:
            pdf_content = f.read()
    except FileNotFoundError:
        print(f"Error: File '{pdf_filename}' not found.")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # Test regular file upload endpoint
    print("\n--- Testing file upload endpoint ---")
    form = aiohttp.FormData()
    form.add_field(
        'file',
        pdf_content,
        filename=pdf_filename,
        content_type='application/pdf',
    )
    await _post_and_report("http://localhost:8000/extract-text", data=form)

    # Test base64 endpoint
    print("\n--- Testing base64 endpoint ---")
    payload = {
        "file": base64.b64encode(pdf_content).decode('utf-8'),
        "filename": pdf_filename
    }
    await _post_and_report(
        "http://localhost:8000/extract-text-base64",
        json=payload,
    )


if __name__ == "__main__":
    asyncio.run(test_pdf_extraction())