Spaces:

vasilee
/

pdf-extract

Sleeping

App Files Files Community

vasilee committed on Sep 25

Commit

63069dd

1 Parent(s): b68df3b

extract text from pdf

Browse files

Files changed (4) hide show

.gitignore +3 -1
app.py +98 -2
requirements.txt +4 -2
test_pdf_endpoint.py +84 -0

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	- .venv/

+.venv
+__pycache__
+test.pdf

app.py CHANGED Viewed

@@ -1,7 +1,103 @@
-from fastapi import FastAPI
 app = FastAPI()
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}

+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
+import pypdfium2 as pdfium
+import base64
+import re
 app = FastAPI()
+def extract_text_from_pdf(pdf_bytes: bytes) -> str:
+    """
+    Return the text of every page in a PDF, joined with newlines.
+    The bytes are opened with pypdfium2 and the document is always closed,
+    even when reading a page fails.
+    """
+    document = pdfium.PdfDocument(pdf_bytes)
+    try:
+        # One entry per page, in document order.
+        page_texts = [
+            page.get_textpage().get_text_range()
+            for page in document
+        ]
+    finally:
+        document.close()
+    return "\n".join(page_texts)
 @app.get("/")
 def greet_json():
+    """Root endpoint: respond with a fixed greeting payload."""
+    greeting = {"Hello": "World!"}
+    return greeting
+@app.post("/extract-text")
+async def extract_pdf_text(file: UploadFile = File(...)):
+    """
+    Extract text from a PDF uploaded as multipart form data.
+    Returns a JSON object with the upload's ``filename`` and the extracted
+    ``text``. Responds with HTTP 400 when the upload has no filename or the
+    filename does not end in ``.pdf``, and with HTTP 500 when extraction fails.
+    """
+    # UploadFile.filename is typed as optional; fall back to "" so a missing
+    # name is rejected as a non-PDF instead of crashing on None.lower().
+    upload_name = file.filename or ""
+    if not upload_name.lower().endswith('.pdf'):
+        return JSONResponse(
+            status_code=400,
+            content={"error": "Only PDF files are supported"}
+        )
+    # Read the whole upload into memory; the extractor takes raw bytes.
+    content = await file.read()
+    try:
+        extracted_text = extract_text_from_pdf(content)
+    except Exception as e:
+        # Endpoint boundary: report any extraction failure as a JSON error.
+        return JSONResponse(
+            status_code=500,
+            content={"error": f"Failed to extract text: {str(e)}"}
+        )
+    return {
+        "filename": file.filename,
+        "text": extracted_text
+    }
+@app.post("/extract-text-base64")
+async def extract_pdf_text_base64(data: dict):
+    """
+    Extract text from a PDF supplied as a base64 string in a JSON body.
+    The body must contain ``file`` (plain base64, or a
+    ``data:...;base64,...`` URL) and may contain ``filename``, which is echoed
+    back and defaults to ``unknown.pdf``. Malformed requests get HTTP 400;
+    failures while reading the decoded PDF get HTTP 500.
+    """
+    if 'file' not in data:
+        return JSONResponse(
+            status_code=400,
+            content={"error": "Missing 'file' field in request body"}
+        )
+    base64_string = data['file']
+    filename = data.get('filename', 'unknown.pdf')
+    # JSON permits any value type here; reject non-strings up front so the
+    # client gets a 400 rather than an AttributeError reported as a 500.
+    if not isinstance(base64_string, str):
+        return JSONResponse(
+            status_code=400,
+            content={"error": "'file' must be a base64 encoded string"}
+        )
+    # Handle data URL format (e.g., "data:application/pdf;base64,...")
+    if base64_string.startswith('data:'):
+        # partition keeps everything after the marker, including any line
+        # breaks inside the payload.
+        _, marker, payload = base64_string.partition('base64,')
+        if not marker:
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Invalid data URL format"}
+            )
+        base64_string = payload
+    try:
+        pdf_bytes = base64.b64decode(base64_string)
+    except ValueError as e:
+        # binascii.Error is a ValueError subclass, so this covers bad padding
+        # as well as non-ASCII input; both are the client's mistake.
+        return JSONResponse(
+            status_code=400,
+            content={"error": f"Invalid base64 data: {str(e)}"}
+        )
+    try:
+        extracted_text = extract_text_from_pdf(pdf_bytes)
+    except Exception as e:
+        # Endpoint boundary: report any extraction failure as a JSON error.
+        return JSONResponse(
+            status_code=500,
+            content={"error": f"Failed to process base64 PDF: {str(e)}"}
+        )
+    return {
+        "filename": filename,
+        "text": extracted_text
+    }

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
-fastapi
-uvicorn[standard]

+fastapi[all]
+aiohttp
+uvicorn[standard]
+pypdfium2

test_pdf_endpoint.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import asyncio
+import aiohttp
+import sys
+import base64
+async def _report_extraction_response(response):
+    """Print the status code and a short summary of an extraction response."""
+    result = await response.json()
+    print(f"Status code: {response.status}")
+    if response.status == 200:
+        print(f"Successfully extracted text from {result['filename']}")
+        print(f"Text length: {len(result['text'])} characters")
+        # Print first 500 characters of extracted text
+        print(f"First 500 characters: {result['text'][:500]}")
+    else:
+        print(f"Error: {result}")
+async def test_pdf_extraction():
+    """
+    Manually exercise both extraction endpoints of a server on localhost:8000.
+    Reads the PDF named by the first command-line argument, posts it once as a
+    multipart upload and once as a base64 JSON payload, and prints a summary
+    of each response. Problems are printed rather than raised.
+    """
+    # Check if filename is provided as argument
+    if len(sys.argv) < 2:
+        print("Usage: python test_pdf_endpoint.py <pdf_filename>")
+        return
+    pdf_filename = sys.argv[1]
+    # Read the PDF file
+    try:
+        with open(pdf_filename, 'rb') as f:
+            pdf_content = f.read()
+    except FileNotFoundError:
+        print(f"Error: File '{pdf_filename}' not found.")
+        return
+    except Exception as e:
+        print(f"Error reading file: {e}")
+        return
+    # Test regular file upload endpoint
+    print("\n--- Testing file upload endpoint ---")
+    url = "http://localhost:8000/extract-text"
+    try:
+        async with aiohttp.ClientSession() as session:
+            data = aiohttp.FormData()
+            data.add_field('file', pdf_content, filename=pdf_filename, content_type='application/pdf')
+            async with session.post(url, data=data) as response:
+                await _report_extraction_response(response)
+    except Exception as e:
+        print(f"Error connecting to server: {e}")
+        print("Make sure the FastAPI server is running on port 8000")
+    # Test base64 endpoint
+    print("\n--- Testing base64 endpoint ---")
+    base64_url = "http://localhost:8000/extract-text-base64"
+    try:
+        # Encode the PDF content to base64
+        base64_string = base64.b64encode(pdf_content).decode('utf-8')
+        # Create JSON payload
+        payload = {
+            "file": base64_string,
+            "filename": pdf_filename
+        }
+        async with aiohttp.ClientSession() as session:
+            async with session.post(base64_url, json=payload) as response:
+                await _report_extraction_response(response)
+    except Exception as e:
+        print(f"Error connecting to server: {e}")
+        print("Make sure the FastAPI server is running on port 8000")
+if __name__ == "__main__":
+    asyncio.run(test_pdf_extraction())