# pdf-extract / test_pdf_endpoint.py
# Last commit 1a755c0 (vasilee): extract text and tables
import asyncio
import aiohttp
import sys
import base64
async def test_pdf_extraction():
    """Post a PDF to both text-extraction endpoints and print the outcome.

    Usage: ``python test_pdf_endpoint.py <pdf_filename> [page_numbers]``.

    The PDF named by ``sys.argv[1]`` is read once and sent twice to the
    server on ``localhost:8000``: as a multipart upload to ``/extract-text``
    and as a base64 JSON payload to ``/extract-text-base64``. The optional
    ``sys.argv[2]`` is forwarded unchanged as ``page_numbers``. Everything is
    reported on stdout; the coroutine returns ``None``.
    """
    # Check if filename is provided as argument
    if len(sys.argv) < 2:
        print("Usage: python test_pdf_endpoint.py <pdf_filename> [page_numbers]")
        return

    pdf_filename = sys.argv[1]
    page_numbers = sys.argv[2] if len(sys.argv) > 2 else None

    # Read the PDF file
    try:
        with open(pdf_filename, 'rb') as f:
            pdf_content = f.read()
    except FileNotFoundError:
        print(f"Error: File '{pdf_filename}' not found.")
        return
    except OSError as e:
        print(f"Error reading file: {e}")
        return

    # One session serves both requests; each request still fails
    # independently inside _post_and_report.
    async with aiohttp.ClientSession() as session:
        # Test regular file upload endpoint
        print("\n--- Testing file upload endpoint ---")
        form = aiohttp.FormData()
        form.add_field(
            'file',
            pdf_content,
            filename=pdf_filename,
            content_type='application/pdf',
        )
        # Passing the value through ``params`` lets aiohttp URL-encode it
        # instead of splicing raw command-line text into the URL.
        query = {"page_numbers": page_numbers} if page_numbers else None
        await _post_and_report(
            session,
            "http://localhost:8000/extract-text",
            params=query,
            data=form,
        )

        # Test base64 endpoint
        print("\n--- Testing base64 endpoint ---")
        payload = {
            "file": base64.b64encode(pdf_content).decode('utf-8'),
            "filename": pdf_filename,
        }
        if page_numbers:
            payload["page_numbers"] = page_numbers
        await _post_and_report(
            session,
            "http://localhost:8000/extract-text-base64",
            json=payload,
        )


async def _post_and_report(session, url, **request_kwargs):
    """POST to *url* with *request_kwargs* and print the status and result."""
    try:
        async with session.post(url, **request_kwargs) as response:
            status = response.status
            try:
                body = await response.json()
            except (aiohttp.ContentTypeError, ValueError):
                # Error pages are frequently HTML or plain text. Fall back
                # to the raw body so the failure is still shown.
                body = await response.text(errors="replace")
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        print(f"Error connecting to server: {e}")
        print("Make sure the FastAPI server is running on port 8000")
        return

    print(f"Status code: {status}")
    _print_extraction_result(status, body)


def _print_extraction_result(status, body):
    """Print a summary of a successful extraction, or the error body."""
    if status != 200:
        print(f"Error: {body}")
        return

    try:
        filename = body['filename']
        text = body['text']
    except (KeyError, TypeError):
        # A 200 reply without the expected keys is a server-side contract
        # problem, not a connection failure, so report it as such.
        print(f"Unexpected response format: {body}")
        return

    print(f"Successfully extracted text from {filename}")
    print(f"Text length: {len(text)} characters")
    # Print first 500 characters of extracted text
    print(f"First 500 characters: {text[:500]}")
if __name__ == "__main__":
    # Script entry point: drive the async test routine to completion on a
    # fresh event loop.
    extraction_run = test_pdf_extraction()
    asyncio.run(extraction_run)