Spaces:

heerk30
/

docker-testing

Running

App Files Files Community

docker-testing / router.py

heerk30

Upload 5 files

cac5839 verified about 1 month ago

raw

history blame contribute delete

2.08 kB

	from fastapi import APIRouter, UploadFile, File, Query
	import os
	from PIL import Image
	import pytesseract
	from pdf2image import convert_from_path
	from app.services.vector_store import add_document, query_documents
	import uuid

	router = APIRouter()

	UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'data', 'uploads')
	os.makedirs(UPLOAD_DIR, exist_ok=True)

	@router.get("/ping")
	def ping():
	return {"message": "pong"}

	@router.post("/upload-document")
	def upload_document(file: UploadFile = File(...)):
	file_location = os.path.join(UPLOAD_DIR, file.filename)
	with open(file_location, "wb") as f:
	f.write(file.file.read())

	extracted_text = ""
	if file.filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
	# OCR for images
	image = Image.open(file_location)
	extracted_text = pytesseract.image_to_string(image)
	elif file.filename.lower().endswith('.pdf'):
	# OCR for PDFs
	try:
	pages = convert_from_path(file_location)
	text_list = [pytesseract.image_to_string(page) for page in pages]
	extracted_text = "\n".join(text_list)
	except Exception as e:
	extracted_text = f"Error processing PDF: {str(e)}"
	else:
	extracted_text = "OCR not supported for this file type."

	# Save extracted text
	text_filename = file.filename + ".txt"
	text_path = os.path.join(UPLOAD_DIR, text_filename)
	with open(text_path, "w", encoding="utf-8") as tf:
	tf.write(extracted_text)

	# Store in vector DB
	doc_id = str(uuid.uuid4())
	add_document(doc_id, extracted_text, metadata={"filename": file.filename, "path": file_location})

	return {"filename": file.filename, "saved_to": file_location, "ocr_text_file": text_path, "vector_id": doc_id}

	@router.get("/search")
	def search_documents(query: str = Query(..., description="Search query"), n_results: int = 3):
	results = query_documents(query, n_results)
	return results