Spaces:

heerk30
/

docker-testing

Running

App Files Files Community

docker-testing / app.py

heerk30

Update app.py

c9cf145 verified about 1 month ago

raw

history blame contribute delete

2.04 kB

	import gradio as gr
	import pytesseract
	from PIL import Image
	from pdf2image import convert_from_bytes
	from vector_store import add_document, query_documents
	import io

	def ocr_file(file):
	    """Run OCR on an uploaded PDF or image and index the extracted text.

	    PDFs are rasterised page by page and the per-page text is joined with
	    newlines; anything else is opened as a single image. The resulting text
	    is passed to ``add_document`` before being returned.

	    Args:
	        file: ``None``, a file-like object (anything exposing ``read``), or
	            a filesystem path given as ``str``.

	    Returns:
	        The recognised text, ``""`` when ``file`` is ``None``, or the literal
	        message ``"Unsupported file type."`` for any other kind of input.
	    """
	    if file is None:
	        return ""

	    if hasattr(file, "read"):
	        # In-memory streams (e.g. io.BytesIO) have no ``name``; fall back to
	        # the same placeholder that is used as the document id so that the
	        # extension check cannot raise AttributeError.
	        source = getattr(file, "name", "uploaded_file")
	        if str(source).lower().endswith(".pdf"):
	            text = _ocr_pdf_bytes(file.read())
	        else:
	            text = _ocr_image(file)
	    elif isinstance(file, str):
	        source = file
	        if file.lower().endswith(".pdf"):
	            with open(file, "rb") as f:
	                text = _ocr_pdf_bytes(f.read())
	        else:
	            text = _ocr_image(file)
	    else:
	        return "Unsupported file type."

	    add_document(source, text, metadata={"source": source})
	    return text


	def _ocr_pdf_bytes(pdf_bytes):
	    """Return the OCR text of every page of a PDF given as raw bytes."""
	    pages = convert_from_bytes(pdf_bytes)
	    return "\n".join(pytesseract.image_to_string(page) for page in pages)


	def _ocr_image(image_source):
	    """Return the OCR text of one image given as a path or file-like object."""
	    # The context manager releases the file handle Pillow opens for a path;
	    # a stream supplied by the caller is left open for the caller to close.
	    with Image.open(image_source) as image:
	        return pytesseract.image_to_string(image)

	def semantic_search(query):
	    """Query the document store and return the matches as a string.

	    Whatever ``query_documents`` returns for ``query`` is converted with
	    ``str`` and handed back unchanged otherwise.
	    """
	    return str(query_documents(query))

	# Two-tab interface: one tab uploads a file and shows its OCR text, the
	# other sends a query to the document store and shows the results.
	with gr.Blocks() as demo:
	    gr.Markdown("# Document OCR & Semantic Search")

	    with gr.Tab("Upload & OCR"):
	        file_input = gr.File(label="Upload PDF or Image")
	        ocr_output = gr.Textbox(label="OCR Result")
	        # Run OCR each time the file component's value changes.
	        file_input.change(
	            fn=ocr_file,
	            inputs=file_input,
	            outputs=ocr_output,
	        )

	    with gr.Tab("Semantic Search"):
	        query_input = gr.Textbox(label="Search Query")
	        search_output = gr.Textbox(label="Search Results")
	        # Run the search each time the query text changes.
	        query_input.change(
	            fn=semantic_search,
	            inputs=query_input,
	            outputs=search_output,
	        )

	# Serve on every network interface, port 7860.
	demo.launch(
	    server_name="0.0.0.0",
	    server_port=7860,
	)