Spaces:
Running
Running
import gradio as gr | |
import pytesseract | |
from PIL import Image | |
from pdf2image import convert_from_bytes | |
from vector_store import add_document, query_documents | |
import io | |
def ocr_file(file): | |
# If file is None, return empty string | |
if file is None: | |
return "" | |
# If file has a 'read' attribute, it's a file-like object | |
if hasattr(file, "read"): | |
if file.name.lower().endswith('.pdf'): | |
file_bytes = file.read() | |
pages = convert_from_bytes(file_bytes) | |
text = "\n".join([pytesseract.image_to_string(page) for page in pages]) | |
else: | |
image = Image.open(file) | |
text = pytesseract.image_to_string(image) | |
add_document(getattr(file, "name", "uploaded_file"), text, metadata={"source": getattr(file, "name", "uploaded_file")}) | |
return text | |
# If file is a string (path), open it | |
elif isinstance(file, str): | |
if file.lower().endswith('.pdf'): | |
with open(file, "rb") as f: | |
pages = convert_from_bytes(f.read()) | |
text = "\n".join([pytesseract.image_to_string(page) for page in pages]) | |
else: | |
image = Image.open(file) | |
text = pytesseract.image_to_string(image) | |
add_document(file, text, metadata={"source": file}) | |
return text | |
else: | |
return "Unsupported file type." | |
def semantic_search(query): | |
results = query_documents(query) | |
return str(results) | |
with gr.Blocks() as demo: | |
gr.Markdown("# Document OCR & Semantic Search") | |
with gr.Tab("Upload & OCR"): | |
file_input = gr.File(label="Upload PDF or Image") | |
ocr_output = gr.Textbox(label="OCR Result") | |
file_input.change(ocr_file, inputs=file_input, outputs=ocr_output) | |
with gr.Tab("Semantic Search"): | |
query_input = gr.Textbox(label="Search Query") | |
search_output = gr.Textbox(label="Search Results") | |
query_input.change(semantic_search, inputs=query_input, outputs=search_output) | |
demo.launch(server_name="0.0.0.0", server_port=7860) | |