# Spaces: Sleeping  (hosting-page status banner apparently captured along with the source; not program code)
import os
import shutil

import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
# Maximum number of characters of extracted text shown in the report.
_PREVIEW_CHARS = 800
# Only the first pages are OCR'd; this is a debug tool, not a full extractor.
_OCR_MAX_PAGES = 2
# Directory used for the poppler binaries when `pdftoppm` is not on PATH.
_DEFAULT_POPPLER_DIR = "/usr/bin"


def _format_report(text, logs):
    """Return the text preview (or a failure marker) followed by the debug logs."""
    preview = text[:_PREVIEW_CHARS] if text.strip() else "❌ No text extracted"
    return preview + "\n\n---\n" + "\n".join(logs)


def extract_text_debug(file_path) -> str:
    """Extract text from a PDF and return a preview plus debug logs.

    The embedded text layer is read with pdfplumber first. If that yields
    nothing (or raises), the first ``_OCR_MAX_PAGES`` pages are rasterized
    with pdf2image and run through Tesseract.

    Args:
        file_path: Path to the PDF file. May be ``None`` or empty (for
            example when no file is selected), in which case no extraction
            is attempted.

    Returns:
        A string made of up to ``_PREVIEW_CHARS`` characters of extracted
        text (or ``"❌ No text extracted"``), a ``---`` separator line, and
        one debug log entry per line. Extraction errors are reported in the
        logs rather than raised.
    """
    # NOTE(review): the status emoji in the messages below were restored from
    # mis-encoded text (✅/❌ inferred from context) — confirm against the
    # intended originals.
    logs = []
    text = ""

    # Debug paths: show which poppler binaries are visible on PATH.
    pdftoppm_path = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm_path}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    if not file_path:
        # Nothing to open; skip both extractors instead of logging two
        # confusing failures.
        logs.append("⚠️ No file provided; nothing to extract")
        return _format_report(text, logs)

    # Try pdfplumber (embedded text layer).
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            # Explicit loop so pages read before a mid-document failure are
            # still available below.
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        logs.append(f"❌ pdfplumber failed: {e}")
        # Keep whatever was read; it is shown only if OCR produces nothing.
        text = "\n".join(t for t in page_texts if t)
    else:
        # Join pages with a newline so words at page boundaries do not fuse.
        text = "\n".join(t for t in page_texts if t)
        if text.strip():
            logs.append("✅ Extracted text using pdfplumber")
            return _format_report(text, logs)
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # OCR fallback.
    # Prefer the directory where pdftoppm was actually found over a
    # hard-coded location.
    poppler_dir = (
        os.path.dirname(pdftoppm_path) if pdftoppm_path else _DEFAULT_POPPLER_DIR
    )
    try:
        # Rasterize only the pages that will be OCR'd instead of the whole
        # document.
        images = convert_from_path(
            file_path,
            dpi=200,
            first_page=1,
            last_page=_OCR_MAX_PAGES,
            poppler_path=poppler_dir,
        )
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")
    except Exception as e:
        logs.append(f"❌ OCR fallback failed: {e}")

    return _format_report(text, logs)
# Build the debug UI: a PDF upload widget wired to a textbox that shows the
# extracted text preview followed by the debug logs.
with gr.Blocks() as demo:
    # NOTE(review): the heading emoji was restored from a mis-encoded "π";
    # 📄 is assumed — confirm against the intended original.
    gr.Markdown("# 📄 PDF Extractor Debug")
    inp = gr.File(file_types=[".pdf"], type="filepath")
    out = gr.Textbox(lines=20, label="Text + Debug Logs")
    # Re-run the extraction whenever the file input changes.
    inp.change(extract_text_debug, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()