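# NOTE: the following lines look like web-page residue captured together with
# the file; they are kept as comments so they are not parsed as code.
# talhashoaib's picture
# Update app.py
# 6c75a17 verified
# raw
# history blame
# 1.69 kB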
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import shutil
def extract_text_debug(file_path):
    """Return a short text preview of a PDF followed by a debug log.

    Extraction is attempted in two stages:

    1. ``pdfplumber`` reads the text embedded in the PDF.
    2. If that raises or yields only whitespace, the first pages are
       rasterised with ``pdf2image`` and read with Tesseract OCR.

    The function is meant to be used as a UI callback, so it never raises:
    every failure is recorded in the log section of the returned string.

    Args:
        file_path: Path of the PDF to inspect. A falsy value (``None`` or
            ``""``) is reported as "no file" without running any extractor.

    Returns:
        A string made of at most 800 characters of extracted text (or
        ``"❌ No text extracted"``), a ``---`` separator, and one log line
        per diagnostic step.
    """
    import os  # local import: only used to derive the poppler directory

    preview_chars = 800   # size of the text preview shown to the user
    ocr_page_limit = 2    # OCR is slow, so only the first pages are read
    separator = "\n\n---\n"
    logs = []

    def _report(text):
        # Single place that builds the "preview + separator + logs" output.
        body = text[:preview_chars] if text.strip() else "❌ No text extracted"
        return body + separator + "\n".join(logs)

    # Nothing to inspect (presumably what the UI sends when the upload is
    # cleared): report it directly instead of logging two confusing
    # extractor failures.
    if not file_path:
        logs.append("⚠️ No file provided")
        return _report("")

    # Debug paths: show which poppler binaries are visible on PATH.
    pdftoppm = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    # Stage 1: embedded text via pdfplumber.
    page_texts = []
    plumber_ok = False
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
        plumber_ok = True
    except Exception as exc:  # deliberate catch-all: this is a debug tool
        logs.append(f"❌ pdfplumber failed: {exc}")

    # Join with a newline so words at page boundaries do not get fused.
    # Pages read before a failure are kept as a last-resort result.
    text = "\n".join(page_texts)
    if plumber_ok:
        if text.strip():
            logs.append("✅ Extracted text using pdfplumber")
            return _report(text)
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # Stage 2: OCR fallback.
    try:
        # Use the directory where pdftoppm was actually found; fall back to
        # the historical default when it is not on PATH.
        poppler_dir = os.path.dirname(pdftoppm) if pdftoppm else "/usr/bin"
        # Rasterise only the pages that will be OCR'd instead of the whole
        # document.
        images = convert_from_path(
            file_path,
            dpi=200,
            first_page=1,
            last_page=ocr_page_limit,
            poppler_path=poppler_dir,
        )
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")
    except Exception as exc:  # deliberate catch-all: report, never crash the UI
        logs.append(f"❌ OCR fallback failed: {exc}")

    return _report(text)
# Minimal Gradio UI: one PDF upload widget wired to one read-only text box.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")
    # type="filepath" is requested so the callback gets a path it can hand
    # to pdfplumber / pdf2image (see the Gradio File docs to confirm).
    inp = gr.File(file_types=[".pdf"], type="filepath")
    out = gr.Textbox(lines=20, label="Text + Debug Logs")
    # Re-run the extraction every time the upload widget's value changes.
    inp.change(extract_text_debug, inputs=inp, outputs=out)

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()