# Spaces:
# Sleeping
# Sleeping
import logging
from urllib.parse import urlsplit

from bs4 import BeautifulSoup
from extractous import Extractor, TesseractOcrConfig
from fastapi import FastAPI, Query, HTTPException
# ASGI application object for this module.
app = FastAPI()
# NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible on this
# handler in the captured source -- confirm how it is registered on ``app``.
def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
    """Extract the document at *link* and return its text split by page.

    The URL is handed to an ``extractous`` ``Extractor`` configured with
    Tesseract OCR and XML output. The returned markup is parsed with
    BeautifulSoup and the text of every ``<div class="page">`` element is
    collected in document order.

    Args:
        link: Absolute ``http``/``https`` URL of the document to extract.

    Returns:
        A dict with ``"received_link"`` (the URL as supplied) and
        ``"content"`` (a list holding the text of each page ``div``).

    Raises:
        HTTPException: 400 if *link* is not an http(s) URL with a host;
            502 if fetching or extracting the document fails.
    """
    # Parse instead of prefix-matching so the scheme check is
    # case-insensitive and a URL without a host is rejected up front.
    try:
        parts = urlsplit(link)
    except ValueError as exc:  # e.g. malformed IPv6 literal in the host
        raise HTTPException(status_code=400, detail="Invalid URL format") from exc
    if parts.scheme not in ("http", "https") or not parts.netloc:
        raise HTTPException(status_code=400, detail="Invalid URL format")

    # NOTE(security): the server fetches whatever URL the caller supplies,
    # including internal/private addresses (SSRF). Consider restricting the
    # allowed hosts before exposing this endpoint publicly.
    extractor = Extractor().set_ocr_config(TesseractOcrConfig()).set_xml_output(True)

    try:
        # The second element of the result (metadata) is not used here.
        content, _metadata = extractor.extract_url_to_string(link)
    except Exception as exc:
        # Request boundary: report a clean gateway error instead of letting
        # an arbitrary fetch/extraction failure escape unhandled, and log
        # the cause for operators.
        logging.getLogger(__name__).exception("Extraction failed for %s", link)
        raise HTTPException(
            status_code=502,
            detail="Could not fetch or extract the document at the given URL",
        ) from exc

    # Presumably the XML output wraps each page in <div class="page"> --
    # verify against the extractor's actual output.
    soup = BeautifulSoup(content, "html.parser")
    pages_text = [page.get_text() for page in soup.find_all("div", class_="page")]
    return {"received_link": link, "content": pages_text}