Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,368 Bytes
b288805 cedec09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import json
import os
import tempfile
from typing import List, Union, Optional

import fitz
from indexify_extractor_sdk import Content, Extractor, Feature
from pydantic import BaseModel, Field

from .utils.tt_module import get_tables
# Configuration accepted by the PDF extractor's ``extract`` method.
class PDFExtractorConfig(BaseModel):
    # Kinds of output to produce. The extractor checks this list for the
    # strings "text", "image" and "table"; all three are enabled by default.
    # A factory is used so that every instance gets its own fresh list.
    output_types: List[str] = Field(
        default_factory=lambda: ["text", "image", "table"],
    )
class PDFExtractor(Extractor):
    """Extractor that pulls per-page text, embedded images and tables out of a PDF.

    Which kinds of output are produced is controlled by
    ``PDFExtractorConfig.output_types``.
    """

    # Extractor metadata; presumably read by the extractor SDK — confirm
    # against the ``Extractor`` base class.
    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        """Initialise the base ``Extractor``."""
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Extract the requested kinds of content from a PDF.

        Args:
            content: Input whose ``data`` attribute holds the raw PDF bytes.
            params: Configuration; ``params.output_types`` selects any of
                ``"text"``, ``"image"`` and ``"table"``.

        Returns:
            A list of ``Content`` objects, each carrying one metadata feature
            with the keys ``"type"`` and ``"page"`` (1-based for text and
            images; for tables, the page key reported by ``get_tables``
            converted to ``int``). Text and image items come first in page
            order, followed by the table items.
        """
        contents = []
        output_types = params.output_types

        # delete=False lets the file be reopened by name on every platform;
        # because of that it must be removed explicitly (see ``finally``).
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            with tmp_file:
                tmp_file.write(content.data)
            # The file is closed (and therefore flushed) before it is reopened.
            doc = fitz.open(tmp_file.name)
            try:
                for page_number, page in enumerate(doc, start=1):
                    if "text" in output_types:
                        contents.append(self._text_content(page, page_number))
                    if "image" in output_types:
                        contents.extend(self._image_contents(doc, page, page_number))
            finally:
                # Release the document handle before deleting its backing file.
                doc.close()
        finally:
            os.unlink(tmp_file.name)

        if "table" in output_types:
            # Tables are extracted once for the whole document. The loop
            # variables are deliberately not called ``page``/``content`` so
            # they cannot clobber the method's parameter.
            tables = get_tables(content.data)
            for table_page, table in tables.items():
                feature = Feature.metadata({"type": "table", "page": int(table_page)})
                contents.append(
                    Content(
                        content_type="application/json",
                        data=json.dumps(table),
                        features=[feature],
                    )
                )

        return contents

    def _text_content(self, page, page_number: int) -> Content:
        """Return the text of ``page`` as a ``Content`` tagged with its page number."""
        feature = Feature.metadata({"type": "text", "page": page_number})
        return Content.from_text(page.get_text(), features=[feature])

    def _image_contents(self, doc, page, page_number: int) -> List[Content]:
        """Return one ``image/png`` ``Content`` per usable image referenced by ``page``."""
        images = []
        for img in page.get_images():
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            if pix.colorspace is None:
                # PyMuPDF documents ``colorspace`` as None for image/stencil
                # masks; those are skipped rather than letting the attribute
                # access below abort the whole extraction.
                continue
            if pix.colorspace.name not in (fitz.csGRAY.name, fitz.csRGB.name):
                # Anything that is not gray or RGB (e.g. CMYK) is converted
                # to RGB before serialising.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            feature = Feature.metadata({"type": "image", "page": page_number})
            images.append(
                Content(content_type="image/png", data=pix.tobytes(), features=[feature])
            )
        return images

    def sample_input(self) -> Content:
        """Return the sample scientific PDF provided by ``sample_scientific_pdf``."""
        return self.sample_scientific_pdf()