File size: 1,464 Bytes
9db6155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270f60d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from marker.convert import convert_single_pdf
from marker.models import load_all_models
import tempfile
from indexify_extractor_sdk import Content, Extractor, Feature

from pydantic import BaseModel
from typing import Optional, Literal, List, Union

# Per-request options for MarkdownExtractor.extract. Each field is forwarded
# to marker's convert_single_pdf under the keyword argument of the same name.
# (Kept as comments rather than a class docstring so the pydantic schema
# description is not altered.)
class MarkdownExtractorConfig(BaseModel):
    # Passed as `max_pages`; None leaves the choice to marker.
    # NOTE(review): presumably caps the number of pages converted - confirm.
    max_pages: Optional[int] = None
    # Passed as `langs`; None leaves the choice to marker.
    # NOTE(review): presumably the document language(s) used for OCR - confirm
    # the expected format against the marker documentation.
    langs: Optional[str] = None
    # Passed as `batch_multiplier`; defaults to 2 here.
    # NOTE(review): presumably scales marker's model batch sizes - confirm.
    batch_multiplier: Optional[int] = 2

class MarkdownExtractor(Extractor):
    """Extractor that converts PDF content to Markdown text using marker.

    The marker models are loaded once when the extractor is constructed and
    reused for every call to :meth:`extract`.
    """

    name = "tensorlake/marker"
    description = "Markdown Extractor for PDFs"
    system_dependencies = []
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()
        # Loaded once up front so each extract() call reuses the same models.
        self.model_lst = load_all_models()

    def extract(self, content: Content, params: MarkdownExtractorConfig) -> List[Union[Feature, Content]]:
        """Convert the PDF bytes in ``content`` to Markdown.

        Args:
            content: Content whose ``data`` holds the raw PDF bytes.
            params: Conversion options. ``langs`` may be a single language or
                a comma-separated list (for example ``"English,German"``).

        Returns:
            A one-element list holding a text ``Content`` with the converted
            Markdown, carrying marker's output metadata as a metadata feature
            named ``"text"``.
        """
        import os  # local import: only needed for temp-file cleanup

        # convert_single_pdf takes a list of languages, while the config
        # exposes a plain string; split it the way marker's own CLI does.
        langs = params.langs
        if isinstance(langs, str):
            langs = [part.strip() for part in langs.split(",") if part.strip()] or None

        # delete=False lets marker reopen the file by name after it is closed
        # here; the file is therefore removed explicitly in the finally block.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            with tmp:
                tmp.write(content.data)

            full_text, _images, out_meta = convert_single_pdf(
                tmp.name,
                self.model_lst,
                max_pages=params.max_pages,
                langs=langs,
                batch_multiplier=params.batch_multiplier,
            )
        finally:
            # Best-effort cleanup: a failed delete must not mask the result
            # or the original exception.
            try:
                os.remove(tmp.name)
            except OSError:
                pass

        feature = Feature.metadata(value=out_meta, name="text")
        return [Content.from_text(full_text, features=[feature])]

    def sample_input(self) -> Content:
        """Return the sample scientific PDF exposed by ``sample_scientific_pdf``."""
        return self.sample_scientific_pdf()