File size: 5,945 Bytes
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f21add
d9b7d2f
5f21add
 
 
 
 
 
d9b7d2f
5f21add
d9b7d2f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple


import re

class TextExtractor:
    """Recover a heading/paragraph structure from a PDF using font sizes.

    The helpers are meant to be chained:

    1. ``get_font_info`` counts how often each font style occurs.
    2. ``get_font_tags`` maps every font size to a ``<p>``, ``<hN>`` or
       ``<sN>`` tag, treating the most frequent style as body text.
    3. ``assign_tags`` walks the document and emits tagged strings.
    4. ``get_slides`` groups the tagged strings into pages, one per ``<h1>``.

    All methods are static; the class only serves as a namespace.
    """

    # Anything that is neither a word character nor whitespace.
    _PUNCTUATION_RE = re.compile(r"[^\w\s]")
    # Content of the first "<...>" pair in a tagged string, e.g. "h1".
    _TAG_RE = re.compile(r'(?<=<)(.*?)(?=>)')
    # Tags and line-separator pipes, stripped before text is stored.
    _MARKUP_RE = re.compile(r'<.*?>|\|')
    # Two or more consecutive pipes separate paragraphs inside one string.
    _PARAGRAPH_BREAK_RE = re.compile(r"\|{2,}")
    _SPACE_RUN_RE = re.compile(' +')

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Count the font styles used by the non-blank text spans of a document.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
                Any iterable of pages exposing ``get_text('dict')`` works.
            granularity (bool, optional): When True, spans are told apart by
                size, flags and font name (identifier ``"<size>_<flags>_<font>"``)
                and the style also records the color. When False, only the
                size is used (identifier ``"<size>"``). Defaults to False.

        Raises:
            ValueError: Raises Value Error if there are no font detected

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``.
                ``font_counts`` is a list of ``(identifier, occurrences)``
                sorted from most to least frequent. ``styles`` maps each
                identifier to the attributes of the last span seen with it.
        """
        styles = {}
        font_counts = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                # Only blocks of type 0 carry 'lines'/'spans' (text blocks).
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = f"{span['size']}_{span['flags']}_{span['font']}"
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = f"{span['size']}"
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        # sorted() is stable, so ties keep their first-seen order.
        ranked_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
        return ranked_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles):
        """
        Build the mapping from font size to element tag.

        The size of the most frequent style (``font_counts[0]``) is tagged
        ``<p>``. Larger sizes get ``<hN>`` and smaller sizes ``<sN>``, where N
        is the 1-based rank of the size among all distinct sizes sorted from
        largest to smallest.

        Args:
            font_counts (List[Tuple[str, int]]): Non-empty list of
                ``(identifier, occurrences)`` as returned by ``get_font_info``,
                most frequent first. Identifiers start with the font size,
                optionally followed by ``"_<flags>_<font>"``.
            styles (Dict[str, Dict]): Style attributes keyed by identifier; only
                the entry of the most frequent identifier is read.

        Returns:
            Dict[float, str]: Tag for each font size.
        """
        p_size = styles[font_counts[0][0]]['size']
        # Only the leading component of an identifier is the size; granular
        # identifiers ("12.0_0_Arial") cannot be passed to float() as a whole.
        font_sizes = sorted(
            {float(str(identifier).split("_", 1)[0]) for identifier, _ in font_counts},
            reverse=True,
        )
        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag):
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside a block are joined with a
        space into one string that starts with the tag of that size. A ``|``
        is appended after every line, so several pipes in a row mark lines
        without usable text. A string is emitted at the end of every text
        block and whenever the font size changes; emitted strings may be
        empty.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span has a size that is missing from ``size_tag``.

        Returns:
            list: Texts with pre-prended element tags
        """
        texts = []
        previous_size = None  # size of the last span that contained text
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Spans made only of punctuation/whitespace are ignored,
                        # but kept spans contribute their original text.
                        if not TextExtractor._PUNCTUATION_RE.sub('', span["text"]).strip():
                            continue
                        size = span['size']
                        size_changed = previous_size is not None and size != previous_size
                        if size_changed:
                            # Flush what was collected in the previous size; this
                            # is an empty string right after a block boundary.
                            texts.append(block_string)
                        starts_string = (
                            previous_size is None
                            or size_changed
                            or all(c == "|" for c in block_string)  # empty or pipes only
                        )
                        if starts_string:
                            block_string = size_tag[size] + span['text']
                        else:
                            block_string += f" {span['text']}"
                        previous_size = size
                    if block_string:
                        block_string += "|"
                # Appended even when empty; get_slides skips untagged strings.
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Group tagged texts into pages, starting a new page at every ``<h1>``.

        Each page is a list of entries: headers are ``(tag, text)`` tuples and
        paragraphs are ``[tag, text]`` lists. A paragraph string is split on
        runs of two or more pipes. A piece that starts with a lowercase letter
        is treated as the continuation of the preceding paragraph and merged
        into it; if the preceding entry is not a paragraph, the piece becomes
        a paragraph of its own. Strings without a tag, strings with other tags
        (such as ``<sN>``) and everything before the first ``<h1>`` do not end
        up in the result.

        Args:
            texts (Iterable[str]): Tagged strings as produced by ``assign_tags``.

        Returns:
            dict: ``"Page N"`` (N counted from 1) mapped to the entries of that page.
        """
        slides = {}
        section = []
        page = 1

        for text in texts:
            tag_match = TextExtractor._TAG_RE.search(text)
            if not tag_match:
                continue
            tag = tag_match.group()

            if tag == 'h1':
                # The list stored in slides keeps growing through later appends.
                section = [('h1', TextExtractor._MARKUP_RE.sub('', text).strip())]
                slides[f"Page {page}"] = section
                page += 1
            elif tag.startswith('h'):  # non h1 headers
                section.append((tag, TextExtractor._MARKUP_RE.sub('', text).strip()))
            elif tag.startswith('p'):
                for paragraph in TextExtractor._PARAGRAPH_BREAK_RE.split(text):
                    paragraph = TextExtractor._MARKUP_RE.sub('', paragraph).strip()
                    paragraph = TextExtractor._SPACE_RUN_RE.sub(' ', paragraph)
                    if not paragraph:
                        continue
                    # Only paragraph entries are lists, so only they can be extended.
                    follows_paragraph = bool(section) and isinstance(section[-1], list)
                    if paragraph[0].islower() and follows_paragraph:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides