File size: 4,457 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from collections import Counter
from statistics import mean, median

from marker.schema.block import Span, Line
from marker.schema.page import Page
import re
from typing import List


def is_code_linelen(lines, thresh=80):
    """Report whether the lines are short enough, on average, to look like code.

    Counts the word characters (regex ``\\w``) in every line's ``prelim_text``
    and divides by the number of line breaks between the lines (at least 1).

    Args:
        lines: Sized collection of objects exposing a ``prelim_text`` string.
        thresh: Exclusive upper bound on word characters per line break.

    Returns:
        True when the ratio is strictly below ``thresh``; False otherwise, and
        always False when the lines contain no word characters at all.
    """
    word_char_total = 0
    for entry in lines:
        word_char_total += len(re.findall(r'\w', entry.prelim_text))

    # N lines are separated by N - 1 breaks; clamp so a single line divides by 1.
    break_count = len(lines) - 1
    if break_count < 1:
        break_count = 1

    if not word_char_total:
        return False

    return word_char_total / break_count < thresh


def comment_count(lines):
    """Count the strings that begin with a recognised comment opener.

    The opener must be the very first character(s) of the string; leading
    whitespace is not skipped.

    Args:
        lines: Iterable of strings to inspect.

    Returns:
        The number of strings that start with one of the comment markers.
    """
    comment_start = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|<!--|%|%{|\(\*)")
    total = 0
    for text in lines:
        if comment_start.match(text) is not None:
            total += 1
    return total


def identify_code_blocks(pages: List[Page]):
    """Relabel "Text" blocks that look like source code as "Code".

    A block is relabelled only when every heuristic below holds:

    * it has more than 3 lines;
    * ``is_code_linelen`` accepts its lines;
    * indented lines plus comment-like lines exceed 70% of its line count;
    * when baselines are available, its mean span font size is at most 80% of
      the mean font size across ``pages`` and its mean line height is below
      80% of the median line height across ``pages``.

    Args:
        pages: Pages whose blocks are inspected; ``block_type`` is mutated in
            place on matching blocks.

    Returns:
        The number of blocks relabelled as "Code".
    """
    code_block_count = 0
    font_sizes = []
    line_heights = []
    for page in pages:
        font_sizes += page.get_font_sizes()
        line_heights += page.get_line_heights()

    # Baselines over all pages. Both samples must be non-empty: mean() and
    # median() raise StatisticsError on empty data, and the two font checks
    # below are only applied as a pair.
    avg_font_size = None
    avg_line_height = None
    if font_sizes and line_heights:
        avg_line_height = median(line_heights)  # median, despite the name
        avg_font_size = mean(font_sizes)

    for page in pages:
        for block in page.blocks:
            if block.block_type != "Text":
                continue

            # Ensure we have lines and spans
            if len(block.lines) == 0:
                continue
            if sum(len(line.spans) for line in block.lines) == 0:
                continue

            min_start = block.get_min_line_start()

            line_font_sizes = [span.font_size for line in block.lines for span in line.spans]
            block_line_heights = [line.bbox[3] - line.bbox[1] for line in block.lines]
            # A line counts as indented when it starts right of the block's leftmost line start
            indented_lines = sum(line.bbox[0] > min_start for line in block.lines)

            comment_lines = comment_count([line.prelim_text for line in block.lines])
            is_code = [
                len(block.lines) > 3,
                is_code_linelen(block.lines),
                indented_lines + comment_lines > len(block.lines) * .7,  # Indentation and comments are a majority
            ]

            if avg_font_size is not None:
                # Lower than average font size and line height
                is_code += [
                    mean(line_font_sizes) <= avg_font_size * .8,
                    mean(block_line_heights) < avg_line_height * .8,
                ]

            if all(is_code):
                code_block_count += 1
                block.block_type = "Code"

    return code_block_count


def indent_blocks(pages: List[Page]):
    """Collapse each "Code" block into a single span with space-based indentation.

    For every block whose ``block_type`` is "Code", the text of each line is
    prefixed with spaces proportional to how far the line starts to the right
    of the block's leftmost line, consecutive blank lines are reduced to one,
    and ``block.lines`` is replaced by one ``Line`` holding one ``Span`` with
    the joined text. Code blocks that contain no spans are left unchanged.

    Args:
        pages: Pages whose code blocks are rewritten in place.
    """
    span_counter = 0
    for page in pages:
        for block in page.blocks:
            if block.block_type != "Code":
                continue

            # Font metadata for the merged span comes from the first span in
            # the block; the first line is not guaranteed to have any spans.
            first_span = next((span for line in block.lines for span in line.spans), None)
            if first_span is None:
                continue

            # x-coord of column 0, taken from the data rather than a fixed
            # sentinel so blocks positioned at large x values indent correctly
            min_left = min(line.bbox[0] for line in block.lines)
            col_width = 0  # width of 1 char, estimated from the first non-empty span
            rows = []
            for line in block.lines:
                for span in line.spans:
                    if col_width == 0 and len(span.text) > 0:
                        col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
                rows.append((line.bbox[0], "".join(span.text for span in line.spans)))

            out_lines = []
            blank_line = False
            for left, text in rows:
                current_line_blank = len(text.strip()) == 0
                if blank_line and current_line_blank:
                    # Don't put multiple blank lines in a row
                    continue

                # Without a usable character width, no indentation can be derived
                prefix = "" if col_width == 0 else " " * int((left - min_left) / col_width)
                out_lines.append(prefix + text + "\n")
                blank_line = current_line_blank

            new_span = Span(
                text="".join(out_lines),
                bbox=block.bbox,
                span_id=f"{span_counter}_fix_code",
                font=first_span.font,
                font_weight=first_span.font_weight,
                font_size=first_span.font_size,
            )
            span_counter += 1
            block.lines = [Line(spans=[new_span], bbox=block.bbox)]