File size: 6,139 Bytes
f7161fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from types import SimpleNamespace

import pdfplumber
from langchain.docstore.document import Document


def prepare_table_config(crop_page):
    """Prepare table查找边界, 要求page为原始page

    From https://github.com/jsvine/pdfplumber/issues/242
    """
    page = crop_page.root_page  # root/parent
    cs = page.curves + page.edges

    def curves_to_edges():
        """See https://github.com/jsvine/pdfplumber/issues/127"""
        edges = []
        for c in cs:
            edges += pdfplumber.utils.rect_to_edges(c)
        return edges

    edges = curves_to_edges()
    return {
        "vertical_strategy": "explicit",
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": edges,
        "explicit_horizontal_lines": edges,
        "intersection_y_tolerance": 10,
    }


def get_text_outside_table(crop_page):
    ts = prepare_table_config(crop_page)
    if len(ts["explicit_vertical_lines"]) == 0 or len(ts["explicit_horizontal_lines"]) == 0:
        return crop_page

    ### Get the bounding boxes of the tables on the page.
    bboxes = [table.bbox for table in crop_page.root_page.find_tables(table_settings=ts)]

    def not_within_bboxes(obj):
        """Check if the object is in any of the table's bbox."""

        def obj_in_bbox(_bbox):
            """See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404"""
            v_mid = (obj["top"] + obj["bottom"]) / 2
            h_mid = (obj["x0"] + obj["x1"]) / 2
            x0, top, x1, bottom = _bbox
            return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)

        return not any(obj_in_bbox(__bbox) for __bbox in bboxes)

    return crop_page.filter(not_within_bboxes)


# 请使用 LaTeX 表达公式,行内公式以 $ 包裹,行间公式以 $$ 包裹

extract_words = lambda page: page.extract_words(keep_blank_chars=True, y_tolerance=0, x_tolerance=1,
                                                extra_attrs=["fontname", "size", "object_type"])


# dict_keys(['text', 'x0', 'x1', 'top', 'doctop', 'bottom', 'upright', 'direction', 'fontname', 'size'])

def get_title_with_cropped_page(first_page):
    title = []  # 处理标题
    x0, top, x1, bottom = first_page.bbox  # 获取页面边框

    for word in extract_words(first_page):
        word = SimpleNamespace(**word)

        if word.size >= 14:
            title.append(word.text)
            title_bottom = word.bottom
        elif word.text == "Abstract":  # 获取页面abstract
            top = word.top

    user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0, title_bottom, x1, top)))]
    # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
    return title, user_info, first_page.within_bbox((x0, top, x1, bottom))


def get_column_cropped_pages(pages, two_column=True):
    new_pages = []
    for page in pages:
        if two_column:
            left = page.within_bbox((0, 0, page.width / 2, page.height), relative=True)
            right = page.within_bbox((page.width / 2, 0, page.width, page.height), relative=True)
            new_pages.append(left)
            new_pages.append(right)
        else:
            new_pages.append(page)

    return new_pages


def parse_pdf(filename, two_column=True):
    with pdfplumber.open(filename) as pdf:
        title, user_info, first_page = get_title_with_cropped_page(pdf.pages[0])
        new_pages = get_column_cropped_pages([first_page] + pdf.pages[1:], two_column)

        chapters = []
        # tuple (chapter_name, [pageid] (start,stop), chapter_text)
        create_chapter = lambda page_start, name_top, name_bottom: SimpleNamespace(
            name=[],
            name_top=name_top,
            name_bottom=name_bottom,
            record_chapter_name=True,

            page_start=page_start,
            page_stop=None,

            text=[],
        )
        cur_chapter = None

        # 按页遍历PDF文档
        for idx, page in enumerate(new_pages):
            page = get_text_outside_table(page)

            # 按行遍历页面文本
            for word in extract_words(page):
                word = SimpleNamespace(**word)

                # 检查行文本是否以12号字体打印,如果是,则将其作为新章节开始
                if word.size >= 11:  # 出现chapter name
                    if cur_chapter is None:
                        cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
                    elif not cur_chapter.record_chapter_name or (
                            cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
                        # 不再继续写chapter name
                        cur_chapter.page_stop = page.page_number  # stop id
                        chapters.append(cur_chapter)
                        # 重置当前chapter信息
                        cur_chapter = create_chapter(page.page_number, word.top, word.bottom)

                    # print(word.size, word.top, word.bottom, word.text)
                    cur_chapter.name.append(word.text)
                else:
                    cur_chapter.record_chapter_name = False  # chapter name 结束
                    cur_chapter.text.append(word.text)
        else:
            # 处理最后一个章节
            cur_chapter.page_stop = page.page_number  # stop id
            chapters.append(cur_chapter)

        for i in chapters:
            print(f"section: {i.name} pages:{i.page_start, i.page_stop} word-count:{len(i.text)}")
            print(" ".join(i.text))

    title = " ".join(title)
    user_info = " ".join(user_info)
    text = f"Article Title: {title}, Information:{user_info}\n"
    for idx, chapter in enumerate(chapters):
        chapter.name = " ".join(chapter.name)
        text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"

    return Document(page_content=text, metadata={"title": title})


if __name__ == '__main__':
    # Test code
    z = parse_pdf("./build/test.pdf")
    print(z["user_info"])
    print(z["title"])