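"""GROBID-based scientific PDF parser.

Wraps the GROBID output of `scipdf_utils.parse_pdf_to_dict` in the
`AbstractPDFParser` interface: parsed results are cached on disk with pickle,
the body text is split into per-section paragraph lists, and each section
heading is mapped to the page it appears on via pdfminer font sizes.
"""
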
import pickle

from base_class import AbstractPDFParser
from scipdf_utils import parse_pdf_to_dict


class GrobidSciPDFPaser(AbstractPDFParser):
    # import pysbd
    # seg_en = pysbd.Segmenter(language="en", clean=False)
    # seg_chinese = pysbd.Segmenter(language="zh", clean=False)

    def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
        """Initialize the PDF parser.

        Args:
            pdf_link: link to the PDF file; either a web link or a local file path
            db_name: name of the on-disk cache the parsed result is stored in
            short_thereshold: paragraphs of at most this many characters are
                merged into the preceding paragraph

        Attributes set during parsing:
            metadata: metadata of the PDF file (authors, title, abstract, etc.)
            flattn_paragraphs: flat list of all paragraphs of the PDF file
            split_paragraphs: list of dicts, one per section, each mapping a
                section heading to its list of paragraphs
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flattn_paragraphs = None
        self.split_paragraphs = None
        self.short_thereshold = short_thereshold
        self.parse_pdf()

    def _contact_too_short_paragraphs(self):
        """Merge too-short paragraphs into the preceding paragraph and discard
        whitespace-only ones."""
        for i, section in enumerate(self.split_paragraphs):
            paragraphs = section['texts']
            new_paragraphs = []
            for paragraph in paragraphs:
                # Discard paragraphs that contain only whitespace.
                if len(paragraph.strip()) == 0:
                    continue
                if len(paragraph) <= self.short_thereshold and new_paragraphs:
                    # Too short to stand alone: glue onto the previous
                    # paragraph, separated by a space.
                    new_paragraphs[-1] += " " + paragraph
                else:
                    new_paragraphs.append(paragraph)
            self.split_paragraphs[i]['texts'] = new_paragraphs
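
    # Illustrative effect with short_thereshold=30 (hypothetical input):
    #   ["A sufficiently long paragraph about the method ...", "Eq. (3)."]
    #   -> ["A sufficiently long paragraph about the method ... Eq. (3)."]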

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page number on which `search_string` is rendered
        with the largest font, or -1 if the string is never found."""
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTTextContainer, LTChar

        search_string = search_string.strip()
        max_font_size = -1
        page_number = -1
        try:
            for index, page_layout in enumerate(extract_pages(file_name)):
                for element in page_layout:
                    if not isinstance(element, LTTextContainer):
                        continue
                    for text_line in element:
                        if search_string not in text_line.get_text():
                            continue
                        # Track the largest glyph on any line containing the string.
                        for character in text_line:
                            if isinstance(character, LTChar) and character.size > max_font_size:
                                max_font_size = character.size
                                page_number = index
            return page_number + 1 if page_number != -1 else -1
        except Exception:
            # pdfminer cannot open web links or malformed PDFs; report "not found".
            return -1
        

    def _find_section_page(self, section_name) -> int:
        """Find the page a section heading starts on, assuming the heading is
        typeset in a larger font than any other occurrence of its text."""
        return GrobidSciPDFPaser._find_largest_font_string(self.pdf_link, section_name)

    def _retrive_or_parse(self):
        """Return the parsed-article dict from the cache if present; otherwise
        parse the pdf with GROBID and cache the result."""
        db_name = self.db_name
        if (self.pdf_link, db_name) not in self.db_cache:
            self.db_cache[(self.pdf_link, db_name)] = parse_pdf_to_dict(self.pdf_link)
            # Persist the cache so later runs skip re-parsing this PDF.
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[(self.pdf_link, db_name)]

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if the text contains any CJK unified ideograph."""
        return any(u'\u4e00' <= char <= u'\u9fff' for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file: fetch (or reuse) the GROBID parse, extract the
        metadata, split the body into sections, and merge short fragments."""
        article_dict = self._retrive_or_parse()
        self.article_dict = article_dict
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._contact_too_short_paragraphs()

        self.flattn_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Return a flat list of all paragraphs and build `content2section`,
        which maps each paragraph back to its section heading."""
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            # paragraphs += [section["heading"]]
            paragraphs += section["texts"]
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
        """Copy bibliographic metadata from the parsed article and resolve the
        page number of every section heading."""
        for meta in ['authors', 'pub_date', 'abstract', 'references', 'doi', 'title']:
            self.metadata[meta] = self.article_dict[meta]
        self.section_names = [section["heading"]
                              for section in self.article_dict['sections']]
        self.section_names2page = {}
        for section_name in self.section_names:
            self.section_names2page[section_name] = self._find_section_page(section_name)
        self.section_names_with_page_index = [
            "{} (Page {})".format(section_name, self.section_names2page[section_name])
            for section_name in self.section_names]

    def get_split_paragraphs(self) -> list:
        """Return one {"heading", "texts"} pair per section of the article."""
        section_pair_list = []
        for section in self.article_dict['sections']:
            section_pair_list.append({
                "heading": section["heading"],
                "texts": section["all_paragraphs"],
            })
        return section_pair_list
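
    # Returned structure (illustrative):
    #   [{"heading": "Introduction", "texts": ["First paragraph ...", ...]},
    #    ...]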

    # @staticmethod
    # def _determine_optimal_split_of_paragraphs(section_pair_list) -> None:
    #     """
    #     Split section texts into sentences based on heuristic rules.
    #     """
    #     import pysbd
    #     for section_pair in section_pair_list:
    #         if GrobidSciPDFPaser._check_chinese(section_pair["texts"]):
    #             seg = GrobidSciPDFPaser.seg_chinese
    #         else:
    #             seg = GrobidSciPDFPaser.seg_en
    #         section_pair["texts"] = seg.segment(section_pair["texts"])
    #         section_pair["texts"] = [
    #             para for para in section_pair["texts"] if len(para) > 2]
    #     return section_pair_list