File size: 658 Bytes
d085c50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import logging

from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer

from preprocess import pre_process


def get_pages(filename, start_page: int = 0, end_page: int = 0):
    """Extract and pre-process the text of an inclusive page range of a PDF.

    Args:
        filename: PDF to read; handed unchanged to pdfminer's
            ``extract_pages``.
        start_page: First page of the range. Each requested page has 1
            subtracted before being passed to pdfminer, i.e. callers count
            pages from 1.
        end_page: Last page of the range (inclusive), counted like
            ``start_page``.

    Returns:
        The result of ``pre_process`` applied to the concatenated text of
        every ``LTTextContainer`` element on the selected pages, in the
        order pdfminer yields them.
    """
    # Same indices as looping over range(start_page, end_page + 1) and
    # subtracting one from each value.
    page_numbers = list(range(start_page - 1, end_page))
    # NOTE(review): with the defaults (0, 0) this list is [-1]; presumably
    # no page matches that index, so the default call yields no text --
    # confirm the intended default with callers before changing it.
    logging.getLogger(__name__).debug("Requested page indices: %s", page_numbers)

    # Collect the fragments and join once instead of growing a string with
    # repeated concatenation inside the loop.
    fragments = [
        element.get_text()
        for page_layout in extract_pages(filename, page_numbers=page_numbers)
        for element in page_layout
        if isinstance(element, LTTextContainer)
    ]
    return pre_process("".join(fragments))