Archan committed on
Commit
d085c50
1 Parent(s): f6dab54

Create new file

Browse files
Files changed (1) hide show
  1. get_pages.py +21 -0
get_pages.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfminer.high_level import extract_text, extract_pages
2
+ from pdfminer.layout import LTTextContainer
3
+ from preprocess import pre_process
4
+
5
+
6
def get_pages(filename, start_page=0, end_page=0):
    """Extract and pre-process the text of a range of pages from a PDF.

    ``start_page`` and ``end_page`` are each shifted down by one before being
    handed to pdfminer, i.e. they are treated as 1-based, inclusive page
    numbers.

    Args:
        filename: PDF source, passed unchanged to ``extract_pages``.
        start_page: First page to read (1-based).
        end_page: Last page to read (1-based, inclusive).

    Returns:
        The result of ``pre_process`` applied to the concatenated text of
        every ``LTTextContainer`` element on the selected pages.

    NOTE(review): with the defaults (0, 0) the only index requested is -1,
    which presumably matches no page, so ``pre_process`` would receive an
    empty string -- confirm this is intended.
    NOTE(review): when ``start_page > end_page`` the index list is empty;
    pdfminer may treat an empty ``page_numbers`` as "no filter" and return
    every page -- verify against the pdfminer documentation.
    """
    # Shift the 1-based inclusive range [start_page, end_page] to the
    # zero-based indices handed to pdfminer.
    page_number = list(range(start_page - 1, end_page))
    print(page_number)
    pages = extract_pages(filename, page_numbers=page_number)

    # Collect the text of every text container in page order; a single join
    # avoids rebuilding the whole string on each append.
    content = "".join(
        element.get_text()
        for page_layout in pages
        for element in page_layout
        if isinstance(element, LTTextContainer)
    )

    return pre_process(content)