Spaces:
Runtime error
Runtime error
File size: 1,690 Bytes
854b899 c95e708 854b899 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import io
import os
import pypdf
from collections import Counter
def limit_pagenumbers(filepath, pages_limit=20):
    """Truncate the PDF at *filepath* in place to at most *pages_limit* pages.

    The file is rewritten only when it has more than ``pages_limit`` pages
    or when ``pypdf`` reports it as encrypted; otherwise it is left
    untouched. When rewritten, the new file holds the leading pages of the
    original (up to ``pages_limit`` of them) as emitted by a fresh
    ``pypdf.PdfWriter``.

    Args:
        filepath: Path of the PDF to inspect and, if needed, overwrite.
        pages_limit: Maximum number of leading pages to keep.

    Returns:
        None. The only effect is the possible overwrite of ``filepath``.
    """
    with open(filepath, "rb") as source:
        reader = pypdf.PdfReader(source)
        num_pages = len(reader.pages)

        # Nothing to do for a short, unencrypted document.
        # NOTE(review): encrypted files are rewritten even when short,
        # presumably so the saved copy is no longer encrypted -- confirm.
        if not (num_pages > pages_limit or reader.is_encrypted):
            return

        writer = pypdf.PdfWriter()
        for page_num in range(num_pages):
            # Stop at the limit instead of scanning the remaining pages.
            if page_num >= pages_limit:
                break
            writer.add_page(reader.pages[page_num])

        # Serialise into memory while the source handle backing the reader
        # is still open, and before the same path is reopened for writing.
        buffer = io.BytesIO()
        writer.write(buffer)

    # Replace the original file with the truncated document.
    with open(filepath, "wb") as output_file:
        output_file.write(buffer.getvalue())
def majority_vote_dicts(dicts):
    """Merge several dicts by taking the most common value for each key.

    Every value is counted by its ``str()`` form, so ``2020`` and ``"2020"``
    vote for the same candidate and the winner is always returned as a
    string. The keys ``'page_number'`` and ``'text_sequence'`` are ignored.
    On a tie, the value that was seen first wins (``Counter.most_common``
    ordering).

    Args:
        dicts: Iterable of dictionaries to combine.

    Returns:
        A dict mapping each non-ignored key to the string form of its most
        frequent value, with keys in order of first appearance. Empty when
        ``dicts`` yields no non-ignored keys.
    """
    ignored_keys = ('page_number', 'text_sequence')

    votes = {}
    for record in dicts:
        for key, value in record.items():
            if key in ignored_keys:
                continue
            tally = votes.get(key)
            if tally is None:
                tally = votes[key] = Counter()
            # str() returns a plain string unchanged, so no type check is
            # needed to treat strings and non-strings uniformly.
            tally[str(value)] += 1

    return {key: tally.most_common(1)[0][0] for key, tally in votes.items()}
|