Spaces:
Runtime error
Runtime error
import io | |
import os | |
import pypdf | |
from collections import Counter | |
def limit_pagenumbers(filepath, pages_limit=20):
    """Truncate the PDF at ``filepath`` in place to at most ``pages_limit`` pages.

    The file is rewritten only when it holds more than ``pages_limit`` pages
    or when pypdf reports it as encrypted; otherwise it is left untouched.
    When rewritten, the first ``pages_limit`` pages (or all pages, if there
    are fewer) are copied into a fresh PDF that overwrites the original.

    Args:
        filepath: Path of the PDF file to inspect and possibly overwrite.
        pages_limit: Maximum number of leading pages to keep.

    Returns:
        None. The effect is the (possible) rewrite of ``filepath``.
    """
    with open(filepath, "rb") as source:
        pdf = pypdf.PdfReader(source)
        total = len(pdf.pages)
        encrypted = pdf.is_encrypted
        needs_rewrite = total > pages_limit or encrypted

        if needs_rewrite:
            trimmed = pypdf.PdfWriter()
            kept_indices = (i for i in range(total) if i < pages_limit)
            for i in kept_indices:
                trimmed.add_page(pdf.pages[i])

            # Stage the output in memory while the source handle is still
            # open: the same path is reopened for writing below.
            # NOTE(review): presumably pypdf resolves page data lazily from
            # the open file, which is why this stays inside the `with`.
            staged = io.BytesIO()
            trimmed.write(staged)

    if not needs_rewrite:
        return

    # Replace the original file with the trimmed copy.
    with open(filepath, "wb") as target:
        target.write(staged.getvalue())
def majority_vote_dicts(dicts, ignore_keys=("page_number", "text_sequence")):
    """Merge several dicts by taking the most frequent value for each key.

    Every key that appears in at least one input dict (and is not listed in
    ``ignore_keys``) ends up in the result, mapped to the value that occurred
    most often for that key across all inputs. When several values are tied,
    the one encountered first wins (``Counter.most_common`` ordering).

    Args:
        dicts: Iterable of dicts whose values are hashable.
        ignore_keys: Iterable of keys to leave out of the vote entirely.
            Defaults to the two keys the original implementation skipped.

    Returns:
        A new dict mapping each voted key to its majority value; empty when
        ``dicts`` is empty or contains only ignored keys.

    Raises:
        TypeError: If a voted value is unhashable (it cannot be counted).
    """
    ignored = frozenset(ignore_keys)

    # One Counter of observed values per key.
    tallies = {}
    for record in dicts:
        for key, value in record.items():
            if key in ignored:
                continue
            tallies.setdefault(key, Counter())[value] += 1

    # A Counter is only created together with its first vote, so
    # most_common(1) always yields exactly one (value, count) pair.
    return {key: votes.most_common(1)[0][0] for key, votes in tallies.items()}