File size: 1,690 Bytes
854b899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c95e708
 
 
 
854b899
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import io
import os
import pypdf
from collections import Counter

def limit_pagenumbers(filepath, pages_limit=20):
    """Truncate the PDF at ``filepath`` in place to its leading pages.

    The file is rewritten only when it holds more than ``pages_limit`` pages
    or when pypdf reports it as encrypted; otherwise it is left untouched.
    A rewritten file contains at most the first ``pages_limit`` pages.

    Args:
        filepath: Path of the PDF to inspect and, when needed, overwrite.
        pages_limit: Maximum number of pages to keep (default 20).

    Returns:
        None. The outcome is the (possibly rewritten) file on disk.
    """
    with open(filepath, "rb") as source:
        pdf = pypdf.PdfReader(source)
        total_pages = len(pdf.pages)
        encrypted = pdf.is_encrypted
        if not (total_pages > pages_limit or encrypted):
            # Short enough and not flagged as encrypted: leave the file alone.
            return

        trimmed = pypdf.PdfWriter()
        for index in range(total_pages):
            if index >= pages_limit:
                break
            trimmed.add_page(pdf.pages[index])

        # Kept inside the ``with`` so the source handle is still open while
        # the writer serialises -- presumably the reader pulls page data
        # lazily; TODO confirm against pypdf.
        payload = io.BytesIO()
        trimmed.write(payload)

    # The source handle is closed by now, so the same path can be reopened
    # for writing and overwritten with the buffered result.
    with open(filepath, "wb") as target:
        target.write(payload.getvalue())

def majority_vote_dicts(dicts):
    """Pick the most frequent value for every key across ``dicts``.

    Values are tallied by their ``str()`` form, so the winners in the
    returned mapping are always strings. The keys ``'page_number'`` and
    ``'text_sequence'`` are skipped and never appear in the result. On a
    tie, the winner is whatever ``Counter.most_common(1)`` yields.

    Args:
        dicts: Iterable of dictionaries to vote over.

    Returns:
        A dict mapping each remaining key to its most common stringified
        value, with keys in order of first appearance.
    """
    skipped_keys = ('page_number', 'text_sequence')
    tallies = {}
    for record in dicts:
        for key, value in record.items():
            if key in skipped_keys:
                continue
            # str() returns an exact ``str`` unchanged, so a single call
            # covers both string and non-string values.
            tallies.setdefault(key, Counter())[str(value)] += 1
    return {key: votes.most_common(1)[0][0] for key, votes in tallies.items()}