Commit
•
952523e
1
Parent(s):
e71f5a4
percentage
Browse files
app.py
CHANGED
@@ -21,25 +21,27 @@ CPU_COUNT = multiprocessing.cpu_count()
|
|
21 |
MAX_WORKERS = min(32, CPU_COUNT) # Use CPU count directly for processes
|
22 |
|
23 |
|
24 |
-
def process_pdf(pdf_file,
|
25 |
try:
|
26 |
pdf_path = pdf_file.name
|
27 |
doc = fitz.open(pdf_path)
|
28 |
total_pages = len(doc)
|
29 |
|
30 |
-
pages_to_convert = (
|
31 |
-
|
32 |
-
|
|
|
|
|
33 |
selected_pages = (
|
34 |
sorted(random.sample(range(total_pages), pages_to_convert))
|
35 |
-
if
|
36 |
else range(total_pages)
|
37 |
)
|
38 |
|
39 |
images = []
|
40 |
for page_num in selected_pages:
|
41 |
page = doc[page_num]
|
42 |
-
pix = page.get_pixmap(
|
43 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
44 |
image_path = os.path.join(
|
45 |
temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
|
@@ -53,7 +55,7 @@ def process_pdf(pdf_file, sample_size, temp_dir):
|
|
53 |
return [], f"Error processing {pdf_file.name}: {str(e)}", 0
|
54 |
|
55 |
|
56 |
-
def pdf_to_images(pdf_files,
|
57 |
if not os.path.exists(temp_dir):
|
58 |
os.makedirs(temp_dir)
|
59 |
|
@@ -66,7 +68,7 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
|
|
66 |
|
67 |
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
68 |
future_to_pdf = {
|
69 |
-
executor.submit(process_pdf, pdf,
|
70 |
for pdf in pdf_files
|
71 |
}
|
72 |
|
@@ -103,7 +105,7 @@ def get_size_category(num_images):
|
|
103 |
|
104 |
def process_pdfs(
|
105 |
pdf_files,
|
106 |
-
|
107 |
hf_repo,
|
108 |
create_zip,
|
109 |
private_repo,
|
@@ -134,7 +136,7 @@ def process_pdfs(
|
|
134 |
os.makedirs(images_dir)
|
135 |
|
136 |
progress(0, desc="Starting PDF processing")
|
137 |
-
images, message = pdf_to_images(pdf_files,
|
138 |
|
139 |
# Create a new directory for sampled images
|
140 |
sampled_images_dir = os.path.join(temp_dir, "sampled_images")
|
@@ -195,7 +197,9 @@ def process_pdfs(
|
|
195 |
hf_repo=hf_repo,
|
196 |
num_images=len(images),
|
197 |
num_pdfs=len(pdf_files),
|
198 |
-
sample_size=
|
|
|
|
|
199 |
creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
200 |
)
|
201 |
|
@@ -204,7 +208,9 @@ def process_pdfs(
|
|
204 |
hf_repo=hf_repo,
|
205 |
num_images=len(images),
|
206 |
num_pdfs=len(pdf_files),
|
207 |
-
sample_size=
|
|
|
|
|
208 |
creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
209 |
size_category=size_category,
|
210 |
)
|
@@ -248,10 +254,13 @@ with gr.Blocks() as demo:
|
|
248 |
file_count="multiple", label="Upload PDF(s)", file_types=["*.pdf"]
|
249 |
)
|
250 |
with gr.Row():
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
|
|
|
|
|
|
255 |
)
|
256 |
hf_repo = gr.Textbox(
|
257 |
label="Hugging Face Repo",
|
@@ -269,7 +278,7 @@ with gr.Blocks() as demo:
|
|
269 |
submit_button = gr.Button("Convert PDFs to page images")
|
270 |
submit_button.click(
|
271 |
process_pdfs,
|
272 |
-
inputs=[pdf_files,
|
273 |
outputs=[output_gallery, download_button, status_text],
|
274 |
)
|
275 |
|
|
|
21 |
MAX_WORKERS = min(32, CPU_COUNT) # Use CPU count directly for processes
|
22 |
|
23 |
|
24 |
+
def process_pdf(pdf_file, sample_percentage, temp_dir):
|
25 |
try:
|
26 |
pdf_path = pdf_file.name
|
27 |
doc = fitz.open(pdf_path)
|
28 |
total_pages = len(doc)
|
29 |
|
30 |
+
pages_to_convert = int(total_pages * (sample_percentage / 100))
|
31 |
+
pages_to_convert = max(
|
32 |
+
1, min(pages_to_convert, total_pages)
|
33 |
+
) # Ensure at least one page and not more than total pages
|
34 |
+
|
35 |
selected_pages = (
|
36 |
sorted(random.sample(range(total_pages), pages_to_convert))
|
37 |
+
if 0 < sample_percentage < 100
|
38 |
else range(total_pages)
|
39 |
)
|
40 |
|
41 |
images = []
|
42 |
for page_num in selected_pages:
|
43 |
page = doc[page_num]
|
44 |
+
pix = page.get_pixmap() # Remove the Matrix scaling
|
45 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
46 |
image_path = os.path.join(
|
47 |
temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
|
|
|
55 |
return [], f"Error processing {pdf_file.name}: {str(e)}", 0
|
56 |
|
57 |
|
58 |
+
def pdf_to_images(pdf_files, sample_percentage, temp_dir, progress=gr.Progress()):
|
59 |
if not os.path.exists(temp_dir):
|
60 |
os.makedirs(temp_dir)
|
61 |
|
|
|
68 |
|
69 |
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
70 |
future_to_pdf = {
|
71 |
+
executor.submit(process_pdf, pdf, sample_percentage, temp_dir): pdf
|
72 |
for pdf in pdf_files
|
73 |
}
|
74 |
|
|
|
105 |
|
106 |
def process_pdfs(
|
107 |
pdf_files,
|
108 |
+
sample_percentage,
|
109 |
hf_repo,
|
110 |
create_zip,
|
111 |
private_repo,
|
|
|
136 |
os.makedirs(images_dir)
|
137 |
|
138 |
progress(0, desc="Starting PDF processing")
|
139 |
+
images, message = pdf_to_images(pdf_files, sample_percentage, images_dir)
|
140 |
|
141 |
# Create a new directory for sampled images
|
142 |
sampled_images_dir = os.path.join(temp_dir, "sampled_images")
|
|
|
197 |
hf_repo=hf_repo,
|
198 |
num_images=len(images),
|
199 |
num_pdfs=len(pdf_files),
|
200 |
+
sample_size=sample_percentage
|
201 |
+
if sample_percentage > 0
|
202 |
+
else "All pages",
|
203 |
creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
204 |
)
|
205 |
|
|
|
208 |
hf_repo=hf_repo,
|
209 |
num_images=len(images),
|
210 |
num_pdfs=len(pdf_files),
|
211 |
+
sample_size=sample_percentage
|
212 |
+
if sample_percentage > 0
|
213 |
+
else "All pages",
|
214 |
creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
215 |
size_category=size_category,
|
216 |
)
|
|
|
254 |
file_count="multiple", label="Upload PDF(s)", file_types=["*.pdf"]
|
255 |
)
|
256 |
with gr.Row():
|
257 |
+
sample_percentage = gr.Slider(
|
258 |
+
minimum=0,
|
259 |
+
maximum=100,
|
260 |
+
value=100,
|
261 |
+
step=1,
|
262 |
+
label="Percentage of pages to sample per PDF",
|
263 |
+
info="0% for no sampling (all pages), 100% for all pages",
|
264 |
)
|
265 |
hf_repo = gr.Textbox(
|
266 |
label="Hugging Face Repo",
|
|
|
278 |
submit_button = gr.Button("Convert PDFs to page images")
|
279 |
submit_button.click(
|
280 |
process_pdfs,
|
281 |
+
inputs=[pdf_files, sample_percentage, hf_repo, create_zip, private_repo],
|
282 |
outputs=[output_gallery, download_button, status_text],
|
283 |
)
|
284 |
|