davanstrien HF staff committed on
Commit
64cd544
1 Parent(s): d5b1786

reqs and app

Browse files
Files changed (2) hide show
  1. app.py +66 -44
  2. requirements.in +5 -4
app.py CHANGED
@@ -1,18 +1,21 @@
1
- import gradio as gr
2
  import os
3
  import random
 
4
  import tempfile
 
 
 
 
5
  from pdf2image import convert_from_path
6
  from PyPDF2 import PdfReader
7
- from huggingface_hub import create_repo, upload_folder, HfApi
8
 
9
 
10
- def pdf_to_images(pdf_files, sample_size, temp_dir):
11
  if not os.path.exists(temp_dir):
12
  os.makedirs(temp_dir)
13
-
14
  all_images = []
15
- for pdf_file in pdf_files:
16
  pdf_path = pdf_file.name
17
  pdf = PdfReader(pdf_path)
18
  total_pages = len(pdf.pages)
@@ -45,42 +48,60 @@ def pdf_to_images(pdf_files, sample_size, temp_dir):
45
  return all_images, f"Saved {len(all_images)} images to temporary directory"
46
 
47
 
48
- def process_pdfs(pdf_files, sample_size, hf_repo, oauth_token: gr.OAuthToken | None):
 
 
 
 
 
 
49
  if not pdf_files:
50
- return None, "No PDF files uploaded."
51
 
52
  if oauth_token is None:
53
- return None, "Please log in to upload to Hugging Face."
 
 
 
 
 
54
 
55
  try:
56
- with tempfile.TemporaryDirectory() as temp_dir:
57
- images_dir = os.path.join(temp_dir, "images")
58
- os.makedirs(images_dir)
59
-
60
- images, message = pdf_to_images(pdf_files, sample_size, images_dir)
61
-
62
- if hf_repo:
63
- try:
64
- hf_api = HfApi(token=oauth_token.token)
65
- hf_api.create_repo(
66
- hf_repo,
67
- repo_type="dataset",
68
- )
69
- hf_api.upload_folder(
70
- folder_path=images_dir,
71
- repo_id=hf_repo,
72
- repo_type="dataset",
73
- path_in_repo="images",
74
- )
75
- message += (
76
- f"\nUploaded images to Hugging Face repo: {hf_repo}/images"
77
- )
78
- except Exception as e:
79
- message += f"\nFailed to upload to Hugging Face: {str(e)}"
80
-
81
- return images, message
 
 
 
 
 
82
  except Exception as e:
83
- return None, f"An error occurred: {str(e)}"
 
 
84
 
85
 
86
  # Define the Gradio interface
@@ -94,26 +115,27 @@ with gr.Blocks() as demo:
94
  gr.LoginButton(size="sm")
95
 
96
  with gr.Row():
97
- pdf_files = gr.File(file_count="multiple", label="Upload PDF(s)")
98
- sample_size = gr.Slider(
99
- minimum=0,
100
- maximum=50,
101
- step=1,
102
- value=0,
103
- label="Sample Size (0 for all pages)",
104
  )
105
  hf_repo = gr.Textbox(
106
  label="Hugging Face Repo", placeholder="username/repo-name"
107
  )
108
 
109
  output_gallery = gr.Gallery(label="Converted Images")
110
- status_text = gr.Textbox(label="Status")
 
111
 
112
  submit_button = gr.Button("Process PDFs")
113
  submit_button.click(
114
  process_pdfs,
115
  inputs=[pdf_files, sample_size, hf_repo],
116
- outputs=[output_gallery, status_text],
117
  )
118
 
119
  # Launch the app
 
 
1
  import os
2
  import random
3
+ import shutil
4
  import tempfile
5
+ import zipfile
6
+
7
+ import gradio as gr
8
+ from huggingface_hub import HfApi
9
  from pdf2image import convert_from_path
10
  from PyPDF2 import PdfReader
 
11
 
12
 
13
+ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
14
  if not os.path.exists(temp_dir):
15
  os.makedirs(temp_dir)
16
+ progress(0, desc="Starting conversion")
17
  all_images = []
18
+ for pdf_file in progress.tqdm(pdf_files, desc="Converting PDFs"):
19
  pdf_path = pdf_file.name
20
  pdf = PdfReader(pdf_path)
21
  total_pages = len(pdf.pages)
 
48
  return all_images, f"Saved {len(all_images)} images to temporary directory"
49
 
50
 
51
+ def process_pdfs(
52
+ pdf_files,
53
+ sample_size,
54
+ hf_repo,
55
+ oauth_token: gr.OAuthToken | None,
56
+ progress=gr.Progress(),
57
+ ):
58
  if not pdf_files:
59
+ return None, None, "No PDF files uploaded."
60
 
61
  if oauth_token is None:
62
+ gr.Info("Please log in to upload to Hugging Face.")
63
+ return (
64
+ None,
65
+ None,
66
+ "Not logged in to Hugging Face, please log in to upload to a Hugging Face dataset.",
67
+ )
68
 
69
  try:
70
+ temp_dir = tempfile.mkdtemp()
71
+ images_dir = os.path.join(temp_dir, "images")
72
+ os.makedirs(images_dir)
73
+
74
+ images, message = pdf_to_images(pdf_files, sample_size, images_dir)
75
+
76
+ # Create a zip file of the images
77
+ zip_path = os.path.join(temp_dir, "converted_images.zip")
78
+ with zipfile.ZipFile(zip_path, "w") as zipf:
79
+ progress(0, desc="Zipping images")
80
+ for image in progress.tqdm(images, desc="Zipping images"):
81
+ zipf.write(image, os.path.basename(image))
82
+
83
+ if hf_repo:
84
+ try:
85
+ hf_api = HfApi(token=oauth_token.token)
86
+ hf_api.create_repo(
87
+ hf_repo,
88
+ repo_type="dataset",
89
+ )
90
+ hf_api.upload_folder(
91
+ folder_path=images_dir,
92
+ repo_id=hf_repo,
93
+ repo_type="dataset",
94
+ path_in_repo="images",
95
+ )
96
+ message += f"\nUploaded images to Hugging Face repo: {hf_repo}/images"
97
+ except Exception as e:
98
+ message += f"\nFailed to upload to Hugging Face: {str(e)}"
99
+
100
+ return images, zip_path, message
101
  except Exception as e:
102
+ if "temp_dir" in locals():
103
+ shutil.rmtree(temp_dir)
104
+ return None, None, f"An error occurred: {str(e)}"
105
 
106
 
107
  # Define the Gradio interface
 
115
  gr.LoginButton(size="sm")
116
 
117
  with gr.Row():
118
+ pdf_files = gr.File(
119
+ file_count="multiple", label="Upload PDF(s)", file_types=["*.pdf"]
120
+ )
121
+ with gr.Row():
122
+ sample_size = gr.Number(
123
+ value=None,
124
+ label="Number of sample pages (0 will return all pages)",
125
  )
126
  hf_repo = gr.Textbox(
127
  label="Hugging Face Repo", placeholder="username/repo-name"
128
  )
129
 
130
  output_gallery = gr.Gallery(label="Converted Images")
131
+ status_text = gr.Markdown(label="Status")
132
+ download_button = gr.File(label="Download Converted Images")
133
 
134
  submit_button = gr.Button("Process PDFs")
135
  submit_button.click(
136
  process_pdfs,
137
  inputs=[pdf_files, sample_size, hf_repo],
138
+ outputs=[output_gallery, download_button, status_text],
139
  )
140
 
141
  # Launch the app
requirements.in CHANGED
@@ -1,5 +1,6 @@
1
- gradio[oauth]==4.44.0
2
- PyPDF2
3
- pdf2image
4
  Pillow
5
- huggingface_hub
 
 
 
 
1
+
 
 
2
  Pillow
3
+ PyPDF2
4
+ gradio[oauth]==4.44.0
5
+ huggingface_hub
6
+ pdf2image