ztor2 committed on
Commit
d6aa4f8
1 Parent(s): 5ff4d7f

Upload extract_imgs.ipynb

Browse files
Files changed (1) hide show
  1. extract_imgs.ipynb +231 -0
extract_imgs.ipynb ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "1d6f0077-6e41-4b23-8f85-14dba8160036",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "import fitz\n",
13
+ "# from PIL import Image \n",
14
+ "import os\n",
15
+ "import json"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 5,
21
+ "id": "efea9ac6-7e23-49ca-91e2-f589c94a0f6d",
22
+ "metadata": {
23
+ "tags": []
24
+ },
25
+ "outputs": [
26
+ {
27
+ "name": "stdout",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "C:\\Users\\user\\Desktop\\app_open\\app\n",
31
+ "C:\\Users\\user\\Desktop\\app_open\\app\\LLaVA.pdf\n",
32
+ "C:\\Users\\user\\Desktop\\app_open\\app\\Interior.pdf\n"
33
+ ]
34
+ }
35
+ ],
36
+ "source": [
37
+ "pwd = os.getcwd()\n",
38
+ "source = os.path.join(pwd, 'app')\n",
39
+ "print(source)\n",
40
+ "file_1= os.path.join(source,'LLaVA.pdf')\n",
41
+ "file_2= os.path.join(source,'Interior.pdf')\n",
42
+ "print(file_1)\n",
43
+ "print(file_2)"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 32,
49
+ "id": "3fd7293a-b835-4803-8e2f-1b546d75adc3",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "#source_files : base directory that contains the per-PDF output folders\n",
54
+ "#file_ : 이미지 추출할 pdf 경로\n",
55
+ "def load_pdf(source_files, file_, pdf_name):\n",
56
+ "\t# open the file \n",
57
+ "\tpdf_file = fitz.open(file_) \n",
58
+ "\timage_counter = 0\n",
59
+ "\tmetadata = {}\n",
60
+ "\tfor page_index in range(0,len(pdf_file)): \n",
61
+ "\t\t# get the page itself \n",
62
+ "\t\tpage = pdf_file[page_index] \n",
63
+ "\t\t# get block details from the page\n",
64
+ "\t\tblocks = pdf_file[page_index].get_text(\"blocks\")\n",
65
+ "\t\t# get image info and title details \n",
66
+ "\t\timage_meta = [ (blocks[i][4], blocks[i+1][4]) for i in range(0,len(blocks)) if blocks[i][-1]==1 ] \t\n",
67
+ "\t\timage_info= [ image_meta[0][0] if image_meta else []] \n",
68
+ "\t\timage_title = [ image_meta[0][1] if image_meta else []] \n",
69
+ "\t\t# prepare image meta data from the page\n",
70
+ "\t\tfor image in page.get_images():\n",
71
+ "\t\t\timage_id = image[7] # img<no>\n",
72
+ "\t\t\timage_block_id = image[0] # PDF xref of the image (first field of a get_images() entry), not a text-block index\n",
73
+ "\t\t\timage_title_block_id = image_block_id+1 # image title block number\n",
74
+ "\t\t\timage_dim = image[2],image[3] # image dimension details\n",
75
+ "\t\t\t\n",
76
+ "\t\t\tprint(f\"[+] Page:{page_index}, Image : {image_id}, Block:{image_block_id}, Image Dim:{image_dim}\")\n",
77
+ "\t\t\timage_counter = image_counter+1\n",
78
+ "\n",
79
+ "\t\t\t# Update metadata dictionary with image information\n",
80
+ "\t\t\tmetadata[image_counter] = {\n",
81
+ "\t\t\t\t'page': page_index,\n",
82
+ "\t\t\t\t'image': image_id,\n",
83
+ "\t\t\t\t'block': image_block_id,\n",
84
+ "\t\t\t\t'image_dim': image_dim,\n",
85
+ "\t\t\t\t'image_info': str(image_info[0]),\n",
86
+ "\t\t\t\t'image_title': str(image_title[0]),\n",
87
+ "\t\t\t\t'image_file': f\"{image_id}_{image_block_id}.png\",\n",
88
+ "\t\t\t\t'image_path': os.path.join(source_files, f\"{image_id}_{image_block_id}.png\")\n",
89
+ "\t\t\t}\n",
90
+ "\t\t# save the images to the local file system\n",
91
+ "\t\t\tpix = fitz.Pixmap(pdf_file, image[0])\n",
92
+ "\t\t\t# image file name contains image name 'img<no>' and block number\n",
93
+ " #pix.save(os.path.join(source_files, f\"{image_id}_{image_block_id}.png\"))\n",
94
+ "\t\t\tpix.save(os.path.join(source_files+'/'+pdf_name, f\"{image_id}_{image_block_id}.png\"))\n",
95
+ "\tprint(f\"Total Images: {image_counter}\")\n",
96
+ "\t\n",
97
+ "\twith open(os.path.join(source, f'metadata.json'),'w') as f:\n",
98
+ "\t\tjson.dump(metadata,f)\n",
99
+ "\treturn metadata"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 33,
105
+ "id": "256db277",
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "[+] Page:0, Image : Im0, Block:726, Image Dim:(663, 268)\n",
113
+ "[+] Page:0, Image : Im1, Block:727, Image Dim:(600, 400)\n",
114
+ "[+] Page:1, Image : Im0, Block:141, Image Dim:(660, 375)\n",
115
+ "[+] Page:1, Image : Im0, Block:132, Image Dim:(660, 375)\n",
116
+ "[+] Page:2, Image : Im0, Block:206, Image Dim:(596, 398)\n",
117
+ "[+] Page:2, Image : Im0, Block:164, Image Dim:(596, 398)\n",
118
+ "[+] Page:2, Image : Im0, Block:150, Image Dim:(596, 398)\n",
119
+ "[+] Page:2, Image : Im0, Block:155, Image Dim:(596, 398)\n",
120
+ "[+] Page:2, Image : Im0, Block:203, Image Dim:(596, 398)\n",
121
+ "[+] Page:2, Image : Im0, Block:12, Image Dim:(596, 398)\n",
122
+ "[+] Page:2, Image : Im0, Block:160, Image Dim:(596, 398)\n",
123
+ "[+] Page:4, Image : Im0, Block:20, Image Dim:(1621, 1080)\n",
124
+ "[+] Page:5, Image : Im0, Block:23, Image Dim:(1620, 1080)\n",
125
+ "[+] Page:6, Image : Im0, Block:26, Image Dim:(1620, 1080)\n",
126
+ "[+] Page:8, Image : Im0, Block:32, Image Dim:(600, 400)\n",
127
+ "[+] Page:8, Image : Im1, Block:33, Image Dim:(1620, 1080)\n",
128
+ "[+] Page:9, Image : Im0, Block:36, Image Dim:(1621, 1080)\n",
129
+ "[+] Page:10, Image : Im0, Block:39, Image Dim:(1621, 1080)\n",
130
+ "[+] Page:12, Image : Im0, Block:45, Image Dim:(1621, 1080)\n",
131
+ "[+] Page:13, Image : Im0, Block:48, Image Dim:(1620, 1080)\n",
132
+ "[+] Page:14, Image : Im0, Block:51, Image Dim:(1620, 1080)\n",
133
+ "[+] Page:16, Image : Im0, Block:57, Image Dim:(1620, 1080)\n",
134
+ "[+] Page:17, Image : Im0, Block:60, Image Dim:(1620, 1080)\n",
135
+ "[+] Page:18, Image : Im0, Block:63, Image Dim:(1621, 1080)\n",
136
+ "[+] Page:20, Image : Im0, Block:69, Image Dim:(1621, 1080)\n",
137
+ "[+] Page:21, Image : Im0, Block:72, Image Dim:(1620, 1080)\n",
138
+ "[+] Page:22, Image : Im0, Block:75, Image Dim:(1620, 1080)\n",
139
+ "[+] Page:24, Image : Im0, Block:81, Image Dim:(1621, 1080)\n",
140
+ "[+] Page:25, Image : Im0, Block:84, Image Dim:(1620, 1080)\n",
141
+ "[+] Page:26, Image : Im0, Block:87, Image Dim:(1620, 1080)\n",
142
+ "[+] Page:28, Image : Im0, Block:93, Image Dim:(1620, 1080)\n",
143
+ "[+] Page:29, Image : Im0, Block:96, Image Dim:(1620, 1080)\n",
144
+ "[+] Page:30, Image : Im0, Block:99, Image Dim:(1573, 1051)\n",
145
+ "[+] Page:32, Image : Im0, Block:109, Image Dim:(161, 159)\n",
146
+ "Total Images: 34\n",
147
+ "[+] Page:2, Image : Im1, Block:216, Image Dim:(2474, 1547)\n",
148
+ "[+] Page:5, Image : Im3, Block:354, Image Dim:(1657, 1112)\n",
149
+ "[+] Page:7, Image : Im4, Block:390, Image Dim:(550, 550)\n",
150
+ "[+] Page:7, Image : Im5, Block:391, Image Dim:(1432, 909)\n",
151
+ "[+] Page:14, Image : Im6, Block:572, Image Dim:(863, 1030)\n",
152
+ "[+] Page:15, Image : Image11, Block:596, Image Dim:(514, 514)\n",
153
+ "[+] Page:15, Image : Image12, Block:597, Image Dim:(782, 446)\n",
154
+ "[+] Page:15, Image : Image13, Block:598, Image Dim:(782, 446)\n",
155
+ "[+] Page:15, Image : Image16, Block:599, Image Dim:(119, 132)\n",
156
+ "[+] Page:15, Image : Image9, Block:595, Image Dim:(104, 104)\n",
157
+ "[+] Page:16, Image : Image11, Block:617, Image Dim:(547, 400)\n",
158
+ "[+] Page:16, Image : Image12, Block:618, Image Dim:(119, 132)\n",
159
+ "[+] Page:16, Image : Image9, Block:616, Image Dim:(104, 104)\n",
160
+ "[+] Page:16, Image : Image10, Block:628, Image Dim:(104, 104)\n",
161
+ "[+] Page:16, Image : Image12, Block:629, Image Dim:(119, 132)\n",
162
+ "[+] Page:16, Image : Image9, Block:627, Image Dim:(599, 400)\n",
163
+ "[+] Page:17, Image : Image11, Block:644, Image Dim:(355, 227)\n",
164
+ "[+] Page:17, Image : Image12, Block:645, Image Dim:(120, 132)\n",
165
+ "[+] Page:17, Image : Image9, Block:643, Image Dim:(104, 104)\n",
166
+ "[+] Page:17, Image : Image11, Block:656, Image Dim:(363, 315)\n",
167
+ "[+] Page:17, Image : Image12, Block:657, Image Dim:(531, 400)\n",
168
+ "[+] Page:17, Image : Image15, Block:658, Image Dim:(119, 132)\n",
169
+ "[+] Page:17, Image : Image9, Block:655, Image Dim:(104, 104)\n",
170
+ "[+] Page:18, Image : Im13, Block:580, Image Dim:(116, 177)\n",
171
+ "[+] Page:18, Image : Image13, Block:679, Image Dim:(119, 132)\n",
172
+ "[+] Page:18, Image : Image14, Block:680, Image Dim:(329, 329)\n",
173
+ "[+] Page:18, Image : Image15, Block:681, Image Dim:(333, 327)\n",
174
+ "[+] Page:18, Image : Image9, Block:678, Image Dim:(104, 104)\n",
175
+ "[+] Page:22, Image : Im1, Block:216, Image Dim:(2474, 1547)\n",
176
+ "Total Images: 29\n"
177
+ ]
178
+ }
179
+ ],
180
+ "source": [
181
+ "#source 경로 내 파일 리스트 생성\n",
182
+ "file_list = os.listdir(source)\n",
183
+ "\n",
184
+ "#pdf 리스트 추출\n",
185
+ "pdf_list=[]\n",
186
+ "[pdf_list.append(x) for x in file_list if x[-3:]=='pdf']\n",
187
+ "\n",
188
+ "#create one output folder per PDF, named after the PDF (without extension)\n",
189
+ "for x in pdf_list:\n",
190
+ " if x[-3:]=='pdf':\n",
191
+ " if os.path.isdir(source+\"/\"+str(x[:-4])) == True :\n",
192
+ " pass #folder already exists, nothing to do\n",
193
+ " else:\n",
194
+ " os.mkdir(source+\"/\"+str(x[:-4]))\n",
195
+ "\n",
196
+ "#pdf에서 이미지 추출해서 pdf 파일명의 폴더에 저장\n",
197
+ "for i in pdf_list:\n",
198
+ " load_pdf(source, source+\"/\"+i,str(i[:-4]))"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "id": "0f5c21f0-2a9f-49cc-8699-65af38e58ee3",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": []
208
+ }
209
+ ],
210
+ "metadata": {
211
+ "kernelspec": {
212
+ "display_name": "Python 3 (ipykernel)",
213
+ "language": "python",
214
+ "name": "python3"
215
+ },
216
+ "language_info": {
217
+ "codemirror_mode": {
218
+ "name": "ipython",
219
+ "version": 3
220
+ },
221
+ "file_extension": ".py",
222
+ "mimetype": "text/x-python",
223
+ "name": "python",
224
+ "nbconvert_exporter": "python",
225
+ "pygments_lexer": "ipython3",
226
+ "version": "3.10.13"
227
+ }
228
+ },
229
+ "nbformat": 4,
230
+ "nbformat_minor": 5
231
+ }