{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "1d6f0077-6e41-4b23-8f85-14dba8160036", "metadata": { "tags": [] }, "outputs": [], "source": [ "import fitz\n", "# from PIL import Image \n", "import os\n", "import json" ] }, { "cell_type": "code", "execution_count": 5, "id": "efea9ac6-7e23-49ca-91e2-f589c94a0f6d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C:\\Users\\user\\Desktop\\app_open\\app\n", "C:\\Users\\user\\Desktop\\app_open\\app\\LLaVA.pdf\n", "C:\\Users\\user\\Desktop\\app_open\\app\\Interior.pdf\n" ] } ], "source": [ "pwd = os.getcwd()\n", "source = os.path.join(pwd, 'app')\n", "print(source)\n", "file_1= os.path.join(source,'LLaVA.pdf')\n", "file_2= os.path.join(source,'Interior.pdf')\n", "print(file_1)\n", "print(file_2)" ] }, { "cell_type": "code", "execution_count": 32, "id": "3fd7293a-b835-4803-8e2f-1b546d75adc3", "metadata": {}, "outputs": [], "source": [ "#source_files : 현재 경로\n", "#file_ : 이미지 추출할 pdf 경로\n", "def load_pdf(source_files, file_, pdf_name):\n", "\t# open the file \n", "\tpdf_file = fitz.open(file_) \n", "\timage_counter = 0\n", "\tmetadata = {}\n", "\tfor page_index in range(0,len(pdf_file)): \n", "\t\t# get the page itself \n", "\t\tpage = pdf_file[page_index] \n", "\t\t# get block details from the page\n", "\t\tblocks = pdf_file[page_index].get_text(\"blocks\")\n", "\t\t# get image info and title details \n", "\t\timage_meta = [ (blocks[i][4], blocks[i+1][4]) for i in range(0,len(blocks)) if blocks[i][-1]==1 ] \t\n", "\t\timage_info= [ image_meta[0][0] if image_meta else []] \n", "\t\timage_title = [ image_meta[0][1] if image_meta else []] \n", "\t\t# prepare image meta data from the page\n", "\t\tfor image in page.get_images():\n", "\t\t\timage_id = image[7] # img\n", "\t\t\timage_block_id = image[0] # block number \n", "\t\t\timage_title_block_id = image_block_id+1 # image title block number\n", "\t\t\timage_dim = image[2],image[3] # image dimension details\n", "\t\t\t\n", "\t\t\tprint(f\"[+] Page:{page_index}, Image : {image_id}, Block:{image_block_id}, Image Dim:{image_dim}\")\n", "\t\t\timage_counter = image_counter+1\n", "\n", "\t\t\t# Update metadata dictionary with image information\n", "\t\t\tmetadata[image_counter] = {\n", "\t\t\t\t'page': page_index,\n", "\t\t\t\t'image': image_id,\n", "\t\t\t\t'block': image_block_id,\n", "\t\t\t\t'image_dim': image_dim,\n", "\t\t\t\t'image_info': str(image_info[0]),\n", "\t\t\t\t'image_title': str(image_title[0]),\n", "\t\t\t\t'image_file': f\"{image_id}_{image_block_id}.png\",\n", "\t\t\t\t'image_path': os.path.join(source_files, f\"{image_id}_{image_block_id}.png\")\n", "\t\t\t}\n", "\t\t# save the images to the local file system\n", "\t\t\tpix = fitz.Pixmap(pdf_file, image[0])\n", "\t\t\t# image file name contains image name 'img' and block number\n", " #pix.save(os.path.join(source_files, f\"{image_id}_{image_block_id}.png\"))\n", "\t\t\tpix.save(os.path.join(source_files+'/'+pdf_name, f\"{image_id}_{image_block_id}.png\"))\n", "\tprint(f\"Total Images: {image_counter}\")\n", "\t\n", "\twith open(os.path.join(source, f'metadata.json'),'w') as f:\n", "\t\tjson.dump(metadata,f)\n", "\treturn metadata" ] }, { "cell_type": "code", "execution_count": 33, "id": "256db277", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[+] Page:0, Image : Im0, Block:726, Image Dim:(663, 268)\n", "[+] Page:0, Image : Im1, Block:727, Image Dim:(600, 400)\n", "[+] Page:1, Image : Im0, Block:141, Image Dim:(660, 375)\n", "[+] Page:1, Image : Im0, Block:132, Image Dim:(660, 375)\n", "[+] Page:2, Image : Im0, Block:206, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:164, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:150, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:155, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:203, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:12, Image Dim:(596, 398)\n", "[+] Page:2, Image : Im0, Block:160, Image Dim:(596, 398)\n", "[+] Page:4, Image : Im0, Block:20, Image Dim:(1621, 1080)\n", "[+] Page:5, Image : Im0, Block:23, Image Dim:(1620, 1080)\n", "[+] Page:6, Image : Im0, Block:26, Image Dim:(1620, 1080)\n", "[+] Page:8, Image : Im0, Block:32, Image Dim:(600, 400)\n", "[+] Page:8, Image : Im1, Block:33, Image Dim:(1620, 1080)\n", "[+] Page:9, Image : Im0, Block:36, Image Dim:(1621, 1080)\n", "[+] Page:10, Image : Im0, Block:39, Image Dim:(1621, 1080)\n", "[+] Page:12, Image : Im0, Block:45, Image Dim:(1621, 1080)\n", "[+] Page:13, Image : Im0, Block:48, Image Dim:(1620, 1080)\n", "[+] Page:14, Image : Im0, Block:51, Image Dim:(1620, 1080)\n", "[+] Page:16, Image : Im0, Block:57, Image Dim:(1620, 1080)\n", "[+] Page:17, Image : Im0, Block:60, Image Dim:(1620, 1080)\n", "[+] Page:18, Image : Im0, Block:63, Image Dim:(1621, 1080)\n", "[+] Page:20, Image : Im0, Block:69, Image Dim:(1621, 1080)\n", "[+] Page:21, Image : Im0, Block:72, Image Dim:(1620, 1080)\n", "[+] Page:22, Image : Im0, Block:75, Image Dim:(1620, 1080)\n", "[+] Page:24, Image : Im0, Block:81, Image Dim:(1621, 1080)\n", "[+] Page:25, Image : Im0, Block:84, Image Dim:(1620, 1080)\n", "[+] Page:26, Image : Im0, Block:87, Image Dim:(1620, 1080)\n", "[+] Page:28, Image : Im0, Block:93, Image Dim:(1620, 1080)\n", "[+] Page:29, Image : Im0, Block:96, Image Dim:(1620, 1080)\n", "[+] Page:30, Image : Im0, Block:99, Image Dim:(1573, 1051)\n", "[+] Page:32, Image : Im0, Block:109, Image Dim:(161, 159)\n", "Total Images: 34\n", "[+] Page:2, Image : Im1, Block:216, Image Dim:(2474, 1547)\n", "[+] Page:5, Image : Im3, Block:354, Image Dim:(1657, 1112)\n", "[+] Page:7, Image : Im4, Block:390, Image Dim:(550, 550)\n", "[+] Page:7, Image : Im5, Block:391, Image Dim:(1432, 909)\n", "[+] Page:14, Image : Im6, Block:572, Image Dim:(863, 1030)\n", "[+] Page:15, Image : Image11, Block:596, Image Dim:(514, 514)\n", "[+] Page:15, Image : Image12, Block:597, Image Dim:(782, 446)\n", "[+] Page:15, Image : Image13, Block:598, Image Dim:(782, 446)\n", "[+] Page:15, Image : Image16, Block:599, Image Dim:(119, 132)\n", "[+] Page:15, Image : Image9, Block:595, Image Dim:(104, 104)\n", "[+] Page:16, Image : Image11, Block:617, Image Dim:(547, 400)\n", "[+] Page:16, Image : Image12, Block:618, Image Dim:(119, 132)\n", "[+] Page:16, Image : Image9, Block:616, Image Dim:(104, 104)\n", "[+] Page:16, Image : Image10, Block:628, Image Dim:(104, 104)\n", "[+] Page:16, Image : Image12, Block:629, Image Dim:(119, 132)\n", "[+] Page:16, Image : Image9, Block:627, Image Dim:(599, 400)\n", "[+] Page:17, Image : Image11, Block:644, Image Dim:(355, 227)\n", "[+] Page:17, Image : Image12, Block:645, Image Dim:(120, 132)\n", "[+] Page:17, Image : Image9, Block:643, Image Dim:(104, 104)\n", "[+] Page:17, Image : Image11, Block:656, Image Dim:(363, 315)\n", "[+] Page:17, Image : Image12, Block:657, Image Dim:(531, 400)\n", "[+] Page:17, Image : Image15, Block:658, Image Dim:(119, 132)\n", "[+] Page:17, Image : Image9, Block:655, Image Dim:(104, 104)\n", "[+] Page:18, Image : Im13, Block:580, Image Dim:(116, 177)\n", "[+] Page:18, Image : Image13, Block:679, Image Dim:(119, 132)\n", "[+] Page:18, Image : Image14, Block:680, Image Dim:(329, 329)\n", "[+] Page:18, Image : Image15, Block:681, Image Dim:(333, 327)\n", "[+] Page:18, Image : Image9, Block:678, Image Dim:(104, 104)\n", "[+] Page:22, Image : Im1, Block:216, Image Dim:(2474, 1547)\n", "Total Images: 29\n" ] } ], "source": [ "#source 경로 내 파일 리스트 생성\n", "file_list = os.listdir(source)\n", "\n", "#pdf 리스트 추출\n", "pdf_list=[]\n", "[pdf_list.append(x) for x in file_list if x[-3:]=='pdf']\n", "\n", "#pdf명 파일 생성\n", "for x in pdf_list:\n", " if x[-3:]=='pdf':\n", " if os.path.isdir(source+\"/\"+str(x[:-4])) == True :\n", " pass #파일이 존재하면 pass\n", " else:\n", " os.mkdir(source+\"/\"+str(x[:-4]))\n", "\n", "#pdf에서 이미지 추출해서 pdf 파일명의 폴더에 저장\n", "for i in pdf_list:\n", " load_pdf(source, source+\"/\"+i,str(i[:-4]))" ] }, { "cell_type": "code", "execution_count": null, "id": "0f5c21f0-2a9f-49cc-8699-65af38e58ee3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }