Teboo committed on
Commit
a8696e1
·
verified ·
1 Parent(s): 127b2c3

Update pdfImage.py

Browse files
Files changed (1) hide show
  1. pdfImage.py +292 -292
pdfImage.py CHANGED
@@ -1,292 +1,292 @@
1
- import fitz
2
- from PIL import Image
3
- from utils import *
4
- from whoosh.analysis import Tokenizer, Token
5
- import jieba
6
- from whoosh.index import create_in
7
- from whoosh.fields import *
8
- from whoosh.qparser import QueryParser
9
- import os
10
- import shutil
11
- # import tempfile
12
-
13
- LOGO_WIDTH = 398
14
- LOGO_HEIGHT = 137
15
-
16
- ix = None
17
- writer = None
18
-
19
-
20
- class ChineseTokenizer(Tokenizer):
21
- def __call__(self, value, positions=False, chars=False,
22
- keeporiginal=False, removestops=True,
23
- start_pos=0, start_char=0, mode='', **kwargs):
24
- t = Token(positions, chars, removestops=removestops, mode=mode,
25
- **kwargs)
26
- seglist = jieba.cut(value, cut_all=True)
27
- for w in seglist:
28
- t.original = t.text = w
29
- t.boost = 1.0
30
- if positions:
31
- t.pos = start_pos + value.find(w)
32
- if chars:
33
- t.startchar = start_char + value.find(w)
34
- if chars and positions:
35
- t.endchar = start_char + value.find(w) + len(w)
36
- yield t
37
-
38
-
39
- def ChineseAnalyzer():
40
- return ChineseTokenizer()
41
-
42
-
43
- def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
44
- """
45
- Load pdf file, covert to image, description and index it
46
- :param lang:
47
- :param skip_block:
48
- :param skip_page_back:
49
- :param skip_page_front:
50
- :param dpi:
51
- :param file:
52
- :return:
53
- """
54
-
55
- if file.__contains__('\\gradio\\'):
56
- print('gradio file')
57
- doc = fitz.open(file)
58
- else:
59
- print('local file')
60
- doc = fitz.open('using_pdfs/' + file)
61
-
62
- # load pages
63
- pages = []
64
- for i in range(doc.page_count):
65
- page = doc.load_page(i)
66
- pages.append(page)
67
-
68
- # increase dpi to 300
69
- dpi = int(dpi)
70
- scale = dpi / 72 # default dpi of pdf is 72
71
- matrix = fitz.Matrix(scale, scale)
72
- skip_block = int(skip_block)
73
-
74
- base_name = os.path.basename(file).split('.')[0]
75
- path_name = f'images/{base_name}'
76
- if os.path.exists(path_name):
77
- shutil.rmtree(path_name)
78
- os.mkdir(path_name)
79
-
80
- temp_image_dir = path_name
81
- # temp_image_dir = tempfile.mkdtemp(prefix='images_')
82
-
83
- for page in pages[int(skip_page_front):-int(skip_page_back)]: # skip final page
84
-
85
- # part1: get image with description in png-pdf
86
- p1dict = page.get_text('dict')
87
- blocks = p1dict['blocks']
88
- page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
89
- page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)
90
-
91
- saved = [] # need to remove if inner a svg image
92
- for i, block in enumerate(blocks[int(skip_block):]): # head and tail of pages should be ignore
93
- if 'image' in block:
94
- # try:
95
- bbox = block['bbox']
96
- # skip image that width=398 and hight=137 -> Typically LOGO
97
- if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
98
- continue
99
- # Scale the bbox coordinates
100
- cropped = page_im.crop([int(i * scale) for i in bbox])
101
- number = block['number']
102
-
103
- file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
104
- image_name = file_name + '.png'
105
- # print(image_name)
106
- cropped.save(image_name)
107
- # # Handle text extraction around the image
108
- text_content = get_text_around_image(blocks[skip_block:], i, lang)
109
- title = get_title_of_image(blocks[skip_block:], i, lang)
110
- # print(text_content[:30])
111
- # print(title)
112
- with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
113
- text_file.write(title + '\n' + text_content.replace('\n', ' ')+ f'\nbase name:{base_name}')
114
-
115
- saved.append((file_name, [int(i * scale) for i in bbox]))
116
- # except:
117
- # pass
118
-
119
- # part2: get image with description in svg-pdf
120
- svg = page.get_svg_image(matrix=fitz.Identity)
121
- image_clips, svg_blocks = parse_page_svg(svg, page.number)
122
- for clip in image_clips:
123
-
124
- transform = []
125
- for item in clip[0]:
126
- # print(item, type(item))
127
- if item[0] == '.':
128
- transform.append(float('0' + item))
129
- elif item[0] == '-':
130
- transform.append(float('-0' + item[1:]))
131
- else:
132
- transform.append(float(item))
133
- d = clip[1]
134
- page_id = clip[2]
135
- block_id = clip[3]
136
- matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
137
- float_values = [float(value) for value in matches[0]]
138
- box_width = float_values[0]
139
- box_height = float_values[1]
140
- width_scale = transform[0]
141
- height_scale = transform[3]
142
- width_move = transform[4]
143
- height_move = transform[5]
144
- x1 = width_move * scale
145
- y1 = height_move * scale
146
- # x1=347*scale
147
- # y1=587*scale
148
- x2 = x1 + box_width * width_scale * scale
149
- y2 = y1 + box_height * height_scale * scale
150
-
151
- if y1 > y2:
152
- y1, y2 = y2, y1
153
-
154
- # print(x1, y1, x2, y2)
155
- # 3. 截取并保存图像
156
-
157
- # check images in saved, if in or similar, delete it from file system
158
- for i, (file_name, bbox) in enumerate(saved):
159
- if (abs(bbox[0] - x1) < 10\
160
- and abs(bbox[1] - y1) < 10\
161
- and abs(bbox[2] - x2) < 10\
162
- and abs(bbox[3] - y2) < 10) or \
163
- (bbox[0]>x1-10 and bbox[1]>y1-10 and bbox[2]<x2+10 and bbox[3]<y2+10):
164
- os.remove(file_name + '.png')
165
- os.remove(file_name + '.txt')
166
- saved.pop(i)
167
- break
168
-
169
- cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
170
- file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
171
- image_name = file_name + '.png'
172
- cropped_img.save(image_name)
173
-
174
- # search title and text
175
- text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
176
- title = get_svg_title_around_image(svg_blocks, block_id, lang)
177
- with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
178
- text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')
179
-
180
- print(temp_image_dir)
181
- return temp_image_dir
182
-
183
-
184
- def build_index(file, tmp_dir, lang='CN'):
185
- # Define the schema for the index
186
- if lang == 'CN':
187
- schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
188
- else:
189
- schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))
190
-
191
- base_name = os.path.basename(file).split('.')[0]
192
- path_name = f'{base_name}'
193
- # index_path = 'indexes/' + path_name + '_index_dir'
194
- index_path = 'indexes/'
195
- # Create an index in a directory
196
- # if os.path.exists(index_path):
197
- # shutil.rmtree(index_path)
198
- # os.mkdir(index_path)
199
- temp_index_dir = index_path
200
- # temp_index_dir = tempfile.mkdtemp(prefix='index_')
201
-
202
- global ix
203
- if ix is None:
204
- ix = create_in(temp_index_dir, schema)
205
- global writer
206
- if writer is None:
207
- writer = ix.writer()
208
-
209
- # Add documents to the index
210
- # base_name = os.path.basename(file).split('.')[0]
211
- # image_path = f'images{base_name}'
212
- # writer = ix.writer()
213
- for file in os.listdir(tmp_dir):
214
- if file.endswith('.txt'):
215
- file_path = os.path.join(tmp_dir, file)
216
- with open(file_path, 'r', encoding='utf-8') as f:
217
- content = f.read()
218
- writer.add_document(file_name=file[:-4], content=content)
219
- print('==========')
220
- print(content)
221
- print("==========")
222
-
223
- writer.commit()
224
- return ix, temp_index_dir
225
-
226
-
227
- def search(ix, query, lang='CN', k=10):
228
-
229
- # Tokenize the query string and join tokens with OR operator
230
- if lang == 'CN':
231
- query_tokens = jieba.cut(query, cut_all=True)
232
- else:
233
- query_tokens = query.split()
234
- or_query = " OR ".join(query_tokens)
235
-
236
- parser = QueryParser("content", ix.schema)
237
- myquery = parser.parse(or_query)
238
-
239
- with ix.searcher() as searcher:
240
- results = searcher.search(myquery, limit=k)
241
-
242
- # Extract and return the file names and descriptions of the top-k hits
243
- results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]
244
-
245
- return results_list
246
-
247
-
248
- def return_image(file, results_list, tmp_dir):
249
- # base_name = os.path.basename(file).split('.')[0]
250
- # path_name = f'images{base_name}'
251
- titles = []
252
- images = []
253
- for result in results_list:
254
- title = result[1].split('\n')[0].split(':')[-1]
255
- titles.append(title)
256
- images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
257
- return titles[0], images[0]
258
-
259
-
260
- # file = 'CA-IS372x-datasheet_cn.pdf'
261
- # file = 'CA-IS3086 datasheet_cn.pdf'
262
- # temp_image_dir = load_pdf(file, lang='CN')
263
- # ix, temp_index_dir = build_index(file, temp_image_dir)
264
- # results_list = search(ix, "波形", lang='CN', k=10)
265
- # ret_img = return_image(file, results_list, temp_image_dir)
266
- # print('title: ' + ret_img[0])
267
- # ret_img[1].show()
268
-
269
- # print(os.listdir('using_pdfs'))
270
-
271
- # import tqdm
272
- # for file in tqdm.tqdm(os.listdir('using_pdfs')):
273
- # tmd_dir = load_pdf(file)
274
- # ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
275
- # #
276
- # writer.commit()
277
-
278
- # from whoosh.index import open_dir
279
- # search_ix = open_dir('indexes')
280
- # query = "IF-428x接收端阈值"
281
- # results = search(search_ix, query, lang='CN', k=10)
282
- # for result in results:
283
- # print(result)
284
- #
285
- # from PIL import Image
286
- #
287
- # for result in results:
288
- # image_name = result[0]
289
- # base_name = image_name.split('_img')[0]
290
- # img = Image.open('images/' + base_name + '/' + image_name + '.png')
291
- # image_title = result[1].split('\n')[0].split(':')[1]
292
- # img.show(title=image_title)
 
1
+ import fitz
2
+ from PIL import Image
3
+ from utils import *
4
+ from whoosh.analysis import Tokenizer, Token
5
+ import jieba
6
+ from whoosh.index import create_in
7
+ from whoosh.fields import *
8
+ from whoosh.qparser import QueryParser
9
+ import os
10
+ import shutil
11
+ # import tempfile
12
+
13
+ LOGO_WIDTH = 398
14
+ LOGO_HEIGHT = 137
15
+
16
+ ix = None
17
+ writer = None
18
+
19
+
20
+ class ChineseTokenizer(Tokenizer):
21
+ def __call__(self, value, positions=False, chars=False,
22
+ keeporiginal=False, removestops=True,
23
+ start_pos=0, start_char=0, mode='', **kwargs):
24
+ t = Token(positions, chars, removestops=removestops, mode=mode,
25
+ **kwargs)
26
+ seglist = jieba.cut(value, cut_all=True)
27
+ for w in seglist:
28
+ t.original = t.text = w
29
+ t.boost = 1.0
30
+ if positions:
31
+ t.pos = start_pos + value.find(w)
32
+ if chars:
33
+ t.startchar = start_char + value.find(w)
34
+ if chars and positions:
35
+ t.endchar = start_char + value.find(w) + len(w)
36
+ yield t
37
+
38
+
39
def ChineseAnalyzer():
    """Factory returning a jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
41
+
42
+
43
def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
    """
    Load a pdf, render each kept page to an image, crop out the embedded
    pictures (raster blocks and svg-drawn clips) and write a ``.txt``
    description side-car next to every cropped ``.png``.

    :param file: pdf name; opened directly when the path contains a gradio
        upload directory, otherwise resolved under ``using_pdfs/``
    :param dpi: render resolution (pdf native resolution is 72 dpi)
    :param skip_page_front: number of leading pages to skip
    :param skip_page_back: number of trailing pages to skip
    :param skip_block: number of leading text blocks per page to ignore
    :param lang: language hint forwarded to the utils text-extraction helpers
    :return: directory that now holds the cropped images and descriptions
    """

    # gradio uploads arrive under a temp ".../gradio/..." path; local files
    # are resolved relative to the using_pdfs/ folder
    if file.__contains__('\\gradio\\') or file.__contains__('/gradio/'):
        print('gradio file')
        doc = fitz.open(file)
    else:
        print('local file')
        doc = fitz.open('using_pdfs/' + file)

    # load pages
    pages = []
    for i in range(doc.page_count):
        page = doc.load_page(i)
        pages.append(page)

    # increase dpi to 300
    dpi = int(dpi)
    scale = dpi / 72  # default dpi of pdf is 72
    matrix = fitz.Matrix(scale, scale)
    skip_block = int(skip_block)

    # one output folder per pdf, recreated from scratch on every run
    base_name = os.path.basename(file).split('.')[0]
    path_name = f'images/{base_name}'
    if os.path.exists(path_name):
        shutil.rmtree(path_name)
    os.mkdir(path_name)

    temp_image_dir = path_name
    # temp_image_dir = tempfile.mkdtemp(prefix='images_')

    for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip front/back pages

        # part1: get image with description in png-pdf (raster image blocks)
        p1dict = page.get_text('dict')
        blocks = p1dict['blocks']
        page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
        page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)

        saved = []  # raster crops kept so far; removed later if inside an svg clip
        for i, block in enumerate(blocks[int(skip_block):]):  # head blocks of pages are ignored
            if 'image' in block:
                bbox = block['bbox']
                # skip image of width=398 and height=137 -> typically the LOGO
                # NOTE(review): "<= 10" also skips any image SMALLER than the
                # logo in both dimensions — confirm that is intended
                if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
                    continue
                # scale the pdf-space bbox to rendered-pixel coordinates
                cropped = page_im.crop([int(i * scale) for i in bbox])
                number = block['number']

                file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
                image_name = file_name + '.png'
                cropped.save(image_name)
                # text extraction around the image (helpers come from utils)
                text_content = get_text_around_image(blocks[skip_block:], i, lang)
                title = get_title_of_image(blocks[skip_block:], i, lang)
                # side-car description: title, flattened body text, source pdf
                with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                    text_file.write(title + '\n' + text_content.replace('\n', ' ')+ f'\nbase name:{base_name}')

                saved.append((file_name, [int(i * scale) for i in bbox]))

        # part2: get image with description in svg-pdf (vector-drawn figures)
        svg = page.get_svg_image(matrix=fitz.Identity)
        # parse_page_svg comes from utils; presumably returns clip specs and
        # text blocks extracted from the svg — TODO confirm its contract
        image_clips, svg_blocks = parse_page_svg(svg, page.number)
        for clip in image_clips:

            # clip[0] holds the 6 svg matrix() entries as strings; normalise
            # shorthand forms like ".5" / "-.5" before float()
            transform = []
            for item in clip[0]:
                if item[0] == '.':
                    transform.append(float('0' + item))
                elif item[0] == '-':
                    transform.append(float('-0' + item[1:]))
                else:
                    transform.append(float(item))
            d = clip[1]
            page_id = clip[2]
            block_id = clip[3]
            # clip rectangle size is encoded in the path as H<width>V<height>
            matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
            float_values = [float(value) for value in matches[0]]
            box_width = float_values[0]
            box_height = float_values[1]
            width_scale = transform[0]
            height_scale = transform[3]
            width_move = transform[4]
            height_move = transform[5]
            # map svg clip to rendered-pixel coordinates
            x1 = width_move * scale
            y1 = height_move * scale
            # x1=347*scale
            # y1=587*scale
            x2 = x1 + box_width * width_scale * scale
            y2 = y1 + box_height * height_scale * scale

            # a negative height_scale flips the box; normalise the y order
            if y1 > y2:
                y1, y2 = y2, y1

            # 3. crop and save the image

            # drop any raster crop that is (nearly) the same box as, or is
            # contained inside, this svg clip — avoids duplicates
            for i, (file_name, bbox) in enumerate(saved):
                if (abs(bbox[0] - x1) < 10\
                    and abs(bbox[1] - y1) < 10\
                    and abs(bbox[2] - x2) < 10\
                    and abs(bbox[3] - y2) < 10) or \
                        (bbox[0]>x1-10 and bbox[1]>y1-10 and bbox[2]<x2+10 and bbox[3]<y2+10):
                    os.remove(file_name + '.png')
                    os.remove(file_name + '.txt')
                    saved.pop(i)
                    break

            cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
            file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
            image_name = file_name + '.png'
            cropped_img.save(image_name)

            # search title and text around the svg figure
            text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
            title = get_svg_title_around_image(svg_blocks, block_id, lang)
            with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

    print(temp_image_dir)
    return temp_image_dir
182
+
183
+
184
def build_index(file, tmp_dir, lang='CN'):
    """
    Add every ``.txt`` description in *tmp_dir* to the shared whoosh index.

    Uses the module-level ``ix``/``writer`` globals so multiple pdfs can be
    indexed into the same ``indexes/`` directory across calls.

    :param file: pdf path/name (kept for interface compatibility; the index
        location is fixed to ``indexes/``)
    :param tmp_dir: directory holding the ``.txt`` side-cars from load_pdf
    :param lang: 'CN' uses the jieba analyzer, anything else whoosh's default
    :return: (ix, index_dir) — the shared index and its directory
    """
    # Define the schema for the index
    if lang == 'CN':
        schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
    else:
        schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))

    index_path = 'indexes/'
    # create_in requires an existing directory
    os.makedirs(index_path, exist_ok=True)
    temp_index_dir = index_path

    global ix, writer
    if ix is None:
        ix = create_in(temp_index_dir, schema)
    if writer is None:
        writer = ix.writer()

    # Add one document per description file (renamed from `file` to avoid
    # shadowing the parameter)
    for txt_name in os.listdir(tmp_dir):
        if txt_name.endswith('.txt'):
            file_path = os.path.join(tmp_dir, txt_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            writer.add_document(file_name=txt_name[:-4], content=content)
            print('==========')
            print(content)
            print("==========")

    writer.commit()
    # a committed whoosh writer cannot be reused — drop it so the next call
    # opens a fresh one instead of failing on a closed writer
    writer = None
    return ix, temp_index_dir
225
+
226
+
227
def search(ix, query, lang='CN', k=10):
    """
    Query the index with an OR-combination of the query's tokens.

    :param ix: a whoosh index object
    :param query: query string; jieba-segmented when lang == 'CN'
    :param lang: 'CN' for jieba full-mode segmentation, else whitespace split
    :param k: maximum number of hits to return
    :return: list of (file_name, content, score) tuples, best first
    """
    # tokenize the query and join the tokens with the OR operator
    if lang == 'CN':
        tokens = jieba.cut(query, cut_all=True)
    else:
        tokens = query.split()
    or_query = " OR ".join(tokens)

    parsed = QueryParser("content", ix.schema).parse(or_query)

    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=k)
        # materialise inside the context: hit fields are read while the
        # searcher is still open
        return [(hit['file_name'], hit['content'], hit.score) for hit in hits]
246
+
247
+
248
def return_image(file, results_list, tmp_dir):
    """
    Return (title, PIL.Image) for the best (first) search hit.

    :param file: unused; kept for backward compatibility with existing callers
    :param results_list: hits from search() — (file_name, content, score)
    :param tmp_dir: directory holding the cropped ``.png`` files
    :return: tuple of the first hit's title and its opened image
    :raises IndexError: if results_list is empty

    Previously every hit's image was opened even though only the first was
    returned — wasted I/O and a spurious failure if a later file was missing.
    """
    best = results_list[0]
    # first line of the stored content is "<label>:<title>"
    title = best[1].split('\n')[0].split(':')[-1]
    image = Image.open(tmp_dir + '/' + best[0] + '.png')
    return title, image
258
+
259
+
260
+ # file = 'CA-IS372x-datasheet_cn.pdf'
261
+ # file = 'CA-IS3086 datasheet_cn.pdf'
262
+ # temp_image_dir = load_pdf(file, lang='CN')
263
+ # ix, temp_index_dir = build_index(file, temp_image_dir)
264
+ # results_list = search(ix, "波形", lang='CN', k=10)
265
+ # ret_img = return_image(file, results_list, temp_image_dir)
266
+ # print('title: ' + ret_img[0])
267
+ # ret_img[1].show()
268
+
269
+ # print(os.listdir('using_pdfs'))
270
+
271
+ # import tqdm
272
+ # for file in tqdm.tqdm(os.listdir('using_pdfs')):
273
+ # tmd_dir = load_pdf(file)
274
+ # ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
275
+ # #
276
+ # writer.commit()
277
+
278
+ # from whoosh.index import open_dir
279
+ # search_ix = open_dir('indexes')
280
+ # query = "IF-428x接收端阈值"
281
+ # results = search(search_ix, query, lang='CN', k=10)
282
+ # for result in results:
283
+ # print(result)
284
+ #
285
+ # from PIL import Image
286
+ #
287
+ # for result in results:
288
+ # image_name = result[0]
289
+ # base_name = image_name.split('_img')[0]
290
+ # img = Image.open('images/' + base_name + '/' + image_name + '.png')
291
+ # image_title = result[1].split('\n')[0].split(':')[1]
292
+ # img.show(title=image_title)