Teboo committed on
Commit
127b2c3
1 Parent(s): 19678e6
Files changed (8)
  1. README.md +13 -13
  2. app.py +54 -54
  3. images/placeholder +0 -0
  4. img_search.py +68 -0
  5. indexes/placeholder +0 -0
  6. pdfImage.py +292 -252
  7. requirements.txt +0 -0
  8. utils.py +263 -261
README.md CHANGED
@@ -1,13 +1,13 @@
- ---
- title: PdfImgSearcher
- emoji: 😻
- colorFrom: yellow
- colorTo: green
- sdk: gradio
- sdk_version: 3.41.2
- app_file: app.py
- pinned: false
- license: other
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: PdfImgSearcher
+ emoji: 😻
+ colorFrom: yellow
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.41.2
+ app_file: app.py
+ pinned: false
+ license: other
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,54 +1,54 @@
- import gradio as gr
- from pdfImage import *
-
-
- done = False
- engine = None
- tmp_dir = None
-
-
- def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
-     global done, engine, tmp_dir
-     if not done:
-         # Load the PDF, convert it to images, extract descriptions, and build the index
-         tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
-         ix, _ = build_index(file.name, tmp_dir, lang)
-         engine = ix
-         done = True
-     results_list = search(engine, query, lang)
-     return return_image(file.name, results_list, tmp_dir)
-
-     # Ensure that the image save directory and index directory are deleted
-     # base_name = os.path.basename(file).split('.')[0]
-     # path_name = f'images{base_name}'
-     # index_path = f'{base_name}_index_dir'
-     # if os.path.exists(path_name):
-     #     shutil.rmtree(path_name)
-     # if os.path.exists(index_path):
-     #     shutil.rmtree(index_path)
-     # return titles, images
-
-
- def display_images(*images):
-     return images
-
-
- iface = gr.Interface(
-     fn=main_interface,
-     inputs=[
-         gr.inputs.File(label="Upload PDF"),
-         gr.inputs.Number(default=300, label="DPI"),
-         gr.inputs.Number(default=0, label="Skip Front Page"),
-         gr.inputs.Number(default=1, label="Skip Back Page"),
-         gr.inputs.Number(default=5, label="Skip Block"),
-         gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
-         gr.inputs.Textbox(label="Search Query")
-     ],
-     outputs=[
-         gr.outputs.Textbox(label="Title"),
-         gr.outputs.Image(type="pil", label="Image")
-     ],
-     live=False
- )
-
- iface.launch()
+ import gradio as gr
+ from pdfImage import *
+
+
+ done = False
+ engine = None
+ tmp_dir = None
+
+
+ def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
+     global done, engine, tmp_dir
+     if not done:
+         # Load the PDF, convert it to images, extract descriptions, and build the index
+         tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
+         ix, _ = build_index(file.name, tmp_dir, lang)
+         engine = ix
+         done = True
+     results_list = search(engine, query, lang)
+     return return_image(file.name, results_list, tmp_dir)
+
+     # Ensure that the image save directory and index directory are deleted
+     # base_name = os.path.basename(file).split('.')[0]
+     # path_name = f'images{base_name}'
+     # index_path = f'{base_name}_index_dir'
+     # if os.path.exists(path_name):
+     #     shutil.rmtree(path_name)
+     # if os.path.exists(index_path):
+     #     shutil.rmtree(index_path)
+     # return titles, images
+
+
+ def display_images(*images):
+     return images
+
+
+ iface = gr.Interface(
+     fn=main_interface,
+     inputs=[
+         gr.inputs.File(label="Upload PDF"),
+         gr.inputs.Number(default=300, label="DPI"),
+         gr.inputs.Number(default=0, label="Skip Front Page"),
+         gr.inputs.Number(default=1, label="Skip Back Page"),
+         gr.inputs.Number(default=5, label="Skip Block"),
+         gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
+         gr.inputs.Textbox(label="Search Query")
+     ],
+     outputs=[
+         gr.outputs.Textbox(label="Title"),
+         gr.outputs.Image(type="pil", label="Image")
+     ],
+     live=False
+ )
+
+ iface.launch()
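Note: the `gr.inputs.*` and `gr.outputs.*` namespaces used above are legacy aliases that are deprecated in Gradio 3.x (the README pins sdk_version 3.41.2). A minimal sketch of the same interface with the current component API, untested against this Space:

import gradio as gr

iface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Number(value=300, label="DPI"),        # value= replaces default=
        gr.Number(value=0, label="Skip Front Page"),
        gr.Number(value=1, label="Skip Back Page"),
        gr.Number(value=5, label="Skip Block"),
        gr.Dropdown(choices=["CN", "EN"], value="CN", label="Language"),
        gr.Textbox(label="Search Query"),
    ],
    outputs=[
        gr.Textbox(label="Title"),
        gr.Image(type="pil", label="Image"),
    ],
    live=False,
)
iface.launch()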
images/placeholder ADDED
File without changes
img_search.py ADDED
@@ -0,0 +1,68 @@
+ import jieba
+ from whoosh.qparser import QueryParser
+ from PIL import Image
+ from whoosh.index import open_dir
+ from whoosh.analysis import Tokenizer, Token
+
+
+ class ChineseTokenizer(Tokenizer):
+     def __call__(self, value, positions=False, chars=False,
+                  keeporiginal=False, removestops=True,
+                  start_pos=0, start_char=0, mode='', **kwargs):
+         t = Token(positions, chars, removestops=removestops, mode=mode,
+                   **kwargs)
+         seglist = jieba.cut(value, cut_all=True)
+         for w in seglist:
+             t.original = t.text = w
+             t.boost = 1.0
+             if positions:
+                 t.pos = start_pos + value.find(w)
+             if chars:
+                 t.startchar = start_char + value.find(w)
+             if chars and positions:
+                 t.endchar = start_char + value.find(w) + len(w)
+             yield t
+
+
+ def ChineseAnalyzer():
+     return ChineseTokenizer()
+
+
+ def search(query, lang='CN', k=10):
+
+     ix = open_dir('indexes')
+     # Tokenize the query string and join the tokens with the OR operator
+     if lang == 'CN':
+         query_tokens = jieba.cut(query, cut_all=True)
+     else:
+         query_tokens = query.split()
+     or_query = " OR ".join(query_tokens)
+
+     parser = QueryParser("content", ix.schema)
+     myquery = parser.parse(or_query)
+
+     with ix.searcher() as searcher:
+         results = searcher.search(myquery, limit=k)
+
+         # Extract and return the file names and descriptions of the top-k hits
+         results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]
+
+     images = []
+     for result in results_list:
+         print(result)
+         image_name = result[0]
+         base_name = image_name.split('_img')[0]
+         image_full_path = 'images/' + base_name + '/' + image_name + '.png'  # build the path to the image file
+         img = Image.open(image_full_path)
+         image_title = result[1].split('\n')[-1].split(':')[1]
+         # img.show(title=image_title)
+         images.append((img, image_title, result[2]))
+
+     return images
+
+
+ jieba.cut("")  # preloads the jieba segmentation dictionary; best run once ahead of time
+ # results = search("IF-428x接收端阈值")
+ results = search("简化结构图")
+ for result in results:
+     print(result[1], result[2])  # result[0] is the image's PIL.Image object, result[1] is the title, result[2] is the relevance score
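For context, `open_dir('indexes')` above assumes the 'indexes' directory already holds an index written with the same schema as `build_index` in pdfImage.py. A minimal sketch of that assumption, reusing the `ChineseAnalyzer` defined in this file:

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in

# file_name stores the image stem; content stores the title line,
# the surrounding text, and the trailing "base name:..." line.
schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
ix = create_in('indexes', schema)  # the 'indexes' directory must already exist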
indexes/placeholder ADDED
File without changes
pdfImage.py CHANGED
@@ -1,252 +1,292 @@
- import fitz
- from PIL import Image
- from utils import *
- from whoosh.analysis import Tokenizer, Token
- import jieba
- from whoosh.index import create_in
- from whoosh.fields import *
- from whoosh.qparser import QueryParser
- import os
- import shutil
- import tempfile
-
- LOGO_WIDTH = 398
- LOGO_HEIGHT = 137
-
-
- class ChineseTokenizer(Tokenizer):
-     def __call__(self, value, positions=False, chars=False,
-                  keeporiginal=False, removestops=True,
-                  start_pos=0, start_char=0, mode='', **kwargs):
-         t = Token(positions, chars, removestops=removestops, mode=mode,
-                   **kwargs)
-         seglist = jieba.cut(value, cut_all=True)
-         for w in seglist:
-             t.original = t.text = w
-             t.boost = 1.0
-             if positions:
-                 t.pos = start_pos + value.find(w)
-             if chars:
-                 t.startchar = start_char + value.find(w)
-             if chars and positions:
-                 t.endchar = start_char + value.find(w) + len(w)
-             yield t
-
-
- def ChineseAnalyzer():
-     return ChineseTokenizer()
-
-
- def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
-     """
-     Load a pdf file, convert it to images, extract descriptions, and index them
-     :param lang:
-     :param skip_block:
-     :param skip_page_back:
-     :param skip_page_front:
-     :param dpi:
-     :param file:
-     :return:
-     """
-
-     doc = fitz.open(file)
-
-     # load pages
-     pages = []
-     for i in range(doc.page_count):
-         page = doc.load_page(i)
-         pages.append(page)
-
-     # increase dpi to 300
-     dpi = int(dpi)
-     scale = dpi / 72  # the default dpi of a pdf is 72
-     matrix = fitz.Matrix(scale, scale)
-     skip_block = int(skip_block)
-
-     # base_name = os.path.basename(file).split('.')[0]
-     # path_name = f'images{base_name}'
-     # if os.path.exists(path_name):
-     #     shutil.rmtree(path_name)
-     # os.mkdir(path_name)
-     #
-     # temp_image_dir = path_name
-     temp_image_dir = tempfile.mkdtemp(prefix='images_')
-
-     for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip the final page(s)
-
-         # part1: get images with descriptions in a png-based pdf
-         p1dict = page.get_text('dict')
-         blocks = p1dict['blocks']
-         page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
-         page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)
-
-         saved = []  # entries must be removed if they fall inside an svg image
-         for i, block in enumerate(blocks[int(skip_block):]):  # the head and tail of each page should be ignored
-             if 'image' in block:
-                 # try:
-                 bbox = block['bbox']
-                 # skip images of width=398 and height=137 -> typically the logo
-                 if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
-                     continue
-                 # Scale the bbox coordinates
-                 cropped = page_im.crop([int(i * scale) for i in bbox])
-                 number = block['number']
-
-                 file_name = temp_image_dir + f'/image_{page.number}_{number}'
-                 image_name = file_name + '.png'
-                 # print(image_name)
-                 cropped.save(image_name)
-                 # # Handle text extraction around the image
-                 text_content = get_text_around_image(blocks[skip_block:], i, lang)
-                 title = get_title_of_image(blocks[skip_block:], i, lang)
-                 # print(text_content[:30])
-                 # print(title)
-                 with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
-                     text_file.write(title + '\n' + text_content.replace('\n', ' '))
-
-                 saved.append((file_name, [int(i * scale) for i in bbox]))
-                 # except:
-                 #     pass
-
-         # part2: get images with descriptions in an svg-based pdf
-         svg = page.get_svg_image(matrix=fitz.Identity)
-         image_clips, svg_blocks = parse_page_svg(svg, page.number)
-         for clip in image_clips:
-
-             transform = []
-             for item in clip[0]:
-                 # print(item, type(item))
-                 if item[0] == '.':
-                     transform.append(float('0' + item))
-                 elif item[0] == '-':
-                     transform.append(float('-0' + item[1:]))
-                 else:
-                     transform.append(float(item))
-             d = clip[1]
-             page_id = clip[2]
-             block_id = clip[3]
-             matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
-             float_values = [float(value) for value in matches[0]]
-             box_width = float_values[0]
-             box_height = float_values[1]
-             width_scale = transform[0]
-             height_scale = transform[3]
-             width_move = transform[4]
-             height_move = transform[5]
-             x1 = width_move * scale
-             y1 = height_move * scale
-             # x1=347*scale
-             # y1=587*scale
-             x2 = x1 + box_width * width_scale * scale
-             y2 = y1 + box_height * height_scale * scale
-
-             if y1 > y2:
-                 y1, y2 = y2, y1
-
-             # print(x1, y1, x2, y2)
-             # 3. crop and save the image
-
-             # check images in saved; if one is contained or similar, delete it from the file system
-             for i, (file_name, bbox) in enumerate(saved):
-                 if (abs(bbox[0] - x1) < 10
-                         and abs(bbox[1] - y1) < 10
-                         and abs(bbox[2] - x2) < 10
-                         and abs(bbox[3] - y2) < 10) or \
-                         (bbox[0] > x1 - 10 and bbox[1] > y1 - 10 and bbox[2] < x2 + 10 and bbox[3] < y2 + 10):
-                     os.remove(file_name + '.png')
-                     os.remove(file_name + '.txt')
-                     saved.pop(i)
-                     break
-
-             cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
-             file_name = temp_image_dir + f'/svg_image_{page.number}_{block_id}'
-             image_name = file_name + '.png'
-             cropped_img.save(image_name)
-
-             # search title and text
-             text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
-             title = get_svg_title_around_image(svg_blocks, block_id, lang)
-             with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
-                 text_file.write(title + '\n' + text_content.replace('\n', ' '))
-
-     return temp_image_dir
-
-
- def build_index(file, tmp_dir, lang='CN'):
-     # Define the schema for the index
-     if lang == 'CN':
-         schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
-     else:
-         schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))
-
-     # base_name = os.path.basename(file).split('.')[0]
-     # path_name = f'{base_name}'
-     # index_path = path_name + '_index_dir'
-     # # Create an index in a directory
-     # if os.path.exists(index_path):
-     #     shutil.rmtree(index_path)
-     # os.mkdir(index_path)
-     temp_index_dir = tempfile.mkdtemp(prefix='index_')
-
-     ix = create_in(temp_index_dir, schema)
-
-     # Add documents to the index
-     # base_name = os.path.basename(file).split('.')[0]
-     # image_path = f'images{base_name}'
-     writer = ix.writer()
-     for file in os.listdir(tmp_dir):
-         if file.endswith('.txt'):
-             file_path = os.path.join(tmp_dir, file)
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 content = f.read()
-                 writer.add_document(file_name=file[:-4], content=content)
-                 # print('==========')
-                 # print(content)
-                 # print("==========")
-
-     writer.commit()
-     return ix, temp_index_dir
-
-
- def search(ix, query, lang='CN', k=10):
-
-     # Tokenize the query string and join the tokens with the OR operator
-     if lang == 'CN':
-         query_tokens = jieba.cut(query, cut_all=True)
-     else:
-         query_tokens = query.split()
-     or_query = " OR ".join(query_tokens)
-
-     parser = QueryParser("content", ix.schema)
-     myquery = parser.parse(or_query)
-
-     with ix.searcher() as searcher:
-         results = searcher.search(myquery, limit=k)
-
-         # Extract and return the file names and descriptions of the top-k hits
-         results_list = [(hit['file_name'], hit.highlights("content"), hit) for hit in results]
-
-     return results_list
-
-
- def return_image(file, results_list, tmp_dir):
-     # base_name = os.path.basename(file).split('.')[0]
-     # path_name = f'images{base_name}'
-     titles = []
-     images = []
-     for result in results_list:
-         title = result[2].fields()['content'].split('\n')[0].split(':')[1]
-         titles.append(title)
-         images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
-     return titles[0], images[0]
-
-
- # file = 'CA-IS372x-datasheet_cn.pdf'
- # file = 'CA-IS3086 datasheet_cn.pdf'
- # temp_image_dir = load_pdf(file, lang='CN')
- # ix, temp_index_dir = build_index(file, temp_image_dir)
- # results_list = search(ix, "波形", lang='CN', k=10)
- # ret_img = return_image(file, results_list, temp_image_dir)
- # print('title: ' + ret_img[0])
- # ret_img[1].show()
-
+ import fitz
+ from PIL import Image
+ from utils import *
+ from whoosh.analysis import Tokenizer, Token
+ import jieba
+ from whoosh.index import create_in
+ from whoosh.fields import *
+ from whoosh.qparser import QueryParser
+ import os
+ import shutil
+ # import tempfile
+
+ LOGO_WIDTH = 398
+ LOGO_HEIGHT = 137
+
+ ix = None
+ writer = None
+
+
+ class ChineseTokenizer(Tokenizer):
+     def __call__(self, value, positions=False, chars=False,
+                  keeporiginal=False, removestops=True,
+                  start_pos=0, start_char=0, mode='', **kwargs):
+         t = Token(positions, chars, removestops=removestops, mode=mode,
+                   **kwargs)
+         seglist = jieba.cut(value, cut_all=True)
+         for w in seglist:
+             t.original = t.text = w
+             t.boost = 1.0
+             if positions:
+                 t.pos = start_pos + value.find(w)
+             if chars:
+                 t.startchar = start_char + value.find(w)
+             if chars and positions:
+                 t.endchar = start_char + value.find(w) + len(w)
+             yield t
+
+
+ def ChineseAnalyzer():
+     return ChineseTokenizer()
+
+
+ def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
+     """
+     Load a pdf file, convert it to images, extract descriptions, and index them
+     :param lang:
+     :param skip_block:
+     :param skip_page_back:
+     :param skip_page_front:
+     :param dpi:
+     :param file:
+     :return:
+     """
+
+     if file.__contains__('\\gradio\\'):
+         print('gradio file')
+         doc = fitz.open(file)
+     else:
+         print('local file')
+         doc = fitz.open('using_pdfs/' + file)
+
+     # load pages
+     pages = []
+     for i in range(doc.page_count):
+         page = doc.load_page(i)
+         pages.append(page)
+
+     # increase dpi to 300
+     dpi = int(dpi)
+     scale = dpi / 72  # the default dpi of a pdf is 72
+     matrix = fitz.Matrix(scale, scale)
+     skip_block = int(skip_block)
+
+     base_name = os.path.basename(file).split('.')[0]
+     path_name = f'images/{base_name}'
+     if os.path.exists(path_name):
+         shutil.rmtree(path_name)
+     os.mkdir(path_name)
+
+     temp_image_dir = path_name
+     # temp_image_dir = tempfile.mkdtemp(prefix='images_')
+
+     for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip the final page(s)
+
+         # part1: get images with descriptions in a png-based pdf
+         p1dict = page.get_text('dict')
+         blocks = p1dict['blocks']
+         page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
+         page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)
+
+         saved = []  # entries must be removed if they fall inside an svg image
+         for i, block in enumerate(blocks[int(skip_block):]):  # the head and tail of each page should be ignored
+             if 'image' in block:
+                 # try:
+                 bbox = block['bbox']
+                 # skip images of width=398 and height=137 -> typically the logo
+                 if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
+                     continue
+                 # Scale the bbox coordinates
+                 cropped = page_im.crop([int(i * scale) for i in bbox])
+                 number = block['number']
+
+                 file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
+                 image_name = file_name + '.png'
+                 # print(image_name)
+                 cropped.save(image_name)
+                 # # Handle text extraction around the image
+                 text_content = get_text_around_image(blocks[skip_block:], i, lang)
+                 title = get_title_of_image(blocks[skip_block:], i, lang)
+                 # print(text_content[:30])
+                 # print(title)
+                 with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
+                     text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')
+
+                 saved.append((file_name, [int(i * scale) for i in bbox]))
+                 # except:
+                 #     pass
+
+         # part2: get images with descriptions in an svg-based pdf
+         svg = page.get_svg_image(matrix=fitz.Identity)
+         image_clips, svg_blocks = parse_page_svg(svg, page.number)
+         for clip in image_clips:
+
+             transform = []
+             for item in clip[0]:
+                 # print(item, type(item))
+                 if item[0] == '.':
+                     transform.append(float('0' + item))
+                 elif item[0] == '-':
+                     transform.append(float('-0' + item[1:]))
+                 else:
+                     transform.append(float(item))
+             d = clip[1]
+             page_id = clip[2]
+             block_id = clip[3]
+             matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
+             float_values = [float(value) for value in matches[0]]
+             box_width = float_values[0]
+             box_height = float_values[1]
+             width_scale = transform[0]
+             height_scale = transform[3]
+             width_move = transform[4]
+             height_move = transform[5]
+             x1 = width_move * scale
+             y1 = height_move * scale
+             # x1=347*scale
+             # y1=587*scale
+             x2 = x1 + box_width * width_scale * scale
+             y2 = y1 + box_height * height_scale * scale
+
+             if y1 > y2:
+                 y1, y2 = y2, y1
+
+             # print(x1, y1, x2, y2)
+             # 3. crop and save the image
+
+             # check images in saved; if one is contained or similar, delete it from the file system
+             for i, (file_name, bbox) in enumerate(saved):
+                 if (abs(bbox[0] - x1) < 10
+                         and abs(bbox[1] - y1) < 10
+                         and abs(bbox[2] - x2) < 10
+                         and abs(bbox[3] - y2) < 10) or \
+                         (bbox[0] > x1 - 10 and bbox[1] > y1 - 10 and bbox[2] < x2 + 10 and bbox[3] < y2 + 10):
+                     os.remove(file_name + '.png')
+                     os.remove(file_name + '.txt')
+                     saved.pop(i)
+                     break
+
+             cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
+             file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
+             image_name = file_name + '.png'
+             cropped_img.save(image_name)
+
+             # search title and text
+             text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
+             title = get_svg_title_around_image(svg_blocks, block_id, lang)
+             with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
+                 text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')
+
+     print(temp_image_dir)
+     return temp_image_dir
+
+
+ def build_index(file, tmp_dir, lang='CN'):
+     # Define the schema for the index
+     if lang == 'CN':
+         schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
+     else:
+         schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))
+
+     base_name = os.path.basename(file).split('.')[0]
+     path_name = f'{base_name}'
+     # index_path = 'indexes/' + path_name + '_index_dir'
+     index_path = 'indexes/'
+     # Create an index in a directory
+     # if os.path.exists(index_path):
+     #     shutil.rmtree(index_path)
+     # os.mkdir(index_path)
+     temp_index_dir = index_path
+     # temp_index_dir = tempfile.mkdtemp(prefix='index_')
+
+     global ix
+     if ix is None:
+         ix = create_in(temp_index_dir, schema)
+     global writer
+     if writer is None:
+         writer = ix.writer()
+
+     # Add documents to the index
+     # base_name = os.path.basename(file).split('.')[0]
+     # image_path = f'images{base_name}'
+     # writer = ix.writer()
+     for file in os.listdir(tmp_dir):
+         if file.endswith('.txt'):
+             file_path = os.path.join(tmp_dir, file)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 content = f.read()
+                 writer.add_document(file_name=file[:-4], content=content)
+                 print('==========')
+                 print(content)
+                 print("==========")
+
+     writer.commit()
+     return ix, temp_index_dir
+
+
+ def search(ix, query, lang='CN', k=10):
+
+     # Tokenize the query string and join the tokens with the OR operator
+     if lang == 'CN':
+         query_tokens = jieba.cut(query, cut_all=True)
+     else:
+         query_tokens = query.split()
+     or_query = " OR ".join(query_tokens)
+
+     parser = QueryParser("content", ix.schema)
+     myquery = parser.parse(or_query)
+
+     with ix.searcher() as searcher:
+         results = searcher.search(myquery, limit=k)
+
+         # Extract and return the file names and descriptions of the top-k hits
+         results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]
+
+     return results_list
+
+
+ def return_image(file, results_list, tmp_dir):
+     # base_name = os.path.basename(file).split('.')[0]
+     # path_name = f'images{base_name}'
+     titles = []
+     images = []
+     for result in results_list:
+         title = result[1].split('\n')[0].split(':')[-1]
+         titles.append(title)
+         images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
+     return titles[0], images[0]
+
+
+ # file = 'CA-IS372x-datasheet_cn.pdf'
+ # file = 'CA-IS3086 datasheet_cn.pdf'
+ # temp_image_dir = load_pdf(file, lang='CN')
+ # ix, temp_index_dir = build_index(file, temp_image_dir)
+ # results_list = search(ix, "波形", lang='CN', k=10)
+ # ret_img = return_image(file, results_list, temp_image_dir)
+ # print('title: ' + ret_img[0])
+ # ret_img[1].show()
+
+ # print(os.listdir('using_pdfs'))
+
+ # import tqdm
+ # for file in tqdm.tqdm(os.listdir('using_pdfs')):
+ #     tmd_dir = load_pdf(file)
+ #     ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
+ # #
+ # writer.commit()
+
+ # from whoosh.index import open_dir
+ # search_ix = open_dir('indexes')
+ # query = "IF-428x接收端阈值"
+ # results = search(search_ix, query, lang='CN', k=10)
+ # for result in results:
+ #     print(result)
+ #
+ # from PIL import Image
+ #
+ # for result in results:
+ #     image_name = result[0]
+ #     base_name = image_name.split('_img')[0]
+ #     img = Image.open('images/' + base_name + '/' + image_name + '.png')
+ #     image_title = result[1].split('\n')[0].split(':')[1]
+ #     img.show(title=image_title)
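Two edge cases in `load_pdf` are worth flagging. The slice `pages[int(skip_page_front):-int(skip_page_back)]` yields an empty list when `skip_page_back` is 0, because `pages[0:-0]` is `pages[0:0]`; and `re.findall` is used without an explicit `import re`, so the name only resolves if one of the star imports happens to re-export `re`. A defensive sketch of the slice, assuming the intent is to trim pages from both ends:

end = -int(skip_page_back) if int(skip_page_back) > 0 else None
for page in pages[int(skip_page_front):end]:
    ...  # process the page as above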
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
utils.py CHANGED
@@ -1,261 +1,263 @@
- import xml.etree.ElementTree as ET
-
- def get_adjacent_lines(blocks, block_index):
-     """
-     Returns two lists: the lines of text before and after the block at block_index.
-     Each list contains lines in order from closest to furthest from the block.
-     """
-     def is_same_line(origin1, origin2):
-         # Adjust this threshold if needed
-         THRESHOLD = 10
-         return abs(origin1[1] - origin2[1]) < THRESHOLD
-
-     def extract_spans_from_blocks(target_blocks):
-         spans = []
-         for block in target_blocks:
-             if 'lines' in block:
-                 for line in block['lines']:
-                     for span in line['spans']:
-                         spans.append(span)
-         return spans
-
-     def merge_spans_to_lines(spans):
-         if not spans:
-             return []
-
-         lines = []
-         current_line = spans[0]['text']
-         current_origin = spans[0]['origin']
-
-         for span in spans[1:]:
-             if is_same_line(span['origin'], current_origin):
-                 current_line += " " + span['text']
-             else:
-                 lines.append(current_line.strip())
-                 current_line = span['text']
-                 current_origin = span['origin']
-
-         lines.append(current_line.strip())
-         return lines
-
-     spans_before = extract_spans_from_blocks(blocks[:block_index])
-     spans_after = extract_spans_from_blocks(blocks[block_index + 1:])
-
-     lines_before = merge_spans_to_lines(spans_before)
-     lines_after = merge_spans_to_lines(spans_after)
-
-     return lines_before, lines_after
-
-
- def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
-     before_lines, after_lines = get_adjacent_lines(blocks, image_index)
-
-     # print(before_lines)
-     # print(after_lines)
-     text_content = ""
-     counter = word_count
-
-     # Process lines before the image
-     for line in reversed(before_lines):
-         text_content = line + '\n' + text_content
-         if lang == 'CN':
-             counter -= len(line)
-         else:
-             counter -= len(line.split(' '))
-         if counter <= 0:
-             break
-
-     # Reset the word counter for lines after the image
-     counter = word_count
-
-     # Process lines after the image
-     for line in after_lines:
-         text_content += line + '\n'
-         if lang == 'CN':
-             counter -= len(line)
-         else:
-             counter -= len(line.split(' '))
-         if counter <= 0:
-             break
-
-     return text_content.strip()
-
-
- def get_title_of_image(blocks, image_index, lang='CN'):
-     before_lines, after_lines = get_adjacent_lines(blocks, image_index)
-
-     # Search for a title in the lines before the image
-     title = None
-     for line in reversed(before_lines):
-         if lang == 'CN' and '图' in line:
-             title = f"title: {line}"
-             break
-         elif 'figure' in line.lower():
-             title = f"title: {line}"
-             break
-
-     # Search for a title in the lines after the image
-     for line in after_lines:
-         if lang == 'CN' and '图 ' in line:
-             return f"title: {line}"
-         elif 'figure' in line.lower():
-             return f"title: {line}"
-
-     return title if title else "title: Not Found"
-
-
- def transform_to_array(trans):
-     trans = trans.replace('matrix(', '').replace(')', '').split(',')
-     arr = []
-     # print(trans)
-     for item in trans:
-         # print(item, type(item))
-         if item[0] == '.':
-             arr.append(float('0' + item))
-         elif item[0] == '-':
-             arr.append(float('-0' + item[1:]))
-         else:
-             arr.append(float(item))
-     return arr
-
-
- def parse_page_svg(svg, page_id):
-     # parse the SVG content
-     root = ET.fromstring(svg)
-
-     # get the page size
-     width = int(root.get('width').replace('pt', ''))
-     height = int(root.get('height').replace('pt', ''))
-
-     # store the clipPaths
-     clips = {}
-     for clip in root.findall('.//{http://www.w3.org/2000/svg}clipPath'):
-         clips[clip.get('id')] = clip
-
-     # get the first g tag under the SVG root
-     main_g = root.find('{http://www.w3.org/2000/svg}g')
-
-     page_size = f'H{width}V{height}'
-     gs = main_g.findall('{http://www.w3.org/2000/svg}g')
-
-     block_id = 0
-     img_clips = []
-     blocks = []
-     cache = ""
-     vertical = None
-     horizon = None
-     # iterate over all child g tags under the main g tag
-     for g in main_g.findall('{http://www.w3.org/2000/svg}g'):
-
-         # check whether the first child is a "use" tag that has a "data-text" attribute
-         first_child = list(g)[0] if g else None
-         if first_child is not None and first_child.tag == "{http://www.w3.org/2000/svg}use" and 'data-text' in first_child.attrib:
-             # get all use tags in this g tag that contain a data-text attribute
-             for u in g.findall('{http://www.w3.org/2000/svg}use'):
-                 if 'data-text' in u.attrib:
-                     text_vertical = transform_to_array(u.get('transform'))[5]
-                     text_horizon = transform_to_array(u.get('transform'))[4]
-                     if vertical is None or abs(text_vertical - vertical) > 10:
-                         vertical = text_vertical
-                         cache = cache.strip()
-                         if cache != "":
-                             blocks.append(cache)
-                             cache = u.get('data-text')
-                             block_id += 1
-                     else:
-                         # the horizontal position should change
-                         if horizon is None or abs(text_horizon - horizon) > 1:
-                             horizon = text_horizon
-                             cache += u.get('data-text')
-             continue
-
-         clip_path = g.get('clip-path')
-         if clip_path and '#clip_' in clip_path:
-             clip_id = clip_path.split("#")[1].replace(')', '')
-             if clip_id in clips:
-                 path = clips[clip_id].find('.//{http://www.w3.org/2000/svg}path')
-                 transform = path.get('transform')
-                 if not transform:
-                     continue
-                 transform = transform.replace('matrix(', '').replace(')', '')
-                 d = path.get('d')
-                 trans_height = int(float(transform.split(',')[5]))
-                 if not (page_size in d or (transform and trans_height == height)):
-                     # print(page_size in d)
-                     # print(transform and trans_height == height)
-                     # print(f"From Transform: {transform}, D: {d}", page_size, trans_height, height)
-                     # print(f"From Transform: {transform}, D: {d} in page {page_id}")
-                     img_clips.append((transform.split(','), d, page_id, block_id))
-                     blocks.append(f'image_{block_id}')
-                     block_id += 1
-         else:
-             for sub_g in g.findall('.//{http://www.w3.org/2000/svg}g'):
-                 sub_clip_path = sub_g.get('clip-path')
-                 if sub_clip_path and '#clip_' in sub_clip_path:
-                     sub_clip_id = sub_clip_path.split("#")[1].replace(')', '')
-                     if sub_clip_id in clips:
-                         sub_path = clips[sub_clip_id].find('.//{http://www.w3.org/2000/svg}path')
-                         sub_d = sub_path.get('d')
-                         sub_transform = sub_path.get('transform')
-                         sub_transform = sub_transform.replace('matrix(', '').replace(')', '')
-                         subtrans_height = int(float(sub_transform.split(',')[5]))
-                         if not (page_size in sub_d or (sub_transform and subtrans_height == height)):
-                             # print(f"From sub Transform: |{sub_transform}|, D: {sub_d} in page {page_id}")
-                             img_clips.append((sub_transform.split(','), sub_d, page_id, block_id))
-                             blocks.append(f'image_{block_id}')
-                             block_id += 1
-                             break
-     return img_clips, blocks
-
-
- def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
-     text_content = ""
-     counter = word_count
-
-     # Process lines before the image
-     for line in reversed(blocks[:block_id]):
-         text_content = line + '\n' + text_content
-         if lang == 'CN':
-             counter -= len(line)
-         else:
-             counter -= len(line.split(' '))
-         if counter <= 0:
-             break
-
-     # Reset the word counter for lines after the image
-     counter = word_count
-
-     # Process lines after the image
-     for line in blocks[block_id+1:]:
-         text_content += line + '\n'
-         if lang == 'CN':
-             counter -= len(line)
-         else:
-             counter -= len(line.split(' '))
-         if counter <= 0:
-             break
-
-     return text_content.strip()
-
-
- def get_svg_title_around_image(blocks, block_id, lang='CN'):
-     # Search for a title in the lines before the image
-     title = None
-     for line in reversed(blocks[:block_id]):
-         if lang == 'CN' and '图' in line:
-             title = f"title: {line}"
-             break
-         elif 'figure' in line.lower():
-             title = f"title: {line}"
-             break
-
-     # Search for a title in the lines after the image
-     for line in blocks[block_id+1:]:
-         if lang == 'CN' and '图 ' in line:
-             return f"title: {line}"
-         elif lang == 'CN' and '图' in line:
-             return f"title: {line}"
-         elif 'figure' in line.lower():
-             return f"title: {line}"
-
-     return title if title else "title: Not Found"
+ import xml.etree.ElementTree as ET
+
+ def get_adjacent_lines(blocks, block_index):
+     """
+     Returns two lists: the lines of text before and after the block at block_index.
+     Each list contains lines in order from closest to furthest from the block.
+     """
+     def is_same_line(origin1, origin2):
+         # Adjust this threshold if needed
+         THRESHOLD = 10
+         return abs(origin1[1] - origin2[1]) < THRESHOLD
+
+     def extract_spans_from_blocks(target_blocks):
+         spans = []
+         for block in target_blocks:
+             if 'lines' in block:
+                 for line in block['lines']:
+                     for span in line['spans']:
+                         spans.append(span)
+         return spans
+
+     def merge_spans_to_lines(spans):
+         if not spans:
+             return []
+
+         lines = []
+         current_line = spans[0]['text']
+         current_origin = spans[0]['origin']
+
+         for span in spans[1:]:
+             if is_same_line(span['origin'], current_origin):
+                 current_line += " " + span['text']
+             else:
+                 lines.append(current_line.strip())
+                 current_line = span['text']
+                 current_origin = span['origin']
+
+         lines.append(current_line.strip())
+         return lines
+
+     spans_before = extract_spans_from_blocks(blocks[:block_index])
+     spans_after = extract_spans_from_blocks(blocks[block_index + 1:])
+
+     lines_before = merge_spans_to_lines(spans_before)
+     lines_after = merge_spans_to_lines(spans_after)
+
+     return lines_before, lines_after
+
+
+ def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
+     before_lines, after_lines = get_adjacent_lines(blocks, image_index)
+
+     # print(before_lines)
+     # print(after_lines)
+     text_content = ""
+     counter = word_count
+
+     # Process lines before the image
+     for line in reversed(before_lines):
+         text_content = line + '\n' + text_content
+         if lang == 'CN':
+             counter -= len(line)
+         else:
+             counter -= len(line.split(' '))
+         if counter <= 0:
+             break
+
+     # Reset the word counter for lines after the image
+     counter = word_count
+
+     # Process lines after the image
+     for line in after_lines:
+         text_content += line + '\n'
+         if lang == 'CN':
+             counter -= len(line)
+         else:
+             counter -= len(line.split(' '))
+         if counter <= 0:
+             break
+
+     return text_content.strip()
+
+
+ def get_title_of_image(blocks, image_index, lang='CN'):
+     before_lines, after_lines = get_adjacent_lines(blocks, image_index)
+
+     # Search for a title in the lines before the image
+     title = None
+     for line in reversed(before_lines):
+         if lang == 'CN' and '图' in line:
+             title = f"title: {line}"
+             break
+         elif 'figure' in line.lower():
+             title = f"title: {line}"
+             break
+
+     # Search for a title in the lines after the image
+     for line in after_lines:
+         if lang == 'CN' and '图 ' in line:
+             return f"title: {line}"
+         elif 'figure' in line.lower():
+             return f"title: {line}"
+
+     if title is None and before_lines:  # fall back to the closest preceding line
+         title = before_lines[-1]
+     return title if title else "title: Not Found"
+
+
+ def transform_to_array(trans):
+     trans = trans.replace('matrix(', '').replace(')', '').split(',')
+     arr = []
+     # print(trans)
+     for item in trans:
+         # print(item, type(item))
+         if item[0] == '.':
+             arr.append(float('0' + item))
+         elif item[0] == '-':
+             arr.append(float('-0' + item[1:]))
+         else:
+             arr.append(float(item))
+     return arr
+
+
+ def parse_page_svg(svg, page_id):
+     # parse the SVG content
+     root = ET.fromstring(svg)
+
+     # get the page size
+     width = int(float(root.get('width').replace('pt', '')))
+     height = int(float(root.get('height').replace('pt', '')))
+
+     # store the clipPaths
+     clips = {}
+     for clip in root.findall('.//{http://www.w3.org/2000/svg}clipPath'):
+         clips[clip.get('id')] = clip
+
+     # get the first g tag under the SVG root
+     main_g = root.find('{http://www.w3.org/2000/svg}g')
+
+     page_size = f'H{width}V{height}'
+     gs = main_g.findall('{http://www.w3.org/2000/svg}g')
+
+     block_id = 0
+     img_clips = []
+     blocks = []
+     cache = ""
+     vertical = None
+     horizon = None
+     # iterate over all child g tags under the main g tag
+     for g in main_g.findall('{http://www.w3.org/2000/svg}g'):
+
+         # check whether the first child is a "use" tag that has a "data-text" attribute
+         first_child = list(g)[0] if g else None
+         if first_child is not None and first_child.tag == "{http://www.w3.org/2000/svg}use" and 'data-text' in first_child.attrib:
+             # get all use tags in this g tag that contain a data-text attribute
+             for u in g.findall('{http://www.w3.org/2000/svg}use'):
+                 if 'data-text' in u.attrib:
+                     text_vertical = transform_to_array(u.get('transform'))[5]
+                     text_horizon = transform_to_array(u.get('transform'))[4]
+                     if vertical is None or abs(text_vertical - vertical) > 10:
+                         vertical = text_vertical
+                         cache = cache.strip()
+                         if cache != "":
+                             blocks.append(cache)
+                             cache = u.get('data-text')
+                             block_id += 1
+                     else:
+                         # the horizontal position should change
+                         if horizon is None or abs(text_horizon - horizon) > 1:
+                             horizon = text_horizon
+                             cache += u.get('data-text')
+             continue
+
+         clip_path = g.get('clip-path')
+         if clip_path and '#clip_' in clip_path:
+             clip_id = clip_path.split("#")[1].replace(')', '')
+             if clip_id in clips:
+                 path = clips[clip_id].find('.//{http://www.w3.org/2000/svg}path')
+                 transform = path.get('transform')
+                 if not transform:
+                     continue
+                 transform = transform.replace('matrix(', '').replace(')', '')
+                 d = path.get('d')
+                 trans_height = int(float(transform.split(',')[5]))
+                 if not (page_size in d or (transform and trans_height == height)):
+                     # print(page_size in d)
+                     # print(transform and trans_height == height)
+                     # print(f"From Transform: {transform}, D: {d}", page_size, trans_height, height)
+                     # print(f"From Transform: {transform}, D: {d} in page {page_id}")
+                     img_clips.append((transform.split(','), d, page_id, block_id))
+                     blocks.append(f'image_{block_id}')
+                     block_id += 1
+         else:
+             for sub_g in g.findall('.//{http://www.w3.org/2000/svg}g'):
+                 sub_clip_path = sub_g.get('clip-path')
+                 if sub_clip_path and '#clip_' in sub_clip_path:
+                     sub_clip_id = sub_clip_path.split("#")[1].replace(')', '')
+                     if sub_clip_id in clips:
+                         sub_path = clips[sub_clip_id].find('.//{http://www.w3.org/2000/svg}path')
+                         sub_d = sub_path.get('d')
+                         sub_transform = sub_path.get('transform')
+                         sub_transform = sub_transform.replace('matrix(', '').replace(')', '')
+                         subtrans_height = int(float(sub_transform.split(',')[5]))
+                         if not (page_size in sub_d or (sub_transform and subtrans_height == height)):
+                             # print(f"From sub Transform: |{sub_transform}|, D: {sub_d} in page {page_id}")
+                             img_clips.append((sub_transform.split(','), sub_d, page_id, block_id))
+                             blocks.append(f'image_{block_id}')
+                             block_id += 1
+                             break
+     return img_clips, blocks
+
+
+ def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
+     text_content = ""
+     counter = word_count
+
+     # Process lines before the image
+     for line in reversed(blocks[:block_id]):
+         text_content = line + '\n' + text_content
+         if lang == 'CN':
+             counter -= len(line)
+         else:
+             counter -= len(line.split(' '))
+         if counter <= 0:
+             break
+
+     # Reset the word counter for lines after the image
+     counter = word_count
+
+     # Process lines after the image
+     for line in blocks[block_id+1:]:
+         text_content += line + '\n'
+         if lang == 'CN':
+             counter -= len(line)
+         else:
+             counter -= len(line.split(' '))
+         if counter <= 0:
+             break
+
+     return text_content.strip()
+
+
+ def get_svg_title_around_image(blocks, block_id, lang='CN'):
+     # Search for a title in the lines before the image
+     title = None
+     for line in reversed(blocks[:block_id]):
+         if lang == 'CN' and '图' in line:
+             title = f"title: {line}"
+             break
+         elif 'figure' in line.lower():
+             title = f"title: {line}"
+             break
+
+     # Search for a title in the lines after the image
+     for line in blocks[block_id+1:]:
+         if lang == 'CN' and '图 ' in line:
+             return f"title: {line}"
+         elif lang == 'CN' and '图' in line:
+             return f"title: {line}"
+         elif 'figure' in line.lower():
+             return f"title: {line}"
+
+     return title if title else "title: Not Found"
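As a quick illustration, `transform_to_array` normalizes SVG matrix() components that omit the leading zero before a decimal point (Python's float() would in fact parse these directly, so the branches are belt-and-braces). A sanity check with a hypothetical transform string:

print(transform_to_array('matrix(.75,0,0,-.75,34.5,763.5)'))
# -> [0.75, 0.0, 0.0, -0.75, 34.5, 763.5]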