Spaces:
Runtime error
Runtime error
fix bugs
Browse files- README.md +13 -13
- app.py +54 -54
- images/placeholder +0 -0
- img_search.py +68 -0
- indexes/placeholder +0 -0
- pdfImage.py +292 -252
- requirements.txt +0 -0
- utils.py +263 -261
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
---
|
2 |
-
title: PdfImgSearcher
|
3 |
-
emoji: 😻
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: green
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.41.2
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: other
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: PdfImgSearcher
|
3 |
+
emoji: 😻
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.41.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: other
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,54 +1,54 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from pdfImage import *
|
3 |
-
|
4 |
-
|
5 |
-
done = False
|
6 |
-
engine = None
|
7 |
-
tmp_dir = None
|
8 |
-
|
9 |
-
|
10 |
-
def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
|
11 |
-
global done, engine, tmp_dir
|
12 |
-
if not done:
|
13 |
-
# Load PDF, Convert to Image, Description, and Index
|
14 |
-
tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
|
15 |
-
ix, _ = build_index(file.name, tmp_dir, lang)
|
16 |
-
engine = ix
|
17 |
-
done = True
|
18 |
-
results_list = search(engine, query, lang)
|
19 |
-
return return_image(file.name, results_list, tmp_dir)
|
20 |
-
|
21 |
-
# Ensure that the image save directory and index directory are deleted
|
22 |
-
# base_name = os.path.basename(file).split('.')[0]
|
23 |
-
# path_name = f'images{base_name}'
|
24 |
-
# index_path = f'{base_name}_index_dir'
|
25 |
-
# if os.path.exists(path_name):
|
26 |
-
# shutil.rmtree(path_name)
|
27 |
-
# if os.path.exists(index_path):
|
28 |
-
# shutil.rmtree(index_path)
|
29 |
-
# return titles, images
|
30 |
-
|
31 |
-
|
32 |
-
def display_images(*images):
|
33 |
-
return images
|
34 |
-
|
35 |
-
|
36 |
-
iface = gr.Interface(
|
37 |
-
fn=main_interface,
|
38 |
-
inputs=[
|
39 |
-
gr.inputs.File(label="Upload PDF"),
|
40 |
-
gr.inputs.Number(default=300, label="DPI"),
|
41 |
-
gr.inputs.Number(default=0, label="Skip Front Page"),
|
42 |
-
gr.inputs.Number(default=1, label="Skip Back Page"),
|
43 |
-
gr.inputs.Number(default=5, label="Skip Block"),
|
44 |
-
gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
|
45 |
-
gr.inputs.Textbox(label="Search Query")
|
46 |
-
],
|
47 |
-
outputs=[
|
48 |
-
gr.outputs.Textbox(label="Title"),
|
49 |
-
gr.outputs.Image(type="pil", label="Image")
|
50 |
-
],
|
51 |
-
live=False
|
52 |
-
)
|
53 |
-
|
54 |
-
iface.launch()
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from pdfImage import *
|
3 |
+
|
4 |
+
|
5 |
+
done = False
|
6 |
+
engine = None
|
7 |
+
tmp_dir = None
|
8 |
+
|
9 |
+
|
10 |
+
def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
    """Gradio entry point: index the uploaded PDF on the first call, then
    answer search queries against the cached index.

    :param file: uploaded file object (gradio File; ``file.name`` is its path)
    :param dpi: render resolution for page rasterisation
    :param skip_page_front: number of leading pages to ignore
    :param skip_page_back: number of trailing pages to ignore
    :param skip_block: number of leading text blocks to ignore per page
    :param lang: 'CN' or 'EN'
    :param query: free-text search query
    :return: (title, PIL image) of the best hit
    """
    global done, engine, tmp_dir
    if not done:
        # One-time setup: rasterise the PDF, extract descriptions, build the index.
        tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
        ix, _ = build_index(file.name, tmp_dir, lang)
        engine = ix
        done = True
    results_list = search(engine, query, lang)
    return return_image(file.name, results_list, tmp_dir)
|
20 |
+
|
21 |
+
# Ensure that the image save directory and index directory are deleted
|
22 |
+
# base_name = os.path.basename(file).split('.')[0]
|
23 |
+
# path_name = f'images{base_name}'
|
24 |
+
# index_path = f'{base_name}_index_dir'
|
25 |
+
# if os.path.exists(path_name):
|
26 |
+
# shutil.rmtree(path_name)
|
27 |
+
# if os.path.exists(index_path):
|
28 |
+
# shutil.rmtree(index_path)
|
29 |
+
# return titles, images
|
30 |
+
|
31 |
+
|
32 |
+
def display_images(*images):
    """Return the received positional arguments unchanged, as a tuple."""
    return images
|
34 |
+
|
35 |
+
|
36 |
+
# Wire up the Gradio UI (legacy gr.inputs / gr.outputs API of gradio 3.x).
iface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.inputs.File(label="Upload PDF"),
        gr.inputs.Number(default=300, label="DPI"),
        gr.inputs.Number(default=0, label="Skip Front Page"),
        gr.inputs.Number(default=1, label="Skip Back Page"),
        gr.inputs.Number(default=5, label="Skip Block"),
        gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
        gr.inputs.Textbox(label="Search Query"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Title"),
        gr.outputs.Image(type="pil", label="Image"),
    ],
    live=False,
)

iface.launch()
|
images/placeholder
ADDED
File without changes
|
img_search.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import jieba
|
2 |
+
from whoosh.qparser import QueryParser
|
3 |
+
from PIL import Image
|
4 |
+
from whoosh.index import open_dir
|
5 |
+
from whoosh.analysis import Tokenizer, Token
|
6 |
+
|
7 |
+
|
8 |
+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba (full mode)."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        token = Token(positions, chars, removestops=removestops, mode=mode,
                      **kwargs)
        for word in jieba.cut(value, cut_all=True):
            token.original = token.text = word
            token.boost = 1.0
            # NOTE(review): value.find(word) locates only the FIRST occurrence,
            # so positions/offsets are wrong for repeated words — confirm
            # whether exact offsets matter for this index.
            offset = value.find(word)
            if positions:
                token.pos = start_pos + offset
            if chars:
                token.startchar = start_char + offset
            if chars and positions:
                token.endchar = start_char + offset + len(word)
            yield token
|
25 |
+
|
26 |
+
|
27 |
+
def ChineseAnalyzer():
    """Factory returning the jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
|
29 |
+
|
30 |
+
|
31 |
+
def search(query, lang='CN', k=10):
    """Query the on-disk whoosh index in 'indexes/' and load the hit images.

    :param query: free-text query string
    :param lang: 'CN' tokenizes with jieba, anything else splits on whitespace
    :param k: maximum number of hits
    :return: list of (PIL.Image, title, score) tuples
    """
    ix = open_dir('indexes')

    # Tokenize the query string and join tokens with OR operator
    if lang == 'CN':
        query_tokens = jieba.cut(query, cut_all=True)
    else:
        query_tokens = query.split()
    or_query = " OR ".join(query_tokens)

    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(or_query)

    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=k)
        # Hits must be materialized while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]

    images = []
    for result in results_list:
        print(result)
        image_name = result[0]
        base_name = image_name.split('_img')[0]
        # Reconstruct the image path from the indexed file name.
        image_full_path = 'images/' + base_name + '/' + image_name + '.png'
        img = Image.open(image_full_path)
        # BUG FIX: the title is the FIRST line of the stored content
        # ("title: ..."); the LAST line is "base name:...", so splitting on
        # [-1] returned the base name instead of the title.
        image_title = result[1].split('\n')[0].split(':')[-1]
        images.append((img, image_title, result[2]))

    return images
|
62 |
+
|
63 |
+
|
64 |
+
# Warm up jieba's dictionary ahead of the first real query.
jieba.cut("")
# results = search("IF-428x接收端阈值")
results = search("简化结构图")
for result in results:
    # result[0] is the PIL.Image object; result[1] is the title; result[2] the score
    print(result[1], result[2])
|
indexes/placeholder
ADDED
File without changes
|
pdfImage.py
CHANGED
@@ -1,252 +1,292 @@
|
|
1 |
-
import fitz
|
2 |
-
from PIL import Image
|
3 |
-
from utils import *
|
4 |
-
from whoosh.analysis import Tokenizer, Token
|
5 |
-
import jieba
|
6 |
-
from whoosh.index import create_in
|
7 |
-
from whoosh.fields import *
|
8 |
-
from whoosh.qparser import QueryParser
|
9 |
-
import os
|
10 |
-
import shutil
|
11 |
-
import tempfile
|
12 |
-
|
13 |
-
LOGO_WIDTH = 398
|
14 |
-
LOGO_HEIGHT = 137
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
:param
|
47 |
-
:param
|
48 |
-
:param
|
49 |
-
:
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
#
|
69 |
-
|
70 |
-
#
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
#
|
147 |
-
#
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
#
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
#
|
194 |
-
|
195 |
-
#
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
#
|
250 |
-
#
|
251 |
-
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fitz
|
2 |
+
from PIL import Image
|
3 |
+
from utils import *
|
4 |
+
from whoosh.analysis import Tokenizer, Token
|
5 |
+
import jieba
|
6 |
+
from whoosh.index import create_in
|
7 |
+
from whoosh.fields import *
|
8 |
+
from whoosh.qparser import QueryParser
|
9 |
+
import os
|
10 |
+
import shutil
|
11 |
+
# import tempfile
|
12 |
+
|
13 |
+
LOGO_WIDTH = 398
|
14 |
+
LOGO_HEIGHT = 137
|
15 |
+
|
16 |
+
ix = None
|
17 |
+
writer = None
|
18 |
+
|
19 |
+
|
20 |
+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba (full mode)."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        token = Token(positions, chars, removestops=removestops, mode=mode,
                      **kwargs)
        for word in jieba.cut(value, cut_all=True):
            token.original = token.text = word
            token.boost = 1.0
            # NOTE(review): value.find(word) locates only the FIRST occurrence,
            # so positions/offsets are wrong for repeated words — confirm
            # whether exact offsets matter for this index.
            offset = value.find(word)
            if positions:
                token.pos = start_pos + offset
            if chars:
                token.startchar = start_char + offset
            if chars and positions:
                token.endchar = start_char + offset + len(word)
            yield token
|
37 |
+
|
38 |
+
|
39 |
+
def ChineseAnalyzer():
    """Factory returning the jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
|
41 |
+
|
42 |
+
|
43 |
+
def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
    """
    Load a PDF, rasterise its pages, crop out each embedded image, and write a
    .txt description (caption + surrounding text) next to every cropped .png.

    :param file: PDF path; gradio temp paths are opened directly, anything
                 else is resolved under 'using_pdfs/'
    :param dpi: target render resolution (PDF default is 72)
    :param skip_page_front: pages to skip at the start
    :param skip_page_back: pages to skip at the end
    :param skip_block: leading text blocks to ignore on each page
    :param lang: 'CN' or 'EN', forwarded to the text-extraction helpers
    :return: directory ('images/<base_name>') holding the .png/.txt pairs
    """
    if file.__contains__('\\gradio\\'):
        print('gradio file')
        doc = fitz.open(file)
    else:
        print('local file')
        doc = fitz.open('using_pdfs/' + file)

    # Load all pages up front.
    pages = []
    for i in range(doc.page_count):
        pages.append(doc.load_page(i))

    # Scale factor relative to the PDF's native 72 dpi.
    dpi = int(dpi)
    scale = dpi / 72
    matrix = fitz.Matrix(scale, scale)
    skip_block = int(skip_block)

    # Fresh output directory per document: images/<base_name>
    base_name = os.path.basename(file).split('.')[0]
    path_name = f'images/{base_name}'
    if os.path.exists(path_name):
        shutil.rmtree(path_name)
    os.mkdir(path_name)
    temp_image_dir = path_name

    for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip front/back pages

        # Part 1: bitmap images embedded in the page's text dict.
        blocks = page.get_text('dict')['blocks']
        page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
        page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)

        saved = []  # (file_name, bbox) pairs; removed again if an SVG clip covers them
        for i, block in enumerate(blocks[int(skip_block):]):  # head blocks ignored
            if 'image' in block:
                bbox = block['bbox']
                # Skip images of roughly 398x137 points — typically the logo.
                if (bbox[2] - bbox[0]) * scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1]) * scale - LOGO_HEIGHT <= 10:
                    continue
                # Crop using bbox scaled to pixel coordinates.
                cropped = page_im.crop([int(c * scale) for c in bbox])
                number = block['number']

                file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
                image_name = file_name + '.png'
                cropped.save(image_name)

                # Caption + nearby text become the searchable description.
                text_content = get_text_around_image(blocks[skip_block:], i, lang)
                title = get_title_of_image(blocks[skip_block:], i, lang)
                with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                    text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

                saved.append((file_name, [int(c * scale) for c in bbox]))

        # Part 2: vector (SVG) images on the same page.
        svg = page.get_svg_image(matrix=fitz.Identity)
        image_clips, svg_blocks = parse_page_svg(svg, page.number)
        for clip in image_clips:

            # Parse the transform values; pad leading '.'/'-.' for float().
            transform = []
            for item in clip[0]:
                if item[0] == '.':
                    transform.append(float('0' + item))
                elif item[0] == '-':
                    transform.append(float('-0' + item[1:]))
                else:
                    transform.append(float(item))
            d = clip[1]
            page_id = clip[2]
            block_id = clip[3]

            # Clip-path box size comes from the H.../V... path commands.
            matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
            float_values = [float(value) for value in matches[0]]
            box_width = float_values[0]
            box_height = float_values[1]
            width_scale = transform[0]
            height_scale = transform[3]
            width_move = transform[4]
            height_move = transform[5]
            x1 = width_move * scale
            y1 = height_move * scale
            x2 = x1 + box_width * width_scale * scale
            y2 = y1 + box_height * height_scale * scale
            if y1 > y2:
                y1, y2 = y2, y1

            # Drop any bitmap crop that this SVG clip duplicates or contains.
            for i, (file_name, bbox) in enumerate(saved):
                if (abs(bbox[0] - x1) < 10
                        and abs(bbox[1] - y1) < 10
                        and abs(bbox[2] - x2) < 10
                        and abs(bbox[3] - y2) < 10) or \
                        (bbox[0] > x1 - 10 and bbox[1] > y1 - 10 and bbox[2] < x2 + 10 and bbox[3] < y2 + 10):
                    os.remove(file_name + '.png')
                    os.remove(file_name + '.txt')
                    saved.pop(i)
                    break

            # Crop and save the SVG-clipped region.
            cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
            file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
            image_name = file_name + '.png'
            cropped_img.save(image_name)

            # Caption + nearby SVG text become the searchable description.
            text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
            title = get_svg_title_around_image(svg_blocks, block_id, lang)
            with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

    print(temp_image_dir)
    return temp_image_dir
|
182 |
+
|
183 |
+
|
184 |
+
def build_index(file, tmp_dir, lang='CN'):
    """
    Add the .txt description files under *tmp_dir* to the shared whoosh index
    in 'indexes/', creating the index on first use.

    :param file: PDF path (retained for interface compatibility)
    :param tmp_dir: directory containing <name>.txt description files
    :param lang: 'CN' uses the jieba-based ChineseAnalyzer for the content field
    :return: (index object, index directory path)
    """
    # Define the schema for the index.
    if lang == 'CN':
        schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
    else:
        schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))

    index_path = 'indexes/'
    temp_index_dir = index_path

    # Create the index once; reuse the module-level handle on later calls.
    global ix
    if ix is None:
        ix = create_in(temp_index_dir, schema)
    global writer
    if writer is None:
        writer = ix.writer()

    # Add every description file as one document.
    for entry in os.listdir(tmp_dir):
        if entry.endswith('.txt'):
            file_path = os.path.join(tmp_dir, entry)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            writer.add_document(file_name=entry[:-4], content=content)
            print('==========')
            print(content)
            print("==========")

    writer.commit()
    # BUG FIX: a committed whoosh writer cannot accept further documents.
    # Reset the global so the next call opens a fresh writer instead of
    # reusing the dead one and crashing.
    writer = None
    return ix, temp_index_dir
|
225 |
+
|
226 |
+
|
227 |
+
def search(ix, query, lang='CN', k=10):
    """Run an OR-of-tokens query against *ix*.

    :param ix: open whoosh index
    :param query: free-text query string
    :param lang: 'CN' tokenizes with jieba, anything else splits on whitespace
    :param k: maximum number of hits
    :return: list of (file_name, content, score) tuples for the top hits
    """
    # Tokenize the query string and join tokens with the OR operator.
    tokens = jieba.cut(query, cut_all=True) if lang == 'CN' else query.split()
    or_query = " OR ".join(tokens)

    parser = QueryParser("content", ix.schema)
    parsed = parser.parse(or_query)

    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=k)
        # Materialize the hits while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score) for hit in hits]

    return results_list
|
246 |
+
|
247 |
+
|
248 |
+
def return_image(file, results_list, tmp_dir):
    """
    Resolve search hits to (title, image) and return the top one.

    :param file: PDF path (retained for interface compatibility)
    :param results_list: (file_name, content, score) tuples from search()
    :param tmp_dir: directory holding the cropped .png files
    :return: (title, PIL.Image) of the best hit
    :raises ValueError: if results_list is empty
    """
    # BUG FIX: previously an empty result list crashed with an opaque
    # IndexError on titles[0]; fail with a clear message instead.
    if not results_list:
        raise ValueError('no search results to display')

    titles = []
    images = []
    for result in results_list:
        # First line of the stored content is "title: ...".
        title = result[1].split('\n')[0].split(':')[-1]
        titles.append(title)
        images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
    return titles[0], images[0]
|
258 |
+
|
259 |
+
|
260 |
+
# file = 'CA-IS372x-datasheet_cn.pdf'
|
261 |
+
# file = 'CA-IS3086 datasheet_cn.pdf'
|
262 |
+
# temp_image_dir = load_pdf(file, lang='CN')
|
263 |
+
# ix, temp_index_dir = build_index(file, temp_image_dir)
|
264 |
+
# results_list = search(ix, "波形", lang='CN', k=10)
|
265 |
+
# ret_img = return_image(file, results_list, temp_image_dir)
|
266 |
+
# print('title: ' + ret_img[0])
|
267 |
+
# ret_img[1].show()
|
268 |
+
|
269 |
+
# print(os.listdir('using_pdfs'))
|
270 |
+
|
271 |
+
# import tqdm
|
272 |
+
# for file in tqdm.tqdm(os.listdir('using_pdfs')):
|
273 |
+
# tmd_dir = load_pdf(file)
|
274 |
+
# ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
|
275 |
+
# #
|
276 |
+
# writer.commit()
|
277 |
+
|
278 |
+
# from whoosh.index import open_dir
|
279 |
+
# search_ix = open_dir('indexes')
|
280 |
+
# query = "IF-428x接收端阈值"
|
281 |
+
# results = search(search_ix, query, lang='CN', k=10)
|
282 |
+
# for result in results:
|
283 |
+
# print(result)
|
284 |
+
#
|
285 |
+
# from PIL import Image
|
286 |
+
#
|
287 |
+
# for result in results:
|
288 |
+
# image_name = result[0]
|
289 |
+
# base_name = image_name.split('_img')[0]
|
290 |
+
# img = Image.open('images/' + base_name + '/' + image_name + '.png')
|
291 |
+
# image_title = result[1].split('\n')[0].split(':')[1]
|
292 |
+
# img.show(title=image_title)
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
utils.py
CHANGED
@@ -1,261 +1,263 @@
|
|
1 |
-
import xml.etree.ElementTree as ET
|
2 |
-
|
3 |
-
def get_adjacent_lines(blocks, block_index):
|
4 |
-
"""
|
5 |
-
Returns two lists: the lines of text before and after the block at block_index.
|
6 |
-
Each list contains lines in order from closest to furthest from the block.
|
7 |
-
"""
|
8 |
-
def is_same_line(origin1, origin2):
|
9 |
-
# Adjust this threshold if needed
|
10 |
-
THRESHOLD = 10
|
11 |
-
return abs(origin1[1] - origin2[1]) < THRESHOLD
|
12 |
-
|
13 |
-
def extract_spans_from_blocks(target_blocks):
|
14 |
-
spans = []
|
15 |
-
for block in target_blocks:
|
16 |
-
if 'lines' in block:
|
17 |
-
for line in block['lines']:
|
18 |
-
for span in line['spans']:
|
19 |
-
spans.append(span)
|
20 |
-
return spans
|
21 |
-
|
22 |
-
def merge_spans_to_lines(spans):
|
23 |
-
if not spans:
|
24 |
-
return []
|
25 |
-
|
26 |
-
lines = []
|
27 |
-
current_line = spans[0]['text']
|
28 |
-
current_origin = spans[0]['origin']
|
29 |
-
|
30 |
-
for span in spans[1:]:
|
31 |
-
if is_same_line(span['origin'], current_origin):
|
32 |
-
current_line += " " + span['text']
|
33 |
-
else:
|
34 |
-
lines.append(current_line.strip())
|
35 |
-
current_line = span['text']
|
36 |
-
current_origin = span['origin']
|
37 |
-
|
38 |
-
lines.append(current_line.strip())
|
39 |
-
return lines
|
40 |
-
|
41 |
-
spans_before = extract_spans_from_blocks(blocks[:block_index])
|
42 |
-
spans_after = extract_spans_from_blocks(blocks[block_index + 1:])
|
43 |
-
|
44 |
-
lines_before = merge_spans_to_lines(spans_before)
|
45 |
-
lines_after = merge_spans_to_lines(spans_after)
|
46 |
-
|
47 |
-
return lines_before, lines_after
|
48 |
-
|
49 |
-
|
50 |
-
def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
|
51 |
-
before_lines, after_lines = get_adjacent_lines(blocks, image_index)
|
52 |
-
|
53 |
-
# print(before_lines)
|
54 |
-
# print(after_lines)
|
55 |
-
text_content = ""
|
56 |
-
counter = word_count
|
57 |
-
|
58 |
-
# Process lines before the image
|
59 |
-
for line in reversed(before_lines):
|
60 |
-
text_content = line + '\n' + text_content
|
61 |
-
if lang == 'CN':
|
62 |
-
counter -= len(line)
|
63 |
-
else:
|
64 |
-
counter -= len(line.split(' '))
|
65 |
-
if counter <= 0:
|
66 |
-
break
|
67 |
-
|
68 |
-
# Reset the word counter for lines after the image
|
69 |
-
counter = word_count
|
70 |
-
|
71 |
-
# Process lines after the image
|
72 |
-
for line in after_lines:
|
73 |
-
text_content += line + '\n'
|
74 |
-
if lang == 'CN':
|
75 |
-
counter -= len(line)
|
76 |
-
else:
|
77 |
-
counter -= len(line.split(' '))
|
78 |
-
if counter <= 0:
|
79 |
-
break
|
80 |
-
|
81 |
-
return text_content.strip()
|
82 |
-
|
83 |
-
|
84 |
-
def get_title_of_image(blocks, image_index, lang='CN'):
|
85 |
-
before_lines, after_lines = get_adjacent_lines(blocks, image_index)
|
86 |
-
|
87 |
-
# Search for a title in the lines before the image
|
88 |
-
title = None
|
89 |
-
for line in reversed(before_lines):
|
90 |
-
if lang == 'CN' and '图' in line:
|
91 |
-
title = f"title: {line}"
|
92 |
-
break
|
93 |
-
elif 'figure' in line.lower():
|
94 |
-
title = f"title: {line}"
|
95 |
-
break
|
96 |
-
|
97 |
-
# Search for a title in the lines after the image
|
98 |
-
for line in after_lines:
|
99 |
-
if lang == 'CN' and '图 ' in line:
|
100 |
-
return f"title: {line}"
|
101 |
-
elif 'figure' in line.lower():
|
102 |
-
return f"title: {line}"
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
arr.append(float('
|
117 |
-
|
118 |
-
arr.append(float(item))
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
# print(
|
187 |
-
# print(
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
counter -= len(line
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
counter -= len(line
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
for
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
return f"title: {line}"
|
258 |
-
elif '
|
259 |
-
return f"title: {line}"
|
260 |
-
|
261 |
-
|
|
|
|
|
|
1 |
+
import xml.etree.ElementTree as ET
|
2 |
+
|
3 |
+
def get_adjacent_lines(blocks, block_index):
    """
    Return two lists of text lines: those before and those after
    blocks[block_index]. Each list is ordered from the start of its span,
    with spans sharing (roughly) the same baseline merged into one line.
    """
    THRESHOLD = 10  # max vertical distance for spans considered the same line

    def same_line(origin_a, origin_b):
        return abs(origin_a[1] - origin_b[1]) < THRESHOLD

    def collect_spans(target_blocks):
        # Flatten all spans from blocks that actually carry text lines.
        return [span
                for block in target_blocks if 'lines' in block
                for line in block['lines']
                for span in line['spans']]

    def spans_to_lines(spans):
        if not spans:
            return []
        merged = []
        current_text = spans[0]['text']
        current_origin = spans[0]['origin']
        for span in spans[1:]:
            if same_line(span['origin'], current_origin):
                current_text += " " + span['text']
            else:
                merged.append(current_text.strip())
                current_text = span['text']
                current_origin = span['origin']
        merged.append(current_text.strip())
        return merged

    return (spans_to_lines(collect_spans(blocks[:block_index])),
            spans_to_lines(collect_spans(blocks[block_index + 1:])))
|
48 |
+
|
49 |
+
|
50 |
+
def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
    """
    Collect roughly *word_count* units of text on each side of the image
    block (characters for CN, whitespace-separated words otherwise).
    Lines closest to the image are taken first on both sides.
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    def line_cost(line):
        # CN budgets by character count, other languages by word count.
        return len(line) if lang == 'CN' else len(line.split(' '))

    text_content = ""

    # Lines before the image, walking backwards from the image.
    budget = word_count
    for line in reversed(before_lines):
        text_content = line + '\n' + text_content
        budget -= line_cost(line)
        if budget <= 0:
            break

    # Lines after the image, with a fresh budget.
    budget = word_count
    for line in after_lines:
        text_content += line + '\n'
        budget -= line_cost(line)
        if budget <= 0:
            break

    return text_content.strip()
|
82 |
+
|
83 |
+
|
84 |
+
def get_title_of_image(blocks, image_index, lang='CN'):
    """
    Find a figure caption near the image block.

    Searches the lines before the image (closest first) for a caption marker
    ('图' for CN, 'figure' otherwise), then the lines after it; a caption
    after the image wins immediately. Falls back to the closest preceding
    line, and finally to a "Not Found" placeholder.

    :return: a string of the form "title: <caption>"
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    # Search for a title in the lines before the image.
    title = None
    for line in reversed(before_lines):
        if lang == 'CN' and '图' in line:
            title = f"title: {line}"
            break
        elif 'figure' in line.lower():
            title = f"title: {line}"
            break

    # Search for a title in the lines after the image.
    for line in after_lines:
        if lang == 'CN' and '图 ' in line:
            return f"title: {line}"
        elif 'figure' in line.lower():
            return f"title: {line}"

    if title is None and before_lines:
        # Fallback: closest preceding line. BUG FIX: keep the "title: "
        # prefix so downstream split(':')[1] parsing does not break.
        title = f"title: {before_lines[-1]}"
    return title if title else "title: Not Found"
|
107 |
+
|
108 |
+
|
109 |
+
def transform_to_array(trans):
    """
    Parse an SVG 'matrix(a,b,c,d,e,f)' transform string into six floats.

    :param trans: e.g. "matrix(.5,0,0,-.5,10,20)"
    :return: list of floats [a, b, c, d, e, f]
    """
    values = trans.replace('matrix(', '').replace(')', '').split(',')
    # float() already accepts '.5' and '-.5', so the previous manual
    # '0'-padding of leading '.'/'-' was unnecessary.
    return [float(v) for v in values]
|
122 |
+
|
123 |
+
|
124 |
+
def parse_page_svg(svg, page_id):
    """Scan one page's SVG for image clip regions and ordered text blocks.

    Parameters:
        svg: the page rendered as an SVG string.
        page_id: page index, propagated into each image-clip record.

    Returns:
        (img_clips, blocks) where
        img_clips: list of (transform_values, path_d, page_id, block_id)
            tuples, one per clip region judged NOT to be a full-page clip
            (i.e. treated as an image).
        blocks: page content in reading order; text lines as strings and
            image placeholders as 'image_<block_id>'.
    """
    # Parse the SVG content.
    root = ET.fromstring(svg)

    # Page size, stripping the 'pt' unit suffix (e.g. width="612pt").
    width = int(float(root.get('width').replace('pt', '')))
    height = int(float(root.get('height').replace('pt', '')))

    # Collect all clipPath definitions, keyed by their id attribute.
    clips = {}
    for clip in root.findall('.//{http://www.w3.org/2000/svg}clipPath'):
        clips[clip.get('id')] = clip

    # The first <g> under the SVG root holds the page content.
    main_g = root.find('{http://www.w3.org/2000/svg}g')

    # Signature of a full-page clip rectangle in a path's "d" data; used
    # below to filter out page-sized clips that are not real images.
    page_size = f'H{width}V{height}'
    # NOTE(review): gs is never used afterwards — kept as-is (doc-only edit).
    gs = main_g.findall('{http://www.w3.org/2000/svg}g')

    block_id = 0
    img_clips = []   # image clip records found on this page
    blocks = []      # ordered text lines / image placeholders
    cache = ""       # text accumulated for the current line
    vertical = None  # y offset of the current text line (new line when it jumps > 10)
    horizon = None   # x offset of the last glyph (skips overprinted duplicates)
    # Walk every child <g> of the main group.
    for g in main_g.findall('{http://www.w3.org/2000/svg}g'):

        # Text groups start with a <use> element carrying a "data-text" attribute.
        first_child = list(g)[0] if g else None
        if first_child is not None and first_child.tag == "{http://www.w3.org/2000/svg}use" and 'data-text' in first_child.attrib:
            # Concatenate the glyphs of this group into text lines.
            for u in g.findall('{http://www.w3.org/2000/svg}use'):
                if 'data-text' in u.attrib:
                    # matrix(a,b,c,d,e,f): index 4 is the x translation,
                    # index 5 the y translation.
                    text_vertical = transform_to_array(u.get('transform'))[5]
                    text_horizon = transform_to_array(u.get('transform'))[4]
                    if vertical is None or abs(text_vertical - vertical) > 10:
                        # y moved by more than 10 units: start a new line and
                        # flush the previous one into blocks.
                        vertical = text_vertical
                        cache = cache.strip()
                        if cache != "":
                            blocks.append(cache)
                        cache = u.get('data-text')
                        block_id += 1
                    else:
                        # Same line: only append when x actually advanced,
                        # which skips duplicated/overprinted glyphs.
                        if horizon is None or abs(text_horizon - horizon) > 1:
                            horizon = text_horizon
                            cache += u.get('data-text')
            # NOTE(review): text remaining in `cache` after the outer loop is
            # never flushed into blocks — the last line of the page may be
            # dropped; confirm whether this is intended.
            continue

        clip_path = g.get('clip-path')
        if clip_path and '#clip_' in clip_path:
            # Group is clipped directly: resolve the referenced clipPath id
            # from e.g. 'url(#clip_42)'.
            clip_id = clip_path.split("#")[1].replace(')', '')
            if clip_id in clips:
                path = clips[clip_id].find('.//{http://www.w3.org/2000/svg}path')
                transform = path.get('transform')
                if not transform:
                    continue
                transform = transform.replace('matrix(', '').replace(')', '')
                d = path.get('d')
                # The matrix f component (index 5) — compared against the
                # page height to detect full-page clips.
                trans_height = int(float(transform.split(',')[5]))
                # Skip clips covering the whole page; the rest are images.
                if not (page_size in d or (transform and trans_height == height)):
                    img_clips.append((transform.split(','), d, page_id, block_id))
                    blocks.append(f'image_{block_id}')
                    block_id += 1
        else:
            # No direct clip: look for a clipped nested group instead.
            for sub_g in g.findall('.//{http://www.w3.org/2000/svg}g'):
                sub_clip_path = sub_g.get('clip-path')
                if sub_clip_path and '#clip_' in sub_clip_path:
                    sub_clip_id = sub_clip_path.split("#")[1].replace(')', '')
                    if sub_clip_id in clips:
                        sub_path = clips[sub_clip_id].find('.//{http://www.w3.org/2000/svg}path')
                        sub_d = sub_path.get('d')
                        sub_transform = sub_path.get('transform')
                        sub_transform = sub_transform.replace('matrix(', '').replace(')', '')
                        subtrans_height = int(float(sub_transform.split(',')[5]))
                        if not (page_size in sub_d or (sub_transform and subtrans_height == height)):
                            img_clips.append((sub_transform.split(','), sub_d, page_id, block_id))
                            blocks.append(f'image_{block_id}')
                            block_id += 1
                        # Only the first nested group with a resolvable clip
                        # is considered for this <g>.
                        break
    return img_clips, blocks
|
211 |
+
|
212 |
+
|
213 |
+
def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
    """Collect roughly word_count units of text on each side of an image.

    For lang == 'CN' a unit is one character; otherwise it is one
    space-separated token.  Whole lines are taken until the budget for a
    side is exhausted; lines are returned newline-joined, stripped.
    """
    def weight(line):
        # CN counts characters; other languages count space-split tokens.
        return len(line) if lang == 'CN' else len(line.split(' '))

    # Lines before the image, gathered nearest-first then restored to
    # document order.
    preceding = []
    budget = word_count
    for line in reversed(blocks[:block_id]):
        preceding.append(line)
        budget -= weight(line)
        if budget <= 0:
            break
    preceding.reverse()

    # Lines after the image, with a fresh budget.
    following = []
    budget = word_count
    for line in blocks[block_id + 1:]:
        following.append(line)
        budget -= weight(line)
        if budget <= 0:
            break

    joined = ''.join(part + '\n' for part in preceding + following)
    return joined.strip()
|
241 |
+
|
242 |
+
|
243 |
+
def get_svg_title_around_image(blocks, block_id, lang='CN'):
    """Find a figure caption near the image block at block_id.

    A caption line contains '图' (when lang == 'CN') or 'figure'
    (case-insensitive).  A caption found after the image takes precedence
    over one found before it; when neither exists, 'title: Not Found' is
    returned.
    """
    def is_caption(line):
        if lang == 'CN' and '图' in line:
            return True
        return 'figure' in line.lower()

    # A caption following the image wins.
    for line in blocks[block_id + 1:]:
        if is_caption(line):
            return f"title: {line}"

    # Otherwise fall back to the nearest caption preceding the image.
    for line in reversed(blocks[:block_id]):
        if is_caption(line):
            return f"title: {line}"

    return "title: Not Found"
|