# pdfImgSearcher / img_search.py
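"""Search PDF-extracted figure images by the text indexed alongside them.

Queries a Whoosh full-text index using jieba for Chinese word segmentation
and returns the matching images (loaded via PIL) with a title and score.
"""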
import jieba
from PIL import Image
from whoosh.analysis import Token, Tokenizer
from whoosh.index import open_dir
from whoosh.qparser import QueryParser


class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # cut_all=True emits every possible segmentation (overlapping words),
        # trading precision for recall.
        seglist = jieba.cut(value, cut_all=True)
        for pos, w in enumerate(seglist):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + pos
            if chars:
                # find() locates the first occurrence only, so character
                # offsets of repeated words are approximate.
                offset = value.find(w)
                t.startchar = start_char + offset
                t.endchar = start_char + offset + len(w)
            yield t


def ChineseAnalyzer():
    """Return a Whoosh analyzer that tokenizes with jieba."""
    return ChineseTokenizer()
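
# A minimal sketch of how the 'indexes' directory is presumably built with this
# analyzer (the indexing script is not in this file; the field names are
# inferred from search() below, and the sample document is hypothetical):
#
#   from whoosh.fields import Schema, TEXT, ID
#   from whoosh.index import create_in
#
#   schema = Schema(file_name=ID(stored=True),
#                   content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
#   ix = create_in('indexes', schema)
#   writer = ix.writer()
#   writer.add_document(file_name='doc1_img1', content='...\ntitle:示例图')
#   writer.commit()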


def search(query, lang='CN', k=10):
    """Return up to k (PIL.Image, title, score) tuples for the query."""
    ix = open_dir('indexes')
    # Tokenize the query string and join the tokens with OR so that a hit
    # on any single token counts as a match.
    if lang == 'CN':
        query_tokens = jieba.cut(query, cut_all=True)
    else:
        query_tokens = query.split()
    or_query = " OR ".join(query_tokens)
    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(or_query)
    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=k)
        # Materialize the file names, descriptions, and scores of the top-k
        # hits while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score)
                        for hit in results]
    images = []
    for result in results_list:
        print(result)
        image_name = result[0]
        # Build the image path: file names look like '<doc>_img<n>' and the
        # corresponding files live at images/<doc>/<file_name>.png.
        base_name = image_name.split('_img')[0]
        image_full_path = f'images/{base_name}/{image_name}.png'
        img = Image.open(image_full_path)
        # The last line of 'content' is expected to be 'title:<image title>';
        # maxsplit=1 keeps titles that themselves contain a colon intact.
        image_title = result[1].split('\n')[-1].split(':', 1)[1]
        # img.show(title=image_title)
        images.append((img, image_title, result[2]))
    return images
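
# For non-Chinese queries, pass any lang other than 'CN' so the query is split
# on whitespace instead of being segmented by jieba, e.g. (hypothetical query):
#   results = search("receiver threshold", lang='EN')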
jieba.cut("") # 用于预加载中文分词词典,建议提前运行这段命令
# results = search("IF-428x接收端阈值")
results = search("简化结构图")
for result in results:
print(result[1], result[2]) # result[0] 是图片的 PIL.Image 对象, result[1]是title,2是相似度打分
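
# To view a hit interactively (window title support varies by platform):
#   results[0][0].show()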