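# NOTE(review): this header originally read "Spaces: Running Running" -- it looks
# like page-status text captured together with the file, not part of the program.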
import jieba | |
from whoosh.qparser import QueryParser | |
from PIL import Image | |
from whoosh.index import open_dir | |
from whoosh.analysis import Tokenizer, Token | |
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments text with jieba in full mode.

    Full mode (``cut_all=True``) emits every dictionary word jieba can find,
    so the produced tokens may overlap each other in the source text.
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        """Yield one token per non-blank segment jieba finds in ``value``.

        Args:
            value: The text to tokenize.
            positions: When true, set ``pos`` on each token.
            chars: When true, set ``startchar`` and ``endchar`` on each token.
            keeporiginal: Accepted for interface compatibility; not used here.
            removestops: Forwarded to the ``Token`` constructor.
            start_pos: Base added to every token's ``pos``.
            start_char: Base added to every token's character offsets.
            mode: Forwarded to the ``Token`` constructor.
            **kwargs: Forwarded to the ``Token`` constructor.

        Yields:
            The same ``Token`` instance, mutated in place for each segment,
            so consumers must copy any fields they want to keep.
        """
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # Offset of the most recently located segment. Searching from here
        # (rather than from 0) lets a repeated word resolve to its own
        # occurrence instead of always to the first one, while still allowing
        # overlapping segments that start at the same offset.
        cursor = 0
        for w in jieba.cut(value, cut_all=True):
            # Full mode may emit empty strings around punctuation; such
            # segments carry no searchable text, so they are dropped.
            if not w.strip():
                continue

            offset = value.find(w, cursor)
            if offset < 0:
                # Segment was not found at or after the cursor: fall back to
                # a search over the whole text.
                offset = value.find(w)
            if offset < 0:
                # Still not found: reuse the last known offset rather than
                # producing a negative position.
                offset = cursor
            else:
                cursor = offset

            t.original = t.text = w
            t.boost = 1.0
            if positions:
                # NOTE: pos is a character offset here, not an ordinal token
                # index; kept as-is so existing indexes stay comparable.
                t.pos = start_pos + offset
            if chars:
                # Both ends are set whenever character offsets are requested,
                # independent of whether positions were requested too.
                t.startchar = start_char + offset
                t.endchar = start_char + offset + len(w)
            yield t
def ChineseAnalyzer():
    """Return a fresh ``ChineseTokenizer`` to be used as the analyzer."""
    tokenizer = ChineseTokenizer()
    return tokenizer
def search(query, lang='CN', k=10):
    """Search the on-disk index and load the image for each of the top hits.

    The query is split into tokens (jieba full mode when ``lang == 'CN'``,
    whitespace split otherwise), the tokens are joined with ``OR`` and the
    result is parsed against the ``content`` field of the index stored in the
    ``indexes`` directory.

    Args:
        query: The user's query string.
        lang: ``'CN'`` to tokenize with jieba; any other value splits on
            whitespace.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(image, title, score)`` tuples, one per hit, where
        ``image`` is the opened ``PIL.Image``, ``title`` is the text after the
        first ``:`` on the last line of the hit's ``content`` (or the whole
        last line when it has no ``:``), and ``score`` is the hit's score.
        An empty list is returned when the query yields no usable tokens.

    Raises:
        Whatever ``open_dir`` or ``Image.open`` raise when the index or an
        image file cannot be opened.
    """
    # Tokenize the query string and join tokens with OR operator
    if lang == 'CN':
        raw_tokens = jieba.cut(query, cut_all=True)
    else:
        raw_tokens = query.split()
    # jieba's full mode may emit empty strings around punctuation; joining
    # them would produce a malformed query such as "a OR  OR b".
    query_tokens = [token for token in raw_tokens if token.strip()]
    if not query_tokens:
        return []
    or_query = " OR ".join(query_tokens)

    ix = open_dir('indexes')
    try:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(or_query)
        with ix.searcher() as searcher:
            results = searcher.search(myquery, limit=k)
            # Stored fields are copied out while the searcher is still open.
            results_list = [(hit['file_name'], hit['content'], hit.score)
                            for hit in results]
    finally:
        # Release the index even when parsing or searching fails.
        ix.close()

    images = []
    for result in results_list:
        print(result)
        image_name, content, score = result
        # Build the image path: images/<prefix before "_img">/<file_name>.png
        base_name = image_name.split('_img')[0]
        image_full_path = 'images/' + base_name + '/' + image_name + '.png'
        img = Image.open(image_full_path)
        # The title follows the first ':' on the last line of the content.
        # partition() keeps any further ':' inside the title and lets a line
        # without ':' fall back to the whole line instead of raising.
        last_line = content.split('\n')[-1]
        _, separator, title_text = last_line.partition(':')
        image_title = title_text if separator else last_line
        images.append((img, image_title, score))
    return images
# Preload jieba's segmentation dictionary up front so the first real query
# does not pay the loading cost. jieba.cut() only returns a lazy generator,
# so calling it without iterating loads nothing; initialize() does the load.
jieba.initialize()
# Alternative example query: search("IF-428x接收端阈值")
results = search("简化结构图")
for result in results:
    # result[0] is the PIL.Image object, result[1] the title,
    # result[2] the relevance score.
    print(result[1], result[2])