# pdfImgSearcher / img_search.py
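"""Search PDF-extracted figure images by the text indexed alongside them.

Queries a Whoosh full-text index using jieba for Chinese word segmentation
and returns the matching images (loaded via PIL) with a title and score.
"""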
import jieba
from PIL import Image
from whoosh.analysis import Token, Tokenizer
from whoosh.index import open_dir
from whoosh.qparser import QueryParser


class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # cut_all=True emits every possible segmentation (overlapping words),
        # trading precision for recall.
        seglist = jieba.cut(value, cut_all=True)
        for pos, w in enumerate(seglist):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + pos
            if chars:
                # find() locates the first occurrence only, so character
                # offsets of repeated words are approximate.
                offset = value.find(w)
                t.startchar = start_char + offset
                t.endchar = start_char + offset + len(w)
            yield t


def ChineseAnalyzer():
    """Return a Whoosh analyzer that tokenizes with jieba."""
    return ChineseTokenizer()
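
# A minimal sketch of how the 'indexes' directory is presumably built with this
# analyzer (the indexing script is not in this file; the field names are
# inferred from search() below, and the sample document is hypothetical):
#
#   from whoosh.fields import Schema, TEXT, ID
#   from whoosh.index import create_in
#
#   schema = Schema(file_name=ID(stored=True),
#                   content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
#   ix = create_in('indexes', schema)
#   writer = ix.writer()
#   writer.add_document(file_name='doc1_img1', content='...\ntitle:示例图')
#   writer.commit()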


def search(query, lang='CN', k=10):
    """Return up to k (PIL.Image, title, score) tuples for the query."""
    ix = open_dir('indexes')
    # Tokenize the query string and join the tokens with OR so that a hit
    # on any single token counts as a match.
    if lang == 'CN':
        query_tokens = jieba.cut(query, cut_all=True)
    else:
        query_tokens = query.split()
    or_query = " OR ".join(query_tokens)
    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(or_query)
    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=k)
        # Materialize the file names, descriptions, and scores of the top-k
        # hits while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score)
                        for hit in results]
    images = []
    for result in results_list:
        print(result)
        image_name = result[0]
        # Build the image path: file names look like '<doc>_img<n>' and the
        # corresponding files live at images/<doc>/<file_name>.png.
        base_name = image_name.split('_img')[0]
        image_full_path = f'images/{base_name}/{image_name}.png'
        img = Image.open(image_full_path)
        # The last line of 'content' is expected to be 'title:<image title>';
        # maxsplit=1 keeps titles that themselves contain a colon intact.
        image_title = result[1].split('\n')[-1].split(':', 1)[1]
        # img.show(title=image_title)
        images.append((img, image_title, result[2]))
    return images
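
# For non-Chinese queries, pass any lang other than 'CN' so the query is split
# on whitespace instead of being segmented by jieba, e.g. (hypothetical query):
#   results = search("receiver threshold", lang='EN')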
jieba.cut("") # 用于预加载中文分词词典,建议提前运行这段命令
# results = search("IF-428x接收端阈值")
results = search("简化结构图")
for result in results:
print(result[1], result[2]) # result[0] 是图片的 PIL.Image 对象, result[1]是title,2是相似度打分
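
# To view a hit interactively (window title support varies by platform):
#   results[0][0].show()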