Spaces:
Runtime error
Runtime error
fix bugs
Browse files- README.md +13 -13
- app.py +54 -54
- images/placeholder +0 -0
- img_search.py +68 -0
- indexes/placeholder +0 -0
- pdfImage.py +292 -252
- requirements.txt +0 -0
- utils.py +263 -261
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
---
|
2 |
-
title: PdfImgSearcher
|
3 |
-
emoji: 😻
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: green
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.41.2
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: other
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: PdfImgSearcher
|
3 |
+
emoji: 😻
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.41.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: other
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,54 +1,54 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from pdfImage import *
|
3 |
-
|
4 |
-
|
5 |
-
done = False
|
6 |
-
engine = None
|
7 |
-
tmp_dir = None
|
8 |
-
|
9 |
-
|
10 |
-
def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
|
11 |
-
global done, engine, tmp_dir
|
12 |
-
if not done:
|
13 |
-
# Load PDF, Convert to Image, Description, and Index
|
14 |
-
tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
|
15 |
-
ix, _ = build_index(file.name, tmp_dir, lang)
|
16 |
-
engine = ix
|
17 |
-
done = True
|
18 |
-
results_list = search(engine, query, lang)
|
19 |
-
return return_image(file.name, results_list, tmp_dir)
|
20 |
-
|
21 |
-
# Ensure that the image save directory and index directory are deleted
|
22 |
-
# base_name = os.path.basename(file).split('.')[0]
|
23 |
-
# path_name = f'images{base_name}'
|
24 |
-
# index_path = f'{base_name}_index_dir'
|
25 |
-
# if os.path.exists(path_name):
|
26 |
-
# shutil.rmtree(path_name)
|
27 |
-
# if os.path.exists(index_path):
|
28 |
-
# shutil.rmtree(index_path)
|
29 |
-
# return titles, images
|
30 |
-
|
31 |
-
|
32 |
-
def display_images(*images):
|
33 |
-
return images
|
34 |
-
|
35 |
-
|
36 |
-
iface = gr.Interface(
|
37 |
-
fn=main_interface,
|
38 |
-
inputs=[
|
39 |
-
gr.inputs.File(label="Upload PDF"),
|
40 |
-
gr.inputs.Number(default=300, label="DPI"),
|
41 |
-
gr.inputs.Number(default=0, label="Skip Front Page"),
|
42 |
-
gr.inputs.Number(default=1, label="Skip Back Page"),
|
43 |
-
gr.inputs.Number(default=5, label="Skip Block"),
|
44 |
-
gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
|
45 |
-
gr.inputs.Textbox(label="Search Query")
|
46 |
-
],
|
47 |
-
outputs=[
|
48 |
-
gr.outputs.Textbox(label="Title"),
|
49 |
-
gr.outputs.Image(type="pil", label="Image")
|
50 |
-
],
|
51 |
-
live=False
|
52 |
-
)
|
53 |
-
|
54 |
-
iface.launch()
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from pdfImage import *
|
3 |
+
|
4 |
+
|
5 |
+
done = False
|
6 |
+
engine = None
|
7 |
+
tmp_dir = None
|
8 |
+
|
9 |
+
|
10 |
+
def main_interface(file, dpi, skip_page_front, skip_page_back, skip_block, lang, query):
    """Gradio entry point: index the uploaded PDF on the first call, then
    answer search queries against the cached index.

    :param file: uploaded file object (gradio File; ``file.name`` is its path)
    :param dpi: render resolution for page rasterisation
    :param skip_page_front: number of leading pages to ignore
    :param skip_page_back: number of trailing pages to ignore
    :param skip_block: number of leading text blocks to ignore per page
    :param lang: 'CN' or 'EN'
    :param query: free-text search query
    :return: (title, PIL image) of the best hit
    """
    global done, engine, tmp_dir
    if not done:
        # One-time setup: rasterise the PDF, extract descriptions, build the index.
        tmp_dir = load_pdf(file.name, dpi, skip_page_front, skip_page_back, skip_block, lang)
        ix, _ = build_index(file.name, tmp_dir, lang)
        engine = ix
        done = True
    results_list = search(engine, query, lang)
    return return_image(file.name, results_list, tmp_dir)
|
20 |
+
|
21 |
+
# Ensure that the image save directory and index directory are deleted
|
22 |
+
# base_name = os.path.basename(file).split('.')[0]
|
23 |
+
# path_name = f'images{base_name}'
|
24 |
+
# index_path = f'{base_name}_index_dir'
|
25 |
+
# if os.path.exists(path_name):
|
26 |
+
# shutil.rmtree(path_name)
|
27 |
+
# if os.path.exists(index_path):
|
28 |
+
# shutil.rmtree(index_path)
|
29 |
+
# return titles, images
|
30 |
+
|
31 |
+
|
32 |
+
def display_images(*images):
    """Return the received positional arguments unchanged, as a tuple."""
    return images
|
34 |
+
|
35 |
+
|
36 |
+
# Wire up the Gradio UI (legacy gr.inputs / gr.outputs API of gradio 3.x).
iface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.inputs.File(label="Upload PDF"),
        gr.inputs.Number(default=300, label="DPI"),
        gr.inputs.Number(default=0, label="Skip Front Page"),
        gr.inputs.Number(default=1, label="Skip Back Page"),
        gr.inputs.Number(default=5, label="Skip Block"),
        gr.inputs.Dropdown(choices=["CN", "EN"], default="CN", label="Language"),
        gr.inputs.Textbox(label="Search Query"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Title"),
        gr.outputs.Image(type="pil", label="Image"),
    ],
    live=False,
)

iface.launch()
|
images/placeholder
ADDED
File without changes
|
img_search.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import jieba
|
2 |
+
from whoosh.qparser import QueryParser
|
3 |
+
from PIL import Image
|
4 |
+
from whoosh.index import open_dir
|
5 |
+
from whoosh.analysis import Tokenizer, Token
|
6 |
+
|
7 |
+
|
8 |
+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba (full mode)."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        token = Token(positions, chars, removestops=removestops, mode=mode,
                      **kwargs)
        for word in jieba.cut(value, cut_all=True):
            token.original = token.text = word
            token.boost = 1.0
            # NOTE(review): value.find(word) locates only the FIRST occurrence,
            # so positions/offsets are wrong for repeated words — confirm
            # whether exact offsets matter for this index.
            offset = value.find(word)
            if positions:
                token.pos = start_pos + offset
            if chars:
                token.startchar = start_char + offset
            if chars and positions:
                token.endchar = start_char + offset + len(word)
            yield token
|
25 |
+
|
26 |
+
|
27 |
+
def ChineseAnalyzer():
    """Factory returning the jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
|
29 |
+
|
30 |
+
|
31 |
+
def search(query, lang='CN', k=10):
    """Query the on-disk whoosh index in 'indexes/' and load the hit images.

    :param query: free-text query string
    :param lang: 'CN' tokenizes with jieba, anything else splits on whitespace
    :param k: maximum number of hits
    :return: list of (PIL.Image, title, score) tuples
    """
    ix = open_dir('indexes')

    # Tokenize the query string and join tokens with OR operator
    if lang == 'CN':
        query_tokens = jieba.cut(query, cut_all=True)
    else:
        query_tokens = query.split()
    or_query = " OR ".join(query_tokens)

    parser = QueryParser("content", ix.schema)
    myquery = parser.parse(or_query)

    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=k)
        # Hits must be materialized while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]

    images = []
    for result in results_list:
        print(result)
        image_name = result[0]
        base_name = image_name.split('_img')[0]
        # Reconstruct the image path from the indexed file name.
        image_full_path = 'images/' + base_name + '/' + image_name + '.png'
        img = Image.open(image_full_path)
        # BUG FIX: the title is the FIRST line of the stored content
        # ("title: ..."); the LAST line is "base name:...", so splitting on
        # [-1] returned the base name instead of the title.
        image_title = result[1].split('\n')[0].split(':')[-1]
        images.append((img, image_title, result[2]))

    return images
|
62 |
+
|
63 |
+
|
64 |
+
# Warm up jieba's dictionary ahead of the first real query.
jieba.cut("")
# results = search("IF-428x接收端阈值")
results = search("简化结构图")
for result in results:
    # result[0] is the PIL.Image object; result[1] is the title; result[2] the score
    print(result[1], result[2])
|
indexes/placeholder
ADDED
File without changes
|
pdfImage.py
CHANGED
@@ -1,252 +1,292 @@
|
|
1 |
-
import fitz
|
2 |
-
from PIL import Image
|
3 |
-
from utils import *
|
4 |
-
from whoosh.analysis import Tokenizer, Token
|
5 |
-
import jieba
|
6 |
-
from whoosh.index import create_in
|
7 |
-
from whoosh.fields import *
|
8 |
-
from whoosh.qparser import QueryParser
|
9 |
-
import os
|
10 |
-
import shutil
|
11 |
-
import tempfile
|
12 |
-
|
13 |
-
LOGO_WIDTH = 398
|
14 |
-
LOGO_HEIGHT = 137
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
:param
|
47 |
-
:param
|
48 |
-
:param
|
49 |
-
:
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
#
|
69 |
-
|
70 |
-
#
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
#
|
147 |
-
#
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
#
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
#
|
194 |
-
|
195 |
-
#
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
#
|
250 |
-
#
|
251 |
-
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fitz
|
2 |
+
from PIL import Image
|
3 |
+
from utils import *
|
4 |
+
from whoosh.analysis import Tokenizer, Token
|
5 |
+
import jieba
|
6 |
+
from whoosh.index import create_in
|
7 |
+
from whoosh.fields import *
|
8 |
+
from whoosh.qparser import QueryParser
|
9 |
+
import os
|
10 |
+
import shutil
|
11 |
+
# import tempfile
|
12 |
+
|
13 |
+
LOGO_WIDTH = 398
|
14 |
+
LOGO_HEIGHT = 137
|
15 |
+
|
16 |
+
ix = None
|
17 |
+
writer = None
|
18 |
+
|
19 |
+
|
20 |
+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba (full mode)."""

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        token = Token(positions, chars, removestops=removestops, mode=mode,
                      **kwargs)
        for word in jieba.cut(value, cut_all=True):
            token.original = token.text = word
            token.boost = 1.0
            # NOTE(review): value.find(word) locates only the FIRST occurrence,
            # so positions/offsets are wrong for repeated words — confirm
            # whether exact offsets matter for this index.
            offset = value.find(word)
            if positions:
                token.pos = start_pos + offset
            if chars:
                token.startchar = start_char + offset
            if chars and positions:
                token.endchar = start_char + offset + len(word)
            yield token
|
37 |
+
|
38 |
+
|
39 |
+
def ChineseAnalyzer():
    """Factory returning the jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
|
41 |
+
|
42 |
+
|
43 |
+
def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
    """
    Load a PDF, rasterise its pages, crop out each embedded image, and write a
    .txt description (caption + surrounding text) next to every cropped .png.

    :param file: PDF path; gradio temp paths are opened directly, anything
                 else is resolved under 'using_pdfs/'
    :param dpi: target render resolution (PDF default is 72)
    :param skip_page_front: pages to skip at the start
    :param skip_page_back: pages to skip at the end
    :param skip_block: leading text blocks to ignore on each page
    :param lang: 'CN' or 'EN', forwarded to the text-extraction helpers
    :return: directory ('images/<base_name>') holding the .png/.txt pairs
    """
    if file.__contains__('\\gradio\\'):
        print('gradio file')
        doc = fitz.open(file)
    else:
        print('local file')
        doc = fitz.open('using_pdfs/' + file)

    # Load all pages up front.
    pages = []
    for i in range(doc.page_count):
        pages.append(doc.load_page(i))

    # Scale factor relative to the PDF's native 72 dpi.
    dpi = int(dpi)
    scale = dpi / 72
    matrix = fitz.Matrix(scale, scale)
    skip_block = int(skip_block)

    # Fresh output directory per document: images/<base_name>
    base_name = os.path.basename(file).split('.')[0]
    path_name = f'images/{base_name}'
    if os.path.exists(path_name):
        shutil.rmtree(path_name)
    os.mkdir(path_name)
    temp_image_dir = path_name

    for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip front/back pages

        # Part 1: bitmap images embedded in the page's text dict.
        blocks = page.get_text('dict')['blocks']
        page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
        page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)

        saved = []  # (file_name, bbox) pairs; removed again if an SVG clip covers them
        for i, block in enumerate(blocks[int(skip_block):]):  # head blocks ignored
            if 'image' in block:
                bbox = block['bbox']
                # Skip images of roughly 398x137 points — typically the logo.
                if (bbox[2] - bbox[0]) * scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1]) * scale - LOGO_HEIGHT <= 10:
                    continue
                # Crop using bbox scaled to pixel coordinates.
                cropped = page_im.crop([int(c * scale) for c in bbox])
                number = block['number']

                file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
                image_name = file_name + '.png'
                cropped.save(image_name)

                # Caption + nearby text become the searchable description.
                text_content = get_text_around_image(blocks[skip_block:], i, lang)
                title = get_title_of_image(blocks[skip_block:], i, lang)
                with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                    text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

                saved.append((file_name, [int(c * scale) for c in bbox]))

        # Part 2: vector (SVG) images on the same page.
        svg = page.get_svg_image(matrix=fitz.Identity)
        image_clips, svg_blocks = parse_page_svg(svg, page.number)
        for clip in image_clips:

            # Parse the transform values; pad leading '.'/'-.' for float().
            transform = []
            for item in clip[0]:
                if item[0] == '.':
                    transform.append(float('0' + item))
                elif item[0] == '-':
                    transform.append(float('-0' + item[1:]))
                else:
                    transform.append(float(item))
            d = clip[1]
            page_id = clip[2]
            block_id = clip[3]

            # Clip-path box size comes from the H.../V... path commands.
            matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
            float_values = [float(value) for value in matches[0]]
            box_width = float_values[0]
            box_height = float_values[1]
            width_scale = transform[0]
            height_scale = transform[3]
            width_move = transform[4]
            height_move = transform[5]
            x1 = width_move * scale
            y1 = height_move * scale
            x2 = x1 + box_width * width_scale * scale
            y2 = y1 + box_height * height_scale * scale
            if y1 > y2:
                y1, y2 = y2, y1

            # Drop any bitmap crop that this SVG clip duplicates or contains.
            for i, (file_name, bbox) in enumerate(saved):
                if (abs(bbox[0] - x1) < 10
                        and abs(bbox[1] - y1) < 10
                        and abs(bbox[2] - x2) < 10
                        and abs(bbox[3] - y2) < 10) or \
                        (bbox[0] > x1 - 10 and bbox[1] > y1 - 10 and bbox[2] < x2 + 10 and bbox[3] < y2 + 10):
                    os.remove(file_name + '.png')
                    os.remove(file_name + '.txt')
                    saved.pop(i)
                    break

            # Crop and save the SVG-clipped region.
            cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
            file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
            image_name = file_name + '.png'
            cropped_img.save(image_name)

            # Caption + nearby SVG text become the searchable description.
            text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
            title = get_svg_title_around_image(svg_blocks, block_id, lang)
            with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

    print(temp_image_dir)
    return temp_image_dir
|
182 |
+
|
183 |
+
|
184 |
+
def build_index(file, tmp_dir, lang='CN'):
    """
    Add the .txt description files under *tmp_dir* to the shared whoosh index
    in 'indexes/', creating the index on first use.

    :param file: PDF path (retained for interface compatibility)
    :param tmp_dir: directory containing <name>.txt description files
    :param lang: 'CN' uses the jieba-based ChineseAnalyzer for the content field
    :return: (index object, index directory path)
    """
    # Define the schema for the index.
    if lang == 'CN':
        schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
    else:
        schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))

    index_path = 'indexes/'
    temp_index_dir = index_path

    # Create the index once; reuse the module-level handle on later calls.
    global ix
    if ix is None:
        ix = create_in(temp_index_dir, schema)
    global writer
    if writer is None:
        writer = ix.writer()

    # Add every description file as one document.
    for entry in os.listdir(tmp_dir):
        if entry.endswith('.txt'):
            file_path = os.path.join(tmp_dir, entry)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            writer.add_document(file_name=entry[:-4], content=content)
            print('==========')
            print(content)
            print("==========")

    writer.commit()
    # BUG FIX: a committed whoosh writer cannot accept further documents.
    # Reset the global so the next call opens a fresh writer instead of
    # reusing the dead one and crashing.
    writer = None
    return ix, temp_index_dir
|
225 |
+
|
226 |
+
|
227 |
+
def search(ix, query, lang='CN', k=10):
    """Run an OR-of-tokens query against *ix*.

    :param ix: open whoosh index
    :param query: free-text query string
    :param lang: 'CN' tokenizes with jieba, anything else splits on whitespace
    :param k: maximum number of hits
    :return: list of (file_name, content, score) tuples for the top hits
    """
    # Tokenize the query string and join tokens with the OR operator.
    tokens = jieba.cut(query, cut_all=True) if lang == 'CN' else query.split()
    or_query = " OR ".join(tokens)

    parser = QueryParser("content", ix.schema)
    parsed = parser.parse(or_query)

    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=k)
        # Materialize the hits while the searcher is still open.
        results_list = [(hit['file_name'], hit['content'], hit.score) for hit in hits]

    return results_list
|
246 |
+
|
247 |
+
|
248 |
+
def return_image(file, results_list, tmp_dir):
    """
    Resolve search hits to (title, image) and return the top one.

    :param file: PDF path (retained for interface compatibility)
    :param results_list: (file_name, content, score) tuples from search()
    :param tmp_dir: directory holding the cropped .png files
    :return: (title, PIL.Image) of the best hit
    :raises ValueError: if results_list is empty
    """
    # BUG FIX: previously an empty result list crashed with an opaque
    # IndexError on titles[0]; fail with a clear message instead.
    if not results_list:
        raise ValueError('no search results to display')

    titles = []
    images = []
    for result in results_list:
        # First line of the stored content is "title: ...".
        title = result[1].split('\n')[0].split(':')[-1]
        titles.append(title)
        images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
    return titles[0], images[0]
|
258 |
+
|
259 |
+
|
260 |
+
# file = 'CA-IS372x-datasheet_cn.pdf'
|
261 |
+
# file = 'CA-IS3086 datasheet_cn.pdf'
|
262 |
+
# temp_image_dir = load_pdf(file, lang='CN')
|
263 |
+
# ix, temp_index_dir = build_index(file, temp_image_dir)
|
264 |
+
# results_list = search(ix, "波形", lang='CN', k=10)
|
265 |
+
# ret_img = return_image(file, results_list, temp_image_dir)
|
266 |
+
# print('title: ' + ret_img[0])
|
267 |
+
# ret_img[1].show()
|
268 |
+
|
269 |
+
# print(os.listdir('using_pdfs'))
|
270 |
+
|
271 |
+
# import tqdm
|
272 |
+
# for file in tqdm.tqdm(os.listdir('using_pdfs')):
|
273 |
+
# tmd_dir = load_pdf(file)
|
274 |
+
# ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
|
275 |
+
# #
|
276 |
+
# writer.commit()
|
277 |
+
|
278 |
+
# from whoosh.index import open_dir
|
279 |
+
# search_ix = open_dir('indexes')
|
280 |
+
# query = "IF-428x接收端阈值"
|
281 |
+
# results = search(search_ix, query, lang='CN', k=10)
|
282 |
+
# for result in results:
|
283 |
+
# print(result)
|
284 |
+
#
|
285 |
+
# from PIL import Image
|
286 |
+
#
|
287 |
+
# for result in results:
|
288 |
+
# image_name = result[0]
|
289 |
+
# base_name = image_name.split('_img')[0]
|
290 |
+
# img = Image.open('images/' + base_name + '/' + image_name + '.png')
|
291 |
+
# image_title = result[1].split('\n')[0].split(':')[1]
|
292 |
+
# img.show(title=image_title)
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
utils.py
CHANGED
@@ -1,261 +1,263 @@
|
|
1 |
-
import xml.etree.ElementTree as ET
|
2 |
-
|
3 |
-
def get_adjacent_lines(blocks, block_index):
|
4 |
-
"""
|
5 |
-
Returns two lists: the lines of text before and after the block at block_index.
|
6 |
-
Each list contains lines in order from closest to furthest from the block.
|
7 |
-
"""
|
8 |
-
def is_same_line(origin1, origin2):
|
9 |
-
# Adjust this threshold if needed
|
10 |
-
THRESHOLD = 10
|
11 |
-
return abs(origin1[1] - origin2[1]) < THRESHOLD
|
12 |
-
|
13 |
-
def extract_spans_from_blocks(target_blocks):
|
14 |
-
spans = []
|
15 |
-
for block in target_blocks:
|
16 |
-
if 'lines' in block:
|
17 |
-
for line in block['lines']:
|
18 |
-
for span in line['spans']:
|
19 |
-
spans.append(span)
|
20 |
-
return spans
|
21 |
-
|
22 |
-
def merge_spans_to_lines(spans):
|
23 |
-
if not spans:
|
24 |
-
return []
|
25 |
-
|
26 |
-
lines = []
|
27 |
-
current_line = spans[0]['text']
|
28 |
-
current_origin = spans[0]['origin']
|
29 |
-
|
30 |
-
for span in spans[1:]:
|
31 |
-
if is_same_line(span['origin'], current_origin):
|
32 |
-
current_line += " " + span['text']
|
33 |
-
else:
|
34 |
-
lines.append(current_line.strip())
|
35 |
-
current_line = span['text']
|
36 |
-
current_origin = span['origin']
|
37 |
-
|
38 |
-
lines.append(current_line.strip())
|
39 |
-
return lines
|
40 |
-
|
41 |
-
spans_before = extract_spans_from_blocks(blocks[:block_index])
|
42 |
-
spans_after = extract_spans_from_blocks(blocks[block_index + 1:])
|
43 |
-
|
44 |
-
lines_before = merge_spans_to_lines(spans_before)
|
45 |
-
lines_after = merge_spans_to_lines(spans_after)
|
46 |
-
|
47 |
-
return lines_before, lines_after
|
48 |
-
|
49 |
-
|
50 |
-
def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
|
51 |
-
before_lines, after_lines = get_adjacent_lines(blocks, image_index)
|
52 |
-
|
53 |
-
# print(before_lines)
|
54 |
-
# print(after_lines)
|
55 |
-
text_content = ""
|
56 |
-
counter = word_count
|
57 |
-
|
58 |
-
# Process lines before the image
|
59 |
-
for line in reversed(before_lines):
|
60 |
-
text_content = line + '\n' + text_content
|
61 |
-
if lang == 'CN':
|
62 |
-
counter -= len(line)
|
63 |
-
else:
|
64 |
-
counter -= len(line.split(' '))
|
65 |
-
if counter <= 0:
|
66 |
-
break
|
67 |
-
|
68 |
-
# Reset the word counter for lines after the image
|
69 |
-
counter = word_count
|
70 |
-
|
71 |
-
# Process lines after the image
|
72 |
-
for line in after_lines:
|
73 |
-
text_content += line + '\n'
|
74 |
-
if lang == 'CN':
|
75 |
-
counter -= len(line)
|
76 |
-
else:
|
77 |
-
counter -= len(line.split(' '))
|
78 |
-
if counter <= 0:
|
79 |
-
break
|
80 |
-
|
81 |
-
return text_content.strip()
|
82 |
-
|
83 |
-
|
84 |
-
def get_title_of_image(blocks, image_index, lang='CN'):
|
85 |
-
before_lines, after_lines = get_adjacent_lines(blocks, image_index)
|
86 |
-
|
87 |
-
# Search for a title in the lines before the image
|
88 |
-
title = None
|
89 |
-
for line in reversed(before_lines):
|
90 |
-
if lang == 'CN' and '图' in line:
|
91 |
-
title = f"title: {line}"
|
92 |
-
break
|
93 |
-
elif 'figure' in line.lower():
|
94 |
-
title = f"title: {line}"
|
95 |
-
break
|
96 |
-
|
97 |
-
# Search for a title in the lines after the image
|
98 |
-
for line in after_lines:
|
99 |
-
if lang == 'CN' and '图 ' in line:
|
100 |
-
return f"title: {line}"
|
101 |
-
elif 'figure' in line.lower():
|
102 |
-
return f"title: {line}"
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
arr.append(float('
|
117 |
-
|
118 |
-
arr.append(float(item))
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
# print(
|
187 |
-
# print(
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
counter -= len(line
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
counter -= len(line
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
for
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
return f"title: {line}"
|
258 |
-
elif '
|
259 |
-
return f"title: {line}"
|
260 |
-
|
261 |
-
|
|
|
|
|
|
1 |
+
import xml.etree.ElementTree as ET
|
2 |
+
|
3 |
+
def get_adjacent_lines(blocks, block_index):
    """
    Return two lists of text lines: those before and those after
    blocks[block_index]. Each list is ordered from the start of its span,
    with spans sharing (roughly) the same baseline merged into one line.
    """
    THRESHOLD = 10  # max vertical distance for spans considered the same line

    def same_line(origin_a, origin_b):
        return abs(origin_a[1] - origin_b[1]) < THRESHOLD

    def collect_spans(target_blocks):
        # Flatten all spans from blocks that actually carry text lines.
        return [span
                for block in target_blocks if 'lines' in block
                for line in block['lines']
                for span in line['spans']]

    def spans_to_lines(spans):
        if not spans:
            return []
        merged = []
        current_text = spans[0]['text']
        current_origin = spans[0]['origin']
        for span in spans[1:]:
            if same_line(span['origin'], current_origin):
                current_text += " " + span['text']
            else:
                merged.append(current_text.strip())
                current_text = span['text']
                current_origin = span['origin']
        merged.append(current_text.strip())
        return merged

    return (spans_to_lines(collect_spans(blocks[:block_index])),
            spans_to_lines(collect_spans(blocks[block_index + 1:])))
|
48 |
+
|
49 |
+
|
50 |
+
def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
    """
    Collect roughly *word_count* units of text on each side of the image
    block (characters for CN, whitespace-separated words otherwise).
    Lines closest to the image are taken first on both sides.
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    def line_cost(line):
        # CN budgets by character count, other languages by word count.
        return len(line) if lang == 'CN' else len(line.split(' '))

    text_content = ""

    # Lines before the image, walking backwards from the image.
    budget = word_count
    for line in reversed(before_lines):
        text_content = line + '\n' + text_content
        budget -= line_cost(line)
        if budget <= 0:
            break

    # Lines after the image, with a fresh budget.
    budget = word_count
    for line in after_lines:
        text_content += line + '\n'
        budget -= line_cost(line)
        if budget <= 0:
            break

    return text_content.strip()
|
82 |
+
|
83 |
+
|
84 |
+
def get_title_of_image(blocks, image_index, lang='CN'):
    """
    Find a figure caption near the image block.

    Searches the lines before the image (closest first) for a caption marker
    ('图' for CN, 'figure' otherwise), then the lines after it; a caption
    after the image wins immediately. Falls back to the closest preceding
    line, and finally to a "Not Found" placeholder.

    :return: a string of the form "title: <caption>"
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    # Search for a title in the lines before the image.
    title = None
    for line in reversed(before_lines):
        if lang == 'CN' and '图' in line:
            title = f"title: {line}"
            break
        elif 'figure' in line.lower():
            title = f"title: {line}"
            break

    # Search for a title in the lines after the image.
    for line in after_lines:
        if lang == 'CN' and '图 ' in line:
            return f"title: {line}"
        elif 'figure' in line.lower():
            return f"title: {line}"

    if title is None and before_lines:
        # Fallback: closest preceding line. BUG FIX: keep the "title: "
        # prefix so downstream split(':')[1] parsing does not break.
        title = f"title: {before_lines[-1]}"
    return title if title else "title: Not Found"
|
107 |
+
|
108 |
+
|
109 |
+
def transform_to_array(trans):
    """
    Parse an SVG 'matrix(a,b,c,d,e,f)' transform string into six floats.

    :param trans: e.g. "matrix(.5,0,0,-.5,10,20)"
    :return: list of floats [a, b, c, d, e, f]
    """
    values = trans.replace('matrix(', '').replace(')', '').split(',')
    # float() already accepts '.5' and '-.5', so the previous manual
    # '0'-padding of leading '.'/'-' was unnecessary.
    return [float(v) for v in values]
|
122 |
+
|
123 |
+
|
124 |
+
def parse_page_svg(svg, page_id):
    """Scan one page's SVG for image clip regions and ordered text blocks.

    Parameters:
        svg: the page rendered as an SVG string.
        page_id: page index, propagated into each image-clip record.

    Returns:
        (img_clips, blocks) where
        img_clips: list of (transform_values, path_d, page_id, block_id)
            tuples, one per clip region judged NOT to be a full-page clip
            (i.e. treated as an image).
        blocks: page content in reading order; text lines as strings and
            image placeholders as 'image_<block_id>'.
    """
    # Parse the SVG content.
    root = ET.fromstring(svg)

    # Page size, stripping the 'pt' unit suffix (e.g. width="612pt").
    width = int(float(root.get('width').replace('pt', '')))
    height = int(float(root.get('height').replace('pt', '')))

    # Collect all clipPath definitions, keyed by their id attribute.
    clips = {}
    for clip in root.findall('.//{http://www.w3.org/2000/svg}clipPath'):
        clips[clip.get('id')] = clip

    # The first <g> under the SVG root holds the page content.
    main_g = root.find('{http://www.w3.org/2000/svg}g')

    # Signature of a full-page clip rectangle in a path's "d" data; used
    # below to filter out page-sized clips that are not real images.
    page_size = f'H{width}V{height}'
    # NOTE(review): gs is never used afterwards — kept as-is (doc-only edit).
    gs = main_g.findall('{http://www.w3.org/2000/svg}g')

    block_id = 0
    img_clips = []   # image clip records found on this page
    blocks = []      # ordered text lines / image placeholders
    cache = ""       # text accumulated for the current line
    vertical = None  # y offset of the current text line (new line when it jumps > 10)
    horizon = None   # x offset of the last glyph (skips overprinted duplicates)
    # Walk every child <g> of the main group.
    for g in main_g.findall('{http://www.w3.org/2000/svg}g'):

        # Text groups start with a <use> element carrying a "data-text" attribute.
        first_child = list(g)[0] if g else None
        if first_child is not None and first_child.tag == "{http://www.w3.org/2000/svg}use" and 'data-text' in first_child.attrib:
            # Concatenate the glyphs of this group into text lines.
            for u in g.findall('{http://www.w3.org/2000/svg}use'):
                if 'data-text' in u.attrib:
                    # matrix(a,b,c,d,e,f): index 4 is the x translation,
                    # index 5 the y translation.
                    text_vertical = transform_to_array(u.get('transform'))[5]
                    text_horizon = transform_to_array(u.get('transform'))[4]
                    if vertical is None or abs(text_vertical - vertical) > 10:
                        # y moved by more than 10 units: start a new line and
                        # flush the previous one into blocks.
                        vertical = text_vertical
                        cache = cache.strip()
                        if cache != "":
                            blocks.append(cache)
                        cache = u.get('data-text')
                        block_id += 1
                    else:
                        # Same line: only append when x actually advanced,
                        # which skips duplicated/overprinted glyphs.
                        if horizon is None or abs(text_horizon - horizon) > 1:
                            horizon = text_horizon
                            cache += u.get('data-text')
            # NOTE(review): text remaining in `cache` after the outer loop is
            # never flushed into blocks — the last line of the page may be
            # dropped; confirm whether this is intended.
            continue

        clip_path = g.get('clip-path')
        if clip_path and '#clip_' in clip_path:
            # Group is clipped directly: resolve the referenced clipPath id
            # from e.g. 'url(#clip_42)'.
            clip_id = clip_path.split("#")[1].replace(')', '')
            if clip_id in clips:
                path = clips[clip_id].find('.//{http://www.w3.org/2000/svg}path')
                transform = path.get('transform')
                if not transform:
                    continue
                transform = transform.replace('matrix(', '').replace(')', '')
                d = path.get('d')
                # The matrix f component (index 5) — compared against the
                # page height to detect full-page clips.
                trans_height = int(float(transform.split(',')[5]))
                # Skip clips covering the whole page; the rest are images.
                if not (page_size in d or (transform and trans_height == height)):
                    img_clips.append((transform.split(','), d, page_id, block_id))
                    blocks.append(f'image_{block_id}')
                    block_id += 1
        else:
            # No direct clip: look for a clipped nested group instead.
            for sub_g in g.findall('.//{http://www.w3.org/2000/svg}g'):
                sub_clip_path = sub_g.get('clip-path')
                if sub_clip_path and '#clip_' in sub_clip_path:
                    sub_clip_id = sub_clip_path.split("#")[1].replace(')', '')
                    if sub_clip_id in clips:
                        sub_path = clips[sub_clip_id].find('.//{http://www.w3.org/2000/svg}path')
                        sub_d = sub_path.get('d')
                        sub_transform = sub_path.get('transform')
                        sub_transform = sub_transform.replace('matrix(', '').replace(')', '')
                        subtrans_height = int(float(sub_transform.split(',')[5]))
                        if not (page_size in sub_d or (sub_transform and subtrans_height == height)):
                            img_clips.append((sub_transform.split(','), sub_d, page_id, block_id))
                            blocks.append(f'image_{block_id}')
                            block_id += 1
                        # Only the first nested group with a resolvable clip
                        # is considered for this <g>.
                        break
    return img_clips, blocks
|
211 |
+
|
212 |
+
|
213 |
+
def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
    """Collect roughly word_count units of text on each side of an image.

    For lang == 'CN' a unit is one character; otherwise it is one
    space-separated token.  Whole lines are taken until the budget for a
    side is exhausted; lines are returned newline-joined, stripped.
    """
    def weight(line):
        # CN counts characters; other languages count space-split tokens.
        return len(line) if lang == 'CN' else len(line.split(' '))

    # Lines before the image, gathered nearest-first then restored to
    # document order.
    preceding = []
    budget = word_count
    for line in reversed(blocks[:block_id]):
        preceding.append(line)
        budget -= weight(line)
        if budget <= 0:
            break
    preceding.reverse()

    # Lines after the image, with a fresh budget.
    following = []
    budget = word_count
    for line in blocks[block_id + 1:]:
        following.append(line)
        budget -= weight(line)
        if budget <= 0:
            break

    joined = ''.join(part + '\n' for part in preceding + following)
    return joined.strip()
|
241 |
+
|
242 |
+
|
243 |
+
def get_svg_title_around_image(blocks, block_id, lang='CN'):
    """Find a figure caption near the image block at block_id.

    A caption line contains '图' (when lang == 'CN') or 'figure'
    (case-insensitive).  A caption found after the image takes precedence
    over one found before it; when neither exists, 'title: Not Found' is
    returned.
    """
    def is_caption(line):
        if lang == 'CN' and '图' in line:
            return True
        return 'figure' in line.lower()

    # A caption following the image wins.
    for line in blocks[block_id + 1:]:
        if is_caption(line):
            return f"title: {line}"

    # Otherwise fall back to the nearest caption preceding the image.
    for line in reversed(blocks[:block_id]):
        if is_caption(line):
            return f"title: {line}"

    return "title: Not Found"
|