Teboo committed on
Commit
a8696e1
·
verified ·
1 Parent(s): 127b2c3

Update pdfImage.py

Browse files
Files changed (1) hide show
  1. pdfImage.py +292 -292
pdfImage.py CHANGED
@@ -1,292 +1,292 @@
1
- import fitz
2
- from PIL import Image
3
- from utils import *
4
- from whoosh.analysis import Tokenizer, Token
5
- import jieba
6
- from whoosh.index import create_in
7
- from whoosh.fields import *
8
- from whoosh.qparser import QueryParser
9
- import os
10
- import shutil
11
- # import tempfile
12
-
13
- LOGO_WIDTH = 398
14
- LOGO_HEIGHT = 137
15
-
16
- ix = None
17
- writer = None
18
-
19
-
20
- class ChineseTokenizer(Tokenizer):
21
- def __call__(self, value, positions=False, chars=False,
22
- keeporiginal=False, removestops=True,
23
- start_pos=0, start_char=0, mode='', **kwargs):
24
- t = Token(positions, chars, removestops=removestops, mode=mode,
25
- **kwargs)
26
- seglist = jieba.cut(value, cut_all=True)
27
- for w in seglist:
28
- t.original = t.text = w
29
- t.boost = 1.0
30
- if positions:
31
- t.pos = start_pos + value.find(w)
32
- if chars:
33
- t.startchar = start_char + value.find(w)
34
- if chars and positions:
35
- t.endchar = start_char + value.find(w) + len(w)
36
- yield t
37
-
38
-
39
- def ChineseAnalyzer():
40
- return ChineseTokenizer()
41
-
42
-
43
- def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
44
- """
45
- Load pdf file, covert to image, description and index it
46
- :param lang:
47
- :param skip_block:
48
- :param skip_page_back:
49
- :param skip_page_front:
50
- :param dpi:
51
- :param file:
52
- :return:
53
- """
54
-
55
- if file.__contains__('\\gradio\\'):
56
- print('gradio file')
57
- doc = fitz.open(file)
58
- else:
59
- print('local file')
60
- doc = fitz.open('using_pdfs/' + file)
61
-
62
- # load pages
63
- pages = []
64
- for i in range(doc.page_count):
65
- page = doc.load_page(i)
66
- pages.append(page)
67
-
68
- # increase dpi to 300
69
- dpi = int(dpi)
70
- scale = dpi / 72 # default dpi of pdf is 72
71
- matrix = fitz.Matrix(scale, scale)
72
- skip_block = int(skip_block)
73
-
74
- base_name = os.path.basename(file).split('.')[0]
75
- path_name = f'images/{base_name}'
76
- if os.path.exists(path_name):
77
- shutil.rmtree(path_name)
78
- os.mkdir(path_name)
79
-
80
- temp_image_dir = path_name
81
- # temp_image_dir = tempfile.mkdtemp(prefix='images_')
82
-
83
- for page in pages[int(skip_page_front):-int(skip_page_back)]: # skip final page
84
-
85
- # part1: get image with description in png-pdf
86
- p1dict = page.get_text('dict')
87
- blocks = p1dict['blocks']
88
- page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
89
- page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)
90
-
91
- saved = [] # need to remove if inner a svg image
92
- for i, block in enumerate(blocks[int(skip_block):]): # head and tail of pages should be ignore
93
- if 'image' in block:
94
- # try:
95
- bbox = block['bbox']
96
- # skip image that width=398 and hight=137 -> Typically LOGO
97
- if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
98
- continue
99
- # Scale the bbox coordinates
100
- cropped = page_im.crop([int(i * scale) for i in bbox])
101
- number = block['number']
102
-
103
- file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
104
- image_name = file_name + '.png'
105
- # print(image_name)
106
- cropped.save(image_name)
107
- # # Handle text extraction around the image
108
- text_content = get_text_around_image(blocks[skip_block:], i, lang)
109
- title = get_title_of_image(blocks[skip_block:], i, lang)
110
- # print(text_content[:30])
111
- # print(title)
112
- with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
113
- text_file.write(title + '\n' + text_content.replace('\n', ' ')+ f'\nbase name:{base_name}')
114
-
115
- saved.append((file_name, [int(i * scale) for i in bbox]))
116
- # except:
117
- # pass
118
-
119
- # part2: get image with description in svg-pdf
120
- svg = page.get_svg_image(matrix=fitz.Identity)
121
- image_clips, svg_blocks = parse_page_svg(svg, page.number)
122
- for clip in image_clips:
123
-
124
- transform = []
125
- for item in clip[0]:
126
- # print(item, type(item))
127
- if item[0] == '.':
128
- transform.append(float('0' + item))
129
- elif item[0] == '-':
130
- transform.append(float('-0' + item[1:]))
131
- else:
132
- transform.append(float(item))
133
- d = clip[1]
134
- page_id = clip[2]
135
- block_id = clip[3]
136
- matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
137
- float_values = [float(value) for value in matches[0]]
138
- box_width = float_values[0]
139
- box_height = float_values[1]
140
- width_scale = transform[0]
141
- height_scale = transform[3]
142
- width_move = transform[4]
143
- height_move = transform[5]
144
- x1 = width_move * scale
145
- y1 = height_move * scale
146
- # x1=347*scale
147
- # y1=587*scale
148
- x2 = x1 + box_width * width_scale * scale
149
- y2 = y1 + box_height * height_scale * scale
150
-
151
- if y1 > y2:
152
- y1, y2 = y2, y1
153
-
154
- # print(x1, y1, x2, y2)
155
- # 3. 截取并保存图像
156
-
157
- # check images in saved, if in or similar, delete it from file system
158
- for i, (file_name, bbox) in enumerate(saved):
159
- if (abs(bbox[0] - x1) < 10\
160
- and abs(bbox[1] - y1) < 10\
161
- and abs(bbox[2] - x2) < 10\
162
- and abs(bbox[3] - y2) < 10) or \
163
- (bbox[0]>x1-10 and bbox[1]>y1-10 and bbox[2]<x2+10 and bbox[3]<y2+10):
164
- os.remove(file_name + '.png')
165
- os.remove(file_name + '.txt')
166
- saved.pop(i)
167
- break
168
-
169
- cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
170
- file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
171
- image_name = file_name + '.png'
172
- cropped_img.save(image_name)
173
-
174
- # search title and text
175
- text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
176
- title = get_svg_title_around_image(svg_blocks, block_id, lang)
177
- with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
178
- text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')
179
-
180
- print(temp_image_dir)
181
- return temp_image_dir
182
-
183
-
184
- def build_index(file, tmp_dir, lang='CN'):
185
- # Define the schema for the index
186
- if lang == 'CN':
187
- schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
188
- else:
189
- schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))
190
-
191
- base_name = os.path.basename(file).split('.')[0]
192
- path_name = f'{base_name}'
193
- # index_path = 'indexes/' + path_name + '_index_dir'
194
- index_path = 'indexes/'
195
- # Create an index in a directory
196
- # if os.path.exists(index_path):
197
- # shutil.rmtree(index_path)
198
- # os.mkdir(index_path)
199
- temp_index_dir = index_path
200
- # temp_index_dir = tempfile.mkdtemp(prefix='index_')
201
-
202
- global ix
203
- if ix is None:
204
- ix = create_in(temp_index_dir, schema)
205
- global writer
206
- if writer is None:
207
- writer = ix.writer()
208
-
209
- # Add documents to the index
210
- # base_name = os.path.basename(file).split('.')[0]
211
- # image_path = f'images{base_name}'
212
- # writer = ix.writer()
213
- for file in os.listdir(tmp_dir):
214
- if file.endswith('.txt'):
215
- file_path = os.path.join(tmp_dir, file)
216
- with open(file_path, 'r', encoding='utf-8') as f:
217
- content = f.read()
218
- writer.add_document(file_name=file[:-4], content=content)
219
- print('==========')
220
- print(content)
221
- print("==========")
222
-
223
- writer.commit()
224
- return ix, temp_index_dir
225
-
226
-
227
- def search(ix, query, lang='CN', k=10):
228
-
229
- # Tokenize the query string and join tokens with OR operator
230
- if lang == 'CN':
231
- query_tokens = jieba.cut(query, cut_all=True)
232
- else:
233
- query_tokens = query.split()
234
- or_query = " OR ".join(query_tokens)
235
-
236
- parser = QueryParser("content", ix.schema)
237
- myquery = parser.parse(or_query)
238
-
239
- with ix.searcher() as searcher:
240
- results = searcher.search(myquery, limit=k)
241
-
242
- # Extract and return the file names and descriptions of the top-k hits
243
- results_list = [(hit['file_name'], hit['content'], hit.score) for hit in results]
244
-
245
- return results_list
246
-
247
-
248
- def return_image(file, results_list, tmp_dir):
249
- # base_name = os.path.basename(file).split('.')[0]
250
- # path_name = f'images{base_name}'
251
- titles = []
252
- images = []
253
- for result in results_list:
254
- title = result[1].split('\n')[0].split(':')[-1]
255
- titles.append(title)
256
- images.append(Image.open(tmp_dir + '/' + result[0] + '.png'))
257
- return titles[0], images[0]
258
-
259
-
260
- # file = 'CA-IS372x-datasheet_cn.pdf'
261
- # file = 'CA-IS3086 datasheet_cn.pdf'
262
- # temp_image_dir = load_pdf(file, lang='CN')
263
- # ix, temp_index_dir = build_index(file, temp_image_dir)
264
- # results_list = search(ix, "波形", lang='CN', k=10)
265
- # ret_img = return_image(file, results_list, temp_image_dir)
266
- # print('title: ' + ret_img[0])
267
- # ret_img[1].show()
268
-
269
- # print(os.listdir('using_pdfs'))
270
-
271
- # import tqdm
272
- # for file in tqdm.tqdm(os.listdir('using_pdfs')):
273
- # tmd_dir = load_pdf(file)
274
- # ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
275
- # #
276
- # writer.commit()
277
-
278
- # from whoosh.index import open_dir
279
- # search_ix = open_dir('indexes')
280
- # query = "IF-428x接收端阈值"
281
- # results = search(search_ix, query, lang='CN', k=10)
282
- # for result in results:
283
- # print(result)
284
- #
285
- # from PIL import Image
286
- #
287
- # for result in results:
288
- # image_name = result[0]
289
- # base_name = image_name.split('_img')[0]
290
- # img = Image.open('images/' + base_name + '/' + image_name + '.png')
291
- # image_title = result[1].split('\n')[0].split(':')[1]
292
- # img.show(title=image_title)
 
1
+ import fitz
2
+ from PIL import Image
3
+ from utils import *
4
+ from whoosh.analysis import Tokenizer, Token
5
+ import jieba
6
+ from whoosh.index import create_in
7
+ from whoosh.fields import *
8
+ from whoosh.qparser import QueryParser
9
+ import os
10
+ import shutil
11
+ # import tempfile
12
+
13
+ LOGO_WIDTH = 398
14
+ LOGO_HEIGHT = 137
15
+
16
+ ix = None
17
+ writer = None
18
+
19
+
20
+ class ChineseTokenizer(Tokenizer):
21
+ def __call__(self, value, positions=False, chars=False,
22
+ keeporiginal=False, removestops=True,
23
+ start_pos=0, start_char=0, mode='', **kwargs):
24
+ t = Token(positions, chars, removestops=removestops, mode=mode,
25
+ **kwargs)
26
+ seglist = jieba.cut(value, cut_all=True)
27
+ for w in seglist:
28
+ t.original = t.text = w
29
+ t.boost = 1.0
30
+ if positions:
31
+ t.pos = start_pos + value.find(w)
32
+ if chars:
33
+ t.startchar = start_char + value.find(w)
34
+ if chars and positions:
35
+ t.endchar = start_char + value.find(w) + len(w)
36
+ yield t
37
+
38
+
39
def ChineseAnalyzer():
    """Factory returning a jieba-backed tokenizer for whoosh TEXT fields."""
    return ChineseTokenizer()
41
+
42
+
43
def load_pdf(file, dpi=300, skip_page_front=0, skip_page_back=1, skip_block=5, lang='CN'):
    """
    Load a pdf, render each kept page to an image, crop out the embedded
    pictures (raster blocks and svg-drawn clips) and write a ``.txt``
    description side-car next to every cropped ``.png``.

    :param file: pdf name; opened directly when the path contains a gradio
        upload directory, otherwise resolved under ``using_pdfs/``
    :param dpi: render resolution (pdf native resolution is 72 dpi)
    :param skip_page_front: number of leading pages to skip
    :param skip_page_back: number of trailing pages to skip
    :param skip_block: number of leading text blocks per page to ignore
    :param lang: language hint forwarded to the utils text-extraction helpers
    :return: directory that now holds the cropped images and descriptions
    """

    # gradio uploads arrive under a temp ".../gradio/..." path; local files
    # are resolved relative to the using_pdfs/ folder
    if file.__contains__('\\gradio\\') or file.__contains__('/gradio/'):
        print('gradio file')
        doc = fitz.open(file)
    else:
        print('local file')
        doc = fitz.open('using_pdfs/' + file)

    # load pages
    pages = []
    for i in range(doc.page_count):
        page = doc.load_page(i)
        pages.append(page)

    # increase dpi to 300
    dpi = int(dpi)
    scale = dpi / 72  # default dpi of pdf is 72
    matrix = fitz.Matrix(scale, scale)
    skip_block = int(skip_block)

    # one output folder per pdf, recreated from scratch on every run
    base_name = os.path.basename(file).split('.')[0]
    path_name = f'images/{base_name}'
    if os.path.exists(path_name):
        shutil.rmtree(path_name)
    os.mkdir(path_name)

    temp_image_dir = path_name
    # temp_image_dir = tempfile.mkdtemp(prefix='images_')

    for page in pages[int(skip_page_front):-int(skip_page_back)]:  # skip front/back pages

        # part1: get image with description in png-pdf (raster image blocks)
        p1dict = page.get_text('dict')
        blocks = p1dict['blocks']
        page_pix = page.get_pixmap(matrix=matrix, dpi=dpi)
        page_im = Image.frombytes("RGB", (page_pix.width, page_pix.height), page_pix.samples)

        saved = []  # raster crops kept so far; removed later if inside an svg clip
        for i, block in enumerate(blocks[int(skip_block):]):  # head blocks of pages are ignored
            if 'image' in block:
                bbox = block['bbox']
                # skip image of width=398 and height=137 -> typically the LOGO
                # NOTE(review): "<= 10" also skips any image SMALLER than the
                # logo in both dimensions — confirm that is intended
                if (bbox[2] - bbox[0])*scale - LOGO_WIDTH <= 10 and (bbox[3] - bbox[1])*scale - LOGO_HEIGHT <= 10:
                    continue
                # scale the pdf-space bbox to rendered-pixel coordinates
                cropped = page_im.crop([int(i * scale) for i in bbox])
                number = block['number']

                file_name = temp_image_dir + f'/{base_name}_imgbmp_{page.number}_{number}'
                image_name = file_name + '.png'
                cropped.save(image_name)
                # text extraction around the image (helpers come from utils)
                text_content = get_text_around_image(blocks[skip_block:], i, lang)
                title = get_title_of_image(blocks[skip_block:], i, lang)
                # side-car description: title, flattened body text, source pdf
                with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                    text_file.write(title + '\n' + text_content.replace('\n', ' ')+ f'\nbase name:{base_name}')

                saved.append((file_name, [int(i * scale) for i in bbox]))

        # part2: get image with description in svg-pdf (vector-drawn figures)
        svg = page.get_svg_image(matrix=fitz.Identity)
        # parse_page_svg comes from utils; presumably returns clip specs and
        # text blocks extracted from the svg — TODO confirm its contract
        image_clips, svg_blocks = parse_page_svg(svg, page.number)
        for clip in image_clips:

            # clip[0] holds the 6 svg matrix() entries as strings; normalise
            # shorthand forms like ".5" / "-.5" before float()
            transform = []
            for item in clip[0]:
                if item[0] == '.':
                    transform.append(float('0' + item))
                elif item[0] == '-':
                    transform.append(float('-0' + item[1:]))
                else:
                    transform.append(float(item))
            d = clip[1]
            page_id = clip[2]
            block_id = clip[3]
            # clip rectangle size is encoded in the path as H<width>V<height>
            matches = re.findall(r'H(\d+\.?\d*)V(\d+\.?\d*)', d)
            float_values = [float(value) for value in matches[0]]
            box_width = float_values[0]
            box_height = float_values[1]
            width_scale = transform[0]
            height_scale = transform[3]
            width_move = transform[4]
            height_move = transform[5]
            # map svg clip to rendered-pixel coordinates
            x1 = width_move * scale
            y1 = height_move * scale
            # x1=347*scale
            # y1=587*scale
            x2 = x1 + box_width * width_scale * scale
            y2 = y1 + box_height * height_scale * scale

            # a negative height_scale flips the box; normalise the y order
            if y1 > y2:
                y1, y2 = y2, y1

            # 3. crop and save the image

            # drop any raster crop that is (nearly) the same box as, or is
            # contained inside, this svg clip — avoids duplicates
            for i, (file_name, bbox) in enumerate(saved):
                if (abs(bbox[0] - x1) < 10\
                    and abs(bbox[1] - y1) < 10\
                    and abs(bbox[2] - x2) < 10\
                    and abs(bbox[3] - y2) < 10) or \
                        (bbox[0]>x1-10 and bbox[1]>y1-10 and bbox[2]<x2+10 and bbox[3]<y2+10):
                    os.remove(file_name + '.png')
                    os.remove(file_name + '.txt')
                    saved.pop(i)
                    break

            cropped_img = page_im.crop((int(x1), int(y1), int(x2), int(y2)))
            file_name = temp_image_dir + f'/{base_name}_imgsvg_{page.number}_{block_id}'
            image_name = file_name + '.png'
            cropped_img.save(image_name)

            # search title and text around the svg figure
            text_content = get_svg_text_around_image(svg_blocks, block_id, lang)
            title = get_svg_title_around_image(svg_blocks, block_id, lang)
            with open(f'{file_name}.txt', 'w', encoding='utf-8') as text_file:
                text_file.write(title + '\n' + text_content.replace('\n', ' ') + f'\nbase name:{base_name}')

    print(temp_image_dir)
    return temp_image_dir
182
+
183
+
184
def build_index(file, tmp_dir, lang='CN'):
    """
    Add every ``.txt`` description in *tmp_dir* to the shared whoosh index.

    Uses the module-level ``ix``/``writer`` globals so multiple pdfs can be
    indexed into the same ``indexes/`` directory across calls.

    :param file: pdf path/name (kept for interface compatibility; the index
        location is fixed to ``indexes/``)
    :param tmp_dir: directory holding the ``.txt`` side-cars from load_pdf
    :param lang: 'CN' uses the jieba analyzer, anything else whoosh's default
    :return: (ix, index_dir) — the shared index and its directory
    """
    # Define the schema for the index
    if lang == 'CN':
        schema = Schema(file_name=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer(), stored=True))
    else:
        schema = Schema(file_name=ID(stored=True), content=TEXT(stored=True))

    index_path = 'indexes/'
    # create_in requires an existing directory
    os.makedirs(index_path, exist_ok=True)
    temp_index_dir = index_path

    global ix, writer
    if ix is None:
        ix = create_in(temp_index_dir, schema)
    if writer is None:
        writer = ix.writer()

    # Add one document per description file (renamed from `file` to avoid
    # shadowing the parameter)
    for txt_name in os.listdir(tmp_dir):
        if txt_name.endswith('.txt'):
            file_path = os.path.join(tmp_dir, txt_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            writer.add_document(file_name=txt_name[:-4], content=content)
            print('==========')
            print(content)
            print("==========")

    writer.commit()
    # a committed whoosh writer cannot be reused — drop it so the next call
    # opens a fresh one instead of failing on a closed writer
    writer = None
    return ix, temp_index_dir
225
+
226
+
227
def search(ix, query, lang='CN', k=10):
    """
    Query the index with an OR-combination of the query's tokens.

    :param ix: a whoosh index object
    :param query: query string; jieba-segmented when lang == 'CN'
    :param lang: 'CN' for jieba full-mode segmentation, else whitespace split
    :param k: maximum number of hits to return
    :return: list of (file_name, content, score) tuples, best first
    """
    # tokenize the query and join the tokens with the OR operator
    if lang == 'CN':
        tokens = jieba.cut(query, cut_all=True)
    else:
        tokens = query.split()
    or_query = " OR ".join(tokens)

    parsed = QueryParser("content", ix.schema).parse(or_query)

    with ix.searcher() as searcher:
        hits = searcher.search(parsed, limit=k)
        # materialise inside the context: hit fields are read while the
        # searcher is still open
        return [(hit['file_name'], hit['content'], hit.score) for hit in hits]
246
+
247
+
248
def return_image(file, results_list, tmp_dir):
    """
    Return (title, PIL.Image) for the best (first) search hit.

    :param file: unused; kept for backward compatibility with existing callers
    :param results_list: hits from search() — (file_name, content, score)
    :param tmp_dir: directory holding the cropped ``.png`` files
    :return: tuple of the first hit's title and its opened image
    :raises IndexError: if results_list is empty

    Previously every hit's image was opened even though only the first was
    returned — wasted I/O and a spurious failure if a later file was missing.
    """
    best = results_list[0]
    # first line of the stored content is "<label>:<title>"
    title = best[1].split('\n')[0].split(':')[-1]
    image = Image.open(tmp_dir + '/' + best[0] + '.png')
    return title, image
258
+
259
+
260
+ # file = 'CA-IS372x-datasheet_cn.pdf'
261
+ # file = 'CA-IS3086 datasheet_cn.pdf'
262
+ # temp_image_dir = load_pdf(file, lang='CN')
263
+ # ix, temp_index_dir = build_index(file, temp_image_dir)
264
+ # results_list = search(ix, "波形", lang='CN', k=10)
265
+ # ret_img = return_image(file, results_list, temp_image_dir)
266
+ # print('title: ' + ret_img[0])
267
+ # ret_img[1].show()
268
+
269
+ # print(os.listdir('using_pdfs'))
270
+
271
+ # import tqdm
272
+ # for file in tqdm.tqdm(os.listdir('using_pdfs')):
273
+ # tmd_dir = load_pdf(file)
274
+ # ix, tmp_index_dir = build_index('using_pdfs/' + file, tmd_dir)
275
+ # #
276
+ # writer.commit()
277
+
278
+ # from whoosh.index import open_dir
279
+ # search_ix = open_dir('indexes')
280
+ # query = "IF-428x接收端阈值"
281
+ # results = search(search_ix, query, lang='CN', k=10)
282
+ # for result in results:
283
+ # print(result)
284
+ #
285
+ # from PIL import Image
286
+ #
287
+ # for result in results:
288
+ # image_name = result[0]
289
+ # base_name = image_name.split('_img')[0]
290
+ # img = Image.open('images/' + base_name + '/' + image_name + '.png')
291
+ # image_title = result[1].split('\n')[0].split(':')[1]
292
+ # img.show(title=image_title)