import xml.etree.ElementTree as ET

# Namespace prefix ElementTree puts in front of every SVG tag name.
_SVG_NS = '{http://www.w3.org/2000/svg}'


def _collect_context(before_lines, after_lines, lang, word_count):
    """Join the lines nearest to an image, limited by a per-side budget.

    Lines are taken walking away from the image on each side until the
    budget is used up; the line that exhausts the budget is still included.
    With ``lang == 'CN'`` a line costs ``len(line)`` characters, otherwise
    it costs the number of space-separated tokens.

    Returns the selected lines in document order, newline-joined and
    stripped.
    """
    def take(lines):
        taken = []
        remaining = word_count
        for line in lines:
            taken.append(line)
            remaining -= len(line) if lang == 'CN' else len(line.split(' '))
            if remaining <= 0:
                break
        return taken

    nearest_before = take(reversed(before_lines))
    nearest_before.reverse()  # back to document order
    return '\n'.join(nearest_before + take(after_lines)).strip()


def _is_caption(line, lang, cn_marker):
    """Tell whether *line* looks like a figure caption.

    A line matches when it contains "figure" (case-insensitive) or, for
    ``lang == 'CN'``, when it contains *cn_marker*.
    """
    return (lang == 'CN' and cn_marker in line) or 'figure' in line.lower()


def get_adjacent_lines(blocks, block_index):
    """Return the text lines before and after ``blocks[block_index]``.

    Each block that has a ``'lines'`` key contributes its spans
    (``block['lines'][i]['spans'][j]``, each with ``'text'`` and
    ``'origin'``). Consecutive spans whose ``origin[1]`` is less than 10
    away from the first span of the current line are merged into one line,
    joined by a single space.

    Returns:
        ``(lines_before, lines_after)``. Both lists are in document order,
        so the line closest to the block is ``lines_before[-1]`` and
        ``lines_after[0]``.
    """
    def is_same_line(origin1, origin2):
        # Adjust this threshold if needed.
        threshold = 10
        return abs(origin1[1] - origin2[1]) < threshold

    def extract_spans_from_blocks(target_blocks):
        spans = []
        for block in target_blocks:
            # Blocks without a 'lines' key are skipped.
            for line in block.get('lines', ()):
                spans.extend(line['spans'])
        return spans

    def merge_spans_to_lines(spans):
        if not spans:
            return []
        lines = []
        current_line = spans[0]['text']
        # The origin of the line's first span is the reference for the
        # whole line; it is deliberately not updated while merging.
        current_origin = spans[0]['origin']
        for span in spans[1:]:
            if is_same_line(span['origin'], current_origin):
                current_line += " " + span['text']
            else:
                lines.append(current_line.strip())
                current_line = span['text']
                current_origin = span['origin']
        lines.append(current_line.strip())
        return lines

    spans_before = extract_spans_from_blocks(blocks[:block_index])
    spans_after = extract_spans_from_blocks(blocks[block_index + 1:])
    return merge_spans_to_lines(spans_before), merge_spans_to_lines(spans_after)


def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
    """Return the text surrounding the image block at *image_index*.

    Args:
        blocks: Block dicts as accepted by ``get_adjacent_lines``.
        image_index: Index of the image block inside *blocks*.
        lang: ``'CN'`` budgets by character count, anything else by
            space-separated token count.
        word_count: Budget applied independently before and after the image.

    Returns:
        The selected lines in document order, newline-joined and stripped.
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)
    return _collect_context(before_lines, after_lines, lang, word_count)


def get_title_of_image(blocks, image_index, lang='CN'):
    """Guess a caption for the image block at *image_index*.

    Lookup order:
      1. the first caption-like line after the image (for ``lang == 'CN'``
         the marker is ``'图 '``, with a trailing space);
      2. the caption-like line closest before the image (marker ``'图'``);
      3. the line immediately before the image;
      4. ``"title: Not Found"``.

    A caption found before the image used to be overwritten by the
    fallback in step 3; it is now kept. Every result carries the
    ``"title: "`` prefix.
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    for line in after_lines:
        if _is_caption(line, lang, '图 '):
            return f"title: {line}"

    for line in reversed(before_lines):
        if _is_caption(line, lang, '图'):
            return f"title: {line}"

    if before_lines:
        return f"title: {before_lines[-1]}"
    return "title: Not Found"


def transform_to_array(trans):
    """Parse an SVG ``matrix(...)`` transform string into a list of floats.

    Values may be separated by commas and/or whitespace. ``float`` already
    understands numbers written without a leading zero (``.5``, ``-.5``),
    so no manual padding is required. Empty tokens are ignored.

    Raises:
        AttributeError: if *trans* is ``None``.
        ValueError: if a token is not a number.
    """
    body = trans.replace('matrix(', '').replace(')', '')
    return [float(token) for token in body.replace(',', ' ').split()]


def _flush_text(pending, blocks):
    """Append the stripped *pending* text to *blocks* if non-empty; return ''."""
    pending = pending.strip()
    if pending:
        blocks.append(pending)
    return ''


def _clip_image_region(group, clips, page_size, height):
    """Return ``(transform_parts, d)`` if *group* is clipped to an image area.

    *group* qualifies when its ``clip-path`` references a known ``clip_*``
    clipPath whose ``<path>`` has both ``transform`` and ``d``, and that
    path is not the full page: ``d`` does not contain *page_size* and the
    sixth matrix value (truncated to int) differs from *height*.
    Returns ``None`` otherwise.
    """
    clip_ref = group.get('clip-path')
    if not clip_ref or '#clip_' not in clip_ref:
        return None
    clip = clips.get(clip_ref.split('#')[1].replace(')', ''))
    if clip is None:
        return None
    path = clip.find(f'.//{_SVG_NS}path')
    if path is None:
        return None
    transform = path.get('transform')
    d = path.get('d')
    if not transform or d is None:
        return None
    parts = transform.replace('matrix(', '').replace(')', '').split(',')
    if page_size in d or int(float(parts[5])) == height:
        return None
    return parts, d


def parse_page_svg(svg, page_id):
    """Extract text lines and image clip regions from one page of SVG.

    Only the direct ``<g>`` children of the first top-level ``<g>`` are
    inspected:

    * A group whose first child is a ``<use>`` with a ``data-text``
      attribute is treated as text. Glyphs are grouped into one line while
      the sixth matrix value (vertical offset) stays within 10 units.
    * Any other group is checked for an image clip: the group itself if it
      has a ``clip_*`` clip-path, otherwise its descendant groups, stopping
      at the first hit.

    Args:
        svg: SVG document as a string. The root must carry ``width`` and
            ``height`` attributes, optionally suffixed with ``pt``.
        page_id: Opaque value copied into every image entry.

    Returns:
        ``(img_clips, blocks)``. ``blocks`` lists text lines and
        ``'image_<i>'`` placeholders in document order. Each entry in
        ``img_clips`` is ``(transform_parts, d, page_id, i)``, where
        ``transform_parts`` is the list of matrix values as strings and
        ``blocks[i]`` is that image's placeholder.

    Raises:
        xml.etree.ElementTree.ParseError: if *svg* is not well-formed XML.
        AttributeError: if the root lacks ``width``/``height`` or a text
            glyph lacks ``transform``.
    """
    root = ET.fromstring(svg)

    # Page size.
    width = int(float(root.get('width').replace('pt', '')))
    height = int(float(root.get('height').replace('pt', '')))
    page_size = f'H{width}V{height}'

    # Index every clipPath by id.
    clips = {clip.get('id'): clip
             for clip in root.findall(f'.//{_SVG_NS}clipPath')}

    img_clips = []
    blocks = []

    # First <g> directly under the SVG root.
    main_g = root.find(f'{_SVG_NS}g')
    if main_g is None:
        return img_clips, blocks

    use_tag = f'{_SVG_NS}use'
    group_tag = f'{_SVG_NS}g'
    pending = ""      # text of the line currently being assembled
    vertical = None   # vertical offset of the current line
    horizon = None    # horizontal offset of the last accepted glyph

    for g in main_g.findall(group_tag):
        # len() rather than truth-testing: Element truthiness is deprecated.
        first_child = g[0] if len(g) else None
        if (first_child is not None and first_child.tag == use_tag
                and 'data-text' in first_child.attrib):
            for u in g.findall(use_tag):
                if 'data-text' not in u.attrib:
                    continue
                text = u.get('data-text')
                matrix = transform_to_array(u.get('transform'))
                text_horizon, text_vertical = matrix[4], matrix[5]
                if vertical is None or abs(text_vertical - vertical) > 10:
                    # New line: emit the previous one and start afresh.
                    pending = _flush_text(pending, blocks)
                    vertical = text_vertical
                    horizon = text_horizon
                    pending = text
                elif abs(text_horizon - horizon) > 1:
                    # NOTE(review): a glyph is accepted only when its
                    # horizontal offset moved by more than 1, which skips
                    # glyphs drawn again at the same spot - confirm this
                    # matches the intended behaviour.
                    horizon = text_horizon
                    pending += text
            continue

        own_clip = g.get('clip-path')
        if own_clip and '#clip_' in own_clip:
            candidates = [g]
        else:
            candidates = g.findall(f'.//{group_tag}')

        for candidate in candidates:
            region = _clip_image_region(candidate, clips, page_size, height)
            if region is None:
                continue
            # Emit the text collected so far first, so the image id below
            # is the real index of its placeholder in `blocks`.
            pending = _flush_text(pending, blocks)
            vertical = None  # text after an image always starts a new line
            transform_parts, d = region
            block_id = len(blocks)
            img_clips.append((transform_parts, d, page_id, block_id))
            blocks.append(f'image_{block_id}')
            break

    # Do not lose the last text line of the page.
    _flush_text(pending, blocks)
    return img_clips, blocks


def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
    """Return the text surrounding ``blocks[block_id]``.

    Args:
        blocks: List of strings as returned by ``parse_page_svg``.
        block_id: Index of the image placeholder inside *blocks*.
        lang: ``'CN'`` budgets by character count, anything else by
            space-separated token count.
        word_count: Budget applied independently before and after the image.

    Returns:
        The selected entries in document order, newline-joined and stripped.
    """
    return _collect_context(blocks[:block_id], blocks[block_id + 1:],
                            lang, word_count)


def get_svg_title_around_image(blocks, block_id, lang='CN'):
    """Guess a caption for the image placeholder at ``blocks[block_id]``.

    The first caption-like entry after the image wins; otherwise the
    caption-like entry closest before it; otherwise ``"title: Not Found"``.
    An entry is caption-like when it contains "figure" (case-insensitive)
    or, for ``lang == 'CN'``, the character ``'图'``.
    """
    for line in blocks[block_id + 1:]:
        if _is_caption(line, lang, '图'):
            return f"title: {line}"

    for line in reversed(blocks[:block_id]):
        if _is_caption(line, lang, '图'):
            return f"title: {line}"

    return "title: Not Found"