Spaces:
Sleeping
Sleeping
import xml.etree.ElementTree as ET | |
def get_adjacent_lines(blocks, block_index, threshold=10):
    """
    Collect the text lines that precede and follow the block at block_index.

    Spans are gathered from every block before, and every block after, the
    target block and merged into lines: a span joins the current line when
    the y coordinate of its origin is within ``threshold`` of the y
    coordinate of the span that started that line.

    Args:
        blocks: Sequence of block dicts. Blocks without a 'lines' key are
            skipped. Each line must provide 'spans'; each span must provide
            'text' and 'origin' (``origin[1]`` is used as the vertical
            position). Presumably PyMuPDF ``get_text("dict")`` blocks --
            confirm against the caller.
        block_index: Index of the block whose neighbours are wanted; the
            block itself is excluded.
        threshold: Maximum vertical distance (exclusive) for two spans to be
            considered part of the same line.

    Returns:
        A ``(lines_before, lines_after)`` tuple of lists of stripped strings.
        Both lists follow the order of ``blocks``, so the lines nearest to
        the target block are ``lines_before[-1]`` and ``lines_after[0]``.
    """
    def is_same_line(origin1, origin2):
        return abs(origin1[1] - origin2[1]) < threshold

    def extract_spans_from_blocks(target_blocks):
        spans = []
        for block in target_blocks:
            if 'lines' in block:
                for line in block['lines']:
                    spans.extend(line['spans'])
        return spans

    def merge_spans_to_lines(spans):
        if not spans:
            return []
        lines = []
        parts = [spans[0]['text']]
        # Every span is compared against the span that opened the line.
        line_origin = spans[0]['origin']
        for span in spans[1:]:
            if is_same_line(span['origin'], line_origin):
                parts.append(span['text'])
            else:
                lines.append(" ".join(parts).strip())
                parts = [span['text']]
                line_origin = span['origin']
        lines.append(" ".join(parts).strip())
        return lines

    lines_before = merge_spans_to_lines(
        extract_spans_from_blocks(blocks[:block_index]))
    lines_after = merge_spans_to_lines(
        extract_spans_from_blocks(blocks[block_index + 1:]))
    return lines_before, lines_after
def get_text_around_image(blocks, image_index, lang='CN', word_count=50):
    """
    Return the text surrounding the block at image_index as one string.

    Lines are taken outward from the image, first backwards through the
    preceding lines and then forwards through the following ones. Each
    direction has its own budget of ``word_count`` units: characters when
    ``lang`` is 'CN', space-separated tokens otherwise. The line that uses
    up the budget is still included. The selected lines are emitted in
    reading order, newline-separated, with outer whitespace stripped.
    """
    def take_within_budget(candidates):
        remaining = word_count
        chosen = []
        for candidate in candidates:
            chosen.append(candidate + '\n')
            if lang == 'CN':
                remaining -= len(candidate)
            else:
                remaining -= len(candidate.split(' '))
            if remaining <= 0:
                break
        return chosen

    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    # Walk backwards from the image, then restore reading order.
    leading = take_within_budget(reversed(before_lines))
    leading.reverse()
    trailing = take_within_budget(after_lines)

    return "".join(leading + trailing).strip()
def get_title_of_image(blocks, image_index, lang='CN'):
    """
    Guess the caption of the image block at image_index.

    Search order:
      1. Lines after the image, nearest first: a line containing '图 '
         (only when ``lang`` is 'CN') or 'figure' (case-insensitive).
      2. Lines before the image, nearest first: a line containing '图'
         (only when ``lang`` is 'CN') or 'figure' (case-insensitive).
      3. Fallback: the last line before the image, if it is non-empty.

    Previously step 3 ran unconditionally and overwrote any caption found in
    step 2; it is now used only when steps 1 and 2 find nothing.

    Returns:
        ``"title: <line>"`` for a match from step 1 or 2; the raw preceding
        line (no ``"title: "`` prefix, as before) for the fallback; otherwise
        ``"title: Not Found"``.
    """
    before_lines, after_lines = get_adjacent_lines(blocks, image_index)

    # A caption below the image takes priority over one above it.
    for line in after_lines:
        if (lang == 'CN' and '图 ' in line) or 'figure' in line.lower():
            return f"title: {line}"

    for line in reversed(before_lines):
        if (lang == 'CN' and '图' in line) or 'figure' in line.lower():
            return f"title: {line}"

    # No caption-like line anywhere: fall back to the nearest preceding line.
    if before_lines and before_lines[-1]:
        return before_lines[-1]
    return "title: Not Found"
def transform_to_array(trans):
    """
    Parse an SVG ``matrix(a,b,c,d,e,f)`` transform string into floats.

    Components may be separated by commas, whitespace, or both. Numbers
    written without a leading zero ('.5', '-.5') are accepted because
    ``float`` parses them directly. Empty components are ignored, so
    ``'matrix()'`` yields an empty list instead of raising IndexError.

    Args:
        trans: Transform string such as ``'matrix(1,0,0,-1,0,792)'``.

    Returns:
        List of floats in the order they appear in the string.

    Raises:
        ValueError: If a component is not a valid number.
        AttributeError: If ``trans`` is not a string (e.g. ``None``).
    """
    inner = trans.replace('matrix(', '').replace(')', '')
    # Treat commas as whitespace so both separator styles split the same way.
    return [float(token) for token in inner.replace(',', ' ').split()]
def parse_page_svg(svg, page_id):
    """
    Extract text lines and image clip regions from one page's SVG markup.

    Walks the <g> children of the first top-level <g> element. A group whose
    first child is a <use> carrying a ``data-text`` attribute is treated as
    text. Any other group is inspected for a ``clip-path`` reference to a
    known <clipPath>; such a clip is recorded as an image unless it matches
    the page-sized pattern described below.

    Args:
        svg: SVG document source, passed to ``ET.fromstring``. The root must
            carry ``width`` and ``height`` attributes (a 'pt' suffix is
            stripped before conversion).
        page_id: Page identifier, copied unchanged into each image record.

    Returns:
        A tuple ``(img_clips, blocks)``:
          * ``img_clips``: list of ``(transform_parts, d, page_id, block_id)``
            where ``transform_parts`` is the clip path's matrix split on
            commas (strings) and ``d`` is its path data.
          * ``blocks``: list of strings -- accumulated text lines and
            ``'image_<block_id>'`` placeholders, in the order appended.

    NOTE(review): text still held in ``cache`` when the loop ends is never
    appended to ``blocks`` -- confirm whether the last line is meant to be
    dropped.
    NOTE(review): the nesting of the statements marked below was inferred
    from context -- confirm against the intended control flow.
    """
    # Parse the SVG content.
    root = ET.fromstring(svg)
    # Page size, truncated to whole units.
    width = int(float(root.get('width').replace('pt', '')))
    height = int(float(root.get('height').replace('pt', '')))
    # Index every <clipPath> in the document by its id.
    clips = {}
    for clip in root.findall('.//{http://www.w3.org/2000/svg}clipPath'):
        clips[clip.get('id')] = clip
    # First <g> element directly under the SVG root.
    main_g = root.find('{http://www.w3.org/2000/svg}g')
    # Path-data fragment "H<width>V<height>"; a clip whose `d` contains it is
    # skipped below (presumably a rectangle covering the whole page).
    page_size = f'H{width}V{height}'
    # NOTE(review): `gs` is assigned but never used in this function.
    gs = main_g.findall('{http://www.w3.org/2000/svg}g')
    block_id = 0
    img_clips = []
    blocks = []
    cache = ""       # text accumulated for the line currently being built
    vertical = None  # matrix component [5] of the current text line
    horizon = None   # matrix component [4] of the last glyph that was kept
    # Iterate over every child <g> of the main <g>.
    for g in main_g.findall('{http://www.w3.org/2000/svg}g'):
        # Check whether the first child is a <use> tag with a "data-text"
        # attribute. `if g` relies on Element truthiness, i.e. whether g has
        # any children.
        first_child = list(g)[0] if g else None
        if first_child is not None and first_child.tag == "{http://www.w3.org/2000/svg}use" and 'data-text' in first_child.attrib:
            # Visit every <use> in this group that carries data-text.
            for u in g.findall('{http://www.w3.org/2000/svg}use'):
                if 'data-text' in u.attrib:
                    # Components [5] and [4] of matrix(a,b,c,d,e,f) are the
                    # f (y) and e (x) translation terms.
                    text_vertical = transform_to_array(u.get('transform'))[5]
                    text_horizon = transform_to_array(u.get('transform'))[4]
                    if vertical is None or abs(text_vertical - vertical) > 10:
                        # Vertical jump of more than 10: start a new line and
                        # flush the previous one if it is non-empty.
                        vertical = text_vertical
                        cache = cache.strip()
                        if cache != "":
                            blocks.append(cache)
                        # NOTE(review): nesting inferred -- these two
                        # statements are assumed to run whether or not the
                        # previous line was flushed.
                        cache = u.get('data-text')
                        block_id += 1
                    else:
                        # Same line: keep the glyph only if its horizontal
                        # position moved by more than 1 since the last kept
                        # glyph (presumably to skip glyphs drawn twice at the
                        # same spot -- confirm).
                        if horizon is None or abs(text_horizon - horizon) > 1:
                            horizon = text_horizon
                            cache += u.get('data-text')
            # Text groups are not inspected for clip paths.
            continue
        clip_path = g.get('clip-path')
        if clip_path and '#clip_' in clip_path:
            # Take the id from the part after '#', dropping the closing ')'.
            clip_id = clip_path.split("#")[1].replace(')', '')
            if clip_id in clips:
                # NOTE(review): `path` is None if the clipPath has no <path>
                # descendant, and `d` may be None; neither case is guarded.
                path = clips[clip_id].find('.//{http://www.w3.org/2000/svg}path')
                transform = path.get('transform')
                if not transform:
                    continue
                transform = transform.replace('matrix(', '').replace(')', '')
                d = path.get('d')
                trans_height = int(float(transform.split(',')[5]))
                # Record the clip as an image unless its path data contains
                # the page-size fragment or its 6th matrix component equals
                # the page height.
                if not (page_size in d or (transform and trans_height == height)):
                    img_clips.append((transform.split(','), d, page_id, block_id))
                    blocks.append(f'image_{block_id}')
                    block_id += 1
        else:
            # NOTE(review): nesting inferred -- this `else` is assumed to
            # pair with the clip-path test above, so nested groups are only
            # searched when the group itself has no '#clip_' reference.
            for sub_g in g.findall('.//{http://www.w3.org/2000/svg}g'):
                sub_clip_path = sub_g.get('clip-path')
                if sub_clip_path and '#clip_' in sub_clip_path:
                    sub_clip_id = sub_clip_path.split("#")[1].replace(')', '')
                    if sub_clip_id in clips:
                        sub_path = clips[sub_clip_id].find('.//{http://www.w3.org/2000/svg}path')
                        sub_d = sub_path.get('d')
                        # NOTE(review): unlike the branch above, a missing
                        # transform is not guarded here and would raise
                        # AttributeError on .replace.
                        sub_transform = sub_path.get('transform')
                        sub_transform = sub_transform.replace('matrix(', '').replace(')', '')
                        subtrans_height = int(float(sub_transform.split(',')[5]))
                        if not (page_size in sub_d or (sub_transform and subtrans_height == height)):
                            img_clips.append((sub_transform.split(','), sub_d, page_id, block_id))
                            blocks.append(f'image_{block_id}')
                            block_id += 1
                            # NOTE(review): nesting inferred -- assumed to
                            # stop after the first image clip found in this
                            # group.
                            break
    return img_clips, blocks
def get_svg_text_around_image(blocks, block_id, lang='CN', word_count=50):
    """
    Return the text surrounding ``blocks[block_id]`` as one string.

    Entries are taken outward from the image position, first backwards
    through ``blocks[:block_id]`` and then forwards through
    ``blocks[block_id + 1:]``. Each direction has its own budget of
    ``word_count`` units: characters when ``lang`` is 'CN', space-separated
    tokens otherwise. The entry that uses up the budget is still included.
    The selected entries are emitted in list order, newline-separated, with
    outer whitespace stripped.
    """
    def take_within_budget(candidates):
        remaining = word_count
        chosen = []
        for candidate in candidates:
            chosen.append(candidate + '\n')
            if lang == 'CN':
                remaining -= len(candidate)
            else:
                remaining -= len(candidate.split(' '))
            if remaining <= 0:
                break
        return chosen

    # Walk backwards from the image, then restore list order.
    leading = take_within_budget(reversed(blocks[:block_id]))
    leading.reverse()
    trailing = take_within_budget(blocks[block_id + 1:])

    return "".join(leading + trailing).strip()
def get_svg_title_around_image(blocks, block_id, lang='CN'):
    """
    Guess the caption for the image placeholder at ``blocks[block_id]``.

    An entry counts as a caption when it contains '图' (only when ``lang``
    is 'CN') or 'figure' in any letter case. The nearest matching entry
    after the image wins; otherwise the nearest matching entry before it is
    used. The result is ``"title: <entry>"``, or ``"title: Not Found"`` when
    nothing matches.
    """
    def looks_like_caption(entry):
        if lang == 'CN' and '图' in entry:
            return True
        return 'figure' in entry.lower()

    # Nearest candidate above the image, kept as a fallback.
    preceding = next(
        (entry for entry in reversed(blocks[:block_id]) if looks_like_caption(entry)),
        None,
    )
    # Nearest candidate below the image takes priority.
    following = next(
        (entry for entry in blocks[block_id + 1:] if looks_like_caption(entry)),
        None,
    )

    if following is not None:
        return f"title: {following}"
    if preceding is not None:
        return f"title: {preceding}"
    return "title: Not Found"