post-processing-summarization / custom_renderer.py
MatthiasC's picture
Add dependency comp general functionality, fix issues and add more examples
357d42c
raw
history blame
10.4 kB
from typing import Dict, Any
import numpy as np
import spacy
from PIL import ImageFont
from spacy.tokens import Doc
def get_pil_text_size(text, font_size, font_name):
font = ImageFont.truetype(font_name, font_size)
size = font.getsize(text)
return size
def render_arrow(
label: str, start: int, end: int, direction: str, i: int
) -> str:
"""Render individual arrow.
label (str): Dependency label.
start (int): Index of start word.
end (int): Index of end word.
direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index.
RETURNS (str): Rendered SVG markup.
"""
TPL_DEP_ARCS = """
<g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="red"/>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" side="{label_side}" fill="red" text-anchor="middle">{label}</textPath>
</text>
<path class="displacy-arrowhead" d="{head}" fill="red"/>
</g>
"""
arc = get_arc(start + 10, 50, 5, end + 10)
arrowhead = get_arrowhead(direction, start + 10, 50, end + 10)
label_side = "right" if direction == "rtl" else "left"
return TPL_DEP_ARCS.format(
id=0,
i=0,
stroke=2,
head=arrowhead,
label=label,
label_side=label_side,
arc=arc,
)
def get_arc(x_start: int, y: int, y_curve: int, x_end: int) -> str:
"""Render individual arc.
x_start (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
x_end (int): X-coordinate of arrow end point.
RETURNS (str): Definition of the arc path ('d' attribute).
"""
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
return template.format(x=x_start, y=y, c=y_curve, e=x_end)
def get_arrowhead(direction: str, x: int, y: int, end: int) -> str:
"""Render individual arrow head.
direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point.
RETURNS (str): Definition of the arrow head path ('d' attribute).
"""
arrow_width = 6
if direction == "left":
p1, p2, p3 = (x, x - arrow_width + 2, x + arrow_width - 2)
else:
p1, p2, p3 = (end, end + arrow_width - 2, end - arrow_width + 2)
return f"M{p1},{y + 2} L{p2},{y - arrow_width} {p3},{y - arrow_width}"
# parsed = [{'words': [{'text': 'The', 'tag': 'DET', 'lemma': None}, {'text': 'OnePlus', 'tag': 'PROPN', 'lemma': None}, {'text': '10', 'tag': 'NUM', 'lemma': None}, {'text': 'Pro', 'tag': 'PROPN', 'lemma': None}, {'text': 'is', 'tag': 'AUX', 'lemma': None}, {'text': 'the', 'tag': 'DET', 'lemma': None}, {'text': 'company', 'tag': 'NOUN', 'lemma': None}, {'text': "'s", 'tag': 'PART', 'lemma': None}, {'text': 'first', 'tag': 'ADJ', 'lemma': None}, {'text': 'flagship', 'tag': 'NOUN', 'lemma': None}, {'text': 'phone.', 'tag': 'NOUN', 'lemma': None}], 'arcs': [{'start': 0, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'nmod', 'dir': 'left'}, {'start': 1, 'end': 2, 'label': 'nummod', 'dir': 'right'}, {'start': 3, 'end': 4, 'label': 'nsubj', 'dir': 'left'}, {'start': 5, 'end': 6, 'label': 'det', 'dir': 'left'}, {'start': 6, 'end': 10, 'label': 'poss', 'dir': 'left'}, {'start': 6, 'end': 7, 'label': 'case', 'dir': 'right'}, {'start': 8, 'end': 10, 'label': 'amod', 'dir': 'left'}, {'start': 9, 'end': 10, 'label': 'compound', 'dir': 'left'}, {'start': 4, 'end': 10, 'label': 'attr', 'dir': 'right'}], 'settings': {'lang': 'en', 'direction': 'ltr'}}]
def render_sentence_custom(unmatched_list: Dict):
TPL_DEP_WORDS = """
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
</text>
"""
TPL_DEP_SVG = """
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
"""
arcs_svg = []
nlp = spacy.load('en_core_web_lg')
doc = nlp(unmatched_list["sentence"])
# words = {}
# unmatched_list = [parse_deps(doc)]
# #print(parsed)
# for i, p in enumerate(unmatched_list):
# arcs = p["arcs"]
# words = p["words"]
# for i, a in enumerate(arcs):
# #CHECK CERTAIN DEPS (ALSO ADD/CHANGE BELOW WHEN CHANGING HERE)
# if a["label"] == "amod":
# couples = (a["start"], a["end"])
# elif a["label"] == "pobj":
# couples = (a["start"], a["end"])
# #couples = (3,5)
#
# x_value_counter = 10
# index_counter = 0
# svg_words = []
# coords_test = []
# for i, word in enumerate(words):
# word = word["text"]
# word = word + " "
# pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
# svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
# if index_counter >= couples[0] and index_counter <= couples[1]:
# coords_test.append(x_value_counter)
# x_value_counter += 50
# index_counter += 1
# x_value_counter += pixel_x_length + 4
# for i, a in enumerate(arcs):
# if a["label"] == "amod":
# arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
# elif a["label"] == "pobj":
# arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
#
# content = "".join(svg_words) + "".join(arcs_svg)
#
# full_svg = TPL_DEP_SVG.format(
# id=0,
# width=1200, #600
# height=250, #125
# color="#00000",
# bg="#ffffff",
# font="Arial",
# content=content,
# dir="ltr",
# lang="en",
# )
x_value_counter = 10
index_counter = 0
svg_words = []
words = unmatched_list["sentence"].split(" ")
coords_test = []
#print(unmatched_list)
#print(words)
#print("NOW")
direction_current = "rtl"
if unmatched_list["cur_word_index"] < unmatched_list["target_word_index"]:
min_index = unmatched_list["cur_word_index"]
max_index = unmatched_list["target_word_index"]
direction_current = "left"
else:
max_index = unmatched_list["cur_word_index"]
min_index = unmatched_list["target_word_index"]
for i, token in enumerate(doc):
word = str(token)
word = word + " "
pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
if min_index <= index_counter <= max_index:
coords_test.append(x_value_counter)
if index_counter < max_index - 1:
x_value_counter += 50
index_counter += 1
x_value_counter += pixel_x_length + 4
# TODO: DYNAMIC DIRECTION MAKING (SHOULD GIVE WITH DICT I THINK)
#print(coords_test)
arcs_svg.append(render_arrow(unmatched_list['dep'], coords_test[0], coords_test[-1], direction_current, i))
content = "".join(svg_words) + "".join(arcs_svg)
full_svg = TPL_DEP_SVG.format(
id=0,
width=1200, # 600
height=75, # 125
color="#00000",
bg="#ffffff",
font="Arial",
content=content,
dir="ltr",
lang="en",
)
return full_svg
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
if not doc.has_annotation("DEP"):
print("WARNING")
if options.get("collapse_phrases", False):
with doc.retokenize() as retokenizer:
for np in list(doc.noun_chunks):
attrs = {
"tag": np.root.tag_,
"lemma": np.root.lemma_,
"ent_type": np.root.ent_type_,
}
retokenizer.merge(np, attrs=attrs)
if options.get("collapse_punct", True):
spans = []
for word in doc[:-1]:
if word.is_punct or not word.nbor(1).is_punct:
continue
start = word.i
end = word.i + 1
while end < len(doc) and doc[end].is_punct:
end += 1
span = doc[start:end]
spans.append((span, word.tag_, word.lemma_, word.ent_type_))
with doc.retokenize() as retokenizer:
for span, tag, lemma, ent_type in spans:
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
retokenizer.merge(span, attrs=attrs)
fine_grained = options.get("fine_grained")
add_lemma = options.get("add_lemma")
words = [
{
"text": w.text,
"tag": w.tag_ if fine_grained else w.pos_,
"lemma": w.lemma_ if add_lemma else None,
}
for w in doc
]
arcs = []
for word in doc:
if word.i < word.head.i:
arcs.append(
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
)
elif word.i > word.head.i:
arcs.append(
{
"start": word.head.i,
"end": word.i,
"label": word.dep_,
"dir": "right",
}
)
return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)}
def get_doc_settings(doc: Doc) -> Dict[str, Any]:
return {
"lang": doc.lang_,
"direction": doc.vocab.writing_system.get("direction", "ltr"),
}