Spaces:
Runtime error
Runtime error
| <!-- HTML PRE CHUNK: | |
| This performs a best-effort preliminary "chunking" of text in an HTML file, | |
| matching each chunk with a "headers" metadata value based on header tags in proximity. | |
| recursively visits every element (the default, unnamed-mode template). | |
| for every element with tagname of interest (only): | |
| 1. serializes a div (and metadata marking the element's xpath). | |
| 2. calculates all text-content for the given element, including descendant elements which are *not* themselves tags of interest. | |
| 3. if any such text-content was found, serializes a "headers" (span.headers) along with this text (p.chunk). | |
| to calculate the "headers" of an element: | |
| 1. recursively gets the *nearest* prior-siblings for headings of *each* level | |
| 2. recursively repeats that step#1 for each ancestor (regardless of tag) | |
| n.b. this recursion is only performed (beginning with) elements which are | |
| both (1) tags-of-interest and (2) have their own text-content. | |
| --> | |
| <xsl:stylesheet version="1.0" | |
| xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
| xmlns="http://www.w3.org/1999/xhtml"> | |
| <!-- pipe-separated local names of the elements that are serialized as chunks; may be overridden by the caller --> | |
| <xsl:param name="tags" select="'div|p|blockquote|ol|ul'" /> | |
| <!-- entry point: wraps the filtered tree in an html/head/body shell with a small inline stylesheet --> | |
| <xsl:template match="/"> | |
| <html> | |
| <head> | |
| <style> | |
| div { | |
| border: solid; | |
| margin-top: .5em; | |
| padding-left: .5em; | |
| } | |
| h1, h2, h3, h4, h5, h6 { | |
| margin: 0; | |
| } | |
| .xpath { | |
| color: blue; | |
| } | |
| .chunk { | |
| margin: .5em 1em; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- start the default-mode walk at the document element; only tags of interest end up in the output --> | |
| <xsl:apply-templates select="child::*" /> | |
| </body> | |
| </html> | |
| </xsl:template> | |
| <!-- default mode: a tag of interest becomes a div (xpath label, optional headers and chunk, nested divs); any other element is transparent --> | |
| <xsl:template match="*"> | |
| <!-- true when this element's local name is one of the pipe-separated names in $tags --> | |
| <xsl:variable name="wanted" select="contains(concat('|', $tags, '|'), concat('|', local-name(), '|'))" /> | |
| <!-- elements outside $tags produce no wrapper; only their child elements are visited --> | |
| <xsl:if test="not($wanted)"> | |
| <xsl:apply-templates select="child::*" /> | |
| </xsl:if> | |
| <xsl:if test="$wanted"> | |
| <!-- path string produced by the xpath-mode template for this element --> | |
| <xsl:variable name="location"> | |
| <xsl:apply-templates select="self::*" mode="xpath" /> | |
| </xsl:variable> | |
| <!-- text gathered from the child nodes in text mode, then whitespace-normalized --> | |
| <xsl:variable name="rawText"> | |
| <xsl:apply-templates select="child::node()" mode="text" /> | |
| </xsl:variable> | |
| <xsl:variable name="chunkText" select="normalize-space($rawText)" /> | |
| <div title="{$location}"> | |
| <small class="xpath"> | |
| <xsl:value-of select="$location" /> | |
| </small> | |
| <!-- headers and chunk are written only when the element has text of its own --> | |
| <xsl:if test="string-length($chunkText) &gt; 0"> | |
| <xsl:variable name="headingTrail"> | |
| <xsl:apply-templates select="self::*" mode="headingsWithAncestors" /> | |
| </xsl:variable> | |
| <xsl:if test="string-length(normalize-space($headingTrail)) &gt; 0"> | |
| <span class="headers"> | |
| <xsl:copy-of select="$headingTrail" /> | |
| </span> | |
| </xsl:if> | |
| <p class="chunk"> | |
| <xsl:value-of select="$chunkText" /> | |
| </p> | |
| </xsl:if> | |
| <!-- descendants that are tags of interest become nested divs --> | |
| <xsl:apply-templates select="child::*" /> | |
| </div> | |
| </xsl:if> | |
| </xsl:template> | |
| <!-- text mode: | |
| prints text nodes; | |
| for elements, recurses down child nodes (text and elements) *except* certain exceptions: | |
| tags of interest (handled by the default-mode template), | |
| non-content text (e.g. script|style) | |
| --> | |
| <!-- ignore non-content text --> | |
| <!-- text mode: script and style elements contribute nothing to a chunk --> | |
| <xsl:template match="style|script" mode="text" /> | |
| <!-- for all other elements *except tags of interest*, recurse on child-nodes (text and elements) --> | |
| <!-- text mode, any other element: a tag of interest yields nothing here; every other element passes its child nodes through --> | |
| <xsl:template match="*" mode="text"> | |
| <!-- true when this element's local name is one of the pipe-separated names in $tags --> | |
| <xsl:variable name="wanted" select="contains(concat('|', $tags, '|'), concat('|', local-name(), '|'))" /> | |
| <xsl:if test="not($wanted)"> | |
| <xsl:apply-templates select="child::node()" mode="text" /> | |
| </xsl:if> | |
| </xsl:template> | |
| <!-- xpath mode: | |
| return an xpath which matches this element uniquely | |
| --> | |
| <!-- xpath mode: | |
| writes the location of the matched element as a sequence of name[index]/ steps, outermost ancestor first. | |
| The index counts only the preceding siblings that share this element's name, because in XPath | |
| name[n] selects the n-th child with that name; counting every preceding sibling element would make | |
| a step such as div[3] point at a different node, or at none. | |
| The shape of the string is otherwise unchanged: no leading '/', and a '/' after every step. | |
| --> | |
| <xsl:template mode="xpath" match="*"> | |
| <!-- ancestors first, so the steps are written root-to-leaf --> | |
| <xsl:apply-templates mode="xpath" select="parent::*" /> | |
| <xsl:value-of select="name()" /> | |
| <xsl:text>[</xsl:text> | |
| <!-- current() is the element this template was invoked for; name() inside the predicate is the sibling being tested --> | |
| <xsl:value-of select="1 + count(preceding-sibling::*[name() = name(current())])" /> | |
| <xsl:text>]/</xsl:text> | |
| </xsl:template> | |
| <!-- headingsWithAncestors mode: | |
| recurses up parents (ALL ancestors) | |
| --> | |
| <!-- headingsWithAncestors mode: output for the ancestors comes first, followed by the headings found at this element's own level --> | |
| <xsl:template match="*" mode="headingsWithAncestors"> | |
| <!-- climb to the parent element first, so outer headings precede inner ones --> | |
| <xsl:apply-templates select="parent::*" mode="headingsWithAncestors" /> | |
| <!-- then scan from this element back through its preceding siblings, accepting heading levels up to 6 --> | |
| <xsl:apply-templates select="self::*" mode="headingsWithPriorSiblings"> | |
| <xsl:with-param name="maxHead" select="6" /> | |
| </xsl:apply-templates> | |
| </xsl:template> | |
| <!-- headingsWithPriorSiblings mode: | |
| recurses up preceding-siblings | |
| --> | |
| <!-- headingsWithPriorSiblings mode: | |
| examines the current element and then works backwards through its preceding siblings, | |
| emitting the nearest heading of each level that is still accepted. | |
| maxHead: the largest heading number still accepted (the caller in this file starts at 6); | |
| once a heading of level N is accepted, only levels below N are accepted before it. | |
| Each recursive call goes straight to the nearest preceding sibling that can produce output | |
| (an accepted heading, or a 'header' element) instead of stepping one sibling at a time. | |
| The output is the same, but the template nesting depth no longer grows with the number of siblings, | |
| so a very long sibling list cannot run into a processor's recursion-depth limit. | |
| --> | |
| <xsl:template mode="headingsWithPriorSiblings" match="*"> | |
| <xsl:param name="maxHead" /> | |
| <!-- numeric part of the element name; NaN when the name is not 'h' followed by a number, which makes the comparison below false --> | |
| <xsl:variable name="headLevel" select="number(substring(local-name(), 2))" /> | |
| <xsl:choose> | |
| <xsl:when test="'h' = substring(local-name(), 1, 1) and $maxHead >= $headLevel"> | |
| <!-- earlier siblings are handled first so their headings are written before this one; max level is one less than the current level --> | |
| <xsl:variable name="nextMax" select="$headLevel - 1" /> | |
| <xsl:apply-templates mode="headingsWithPriorSiblings" select="preceding-sibling::*[self::header or ('h' = substring(local-name(), 1, 1) and $nextMax >= number(substring(local-name(), 2)))][1]"> | |
| <xsl:with-param name="maxHead" select="$nextMax" /> | |
| </xsl:apply-templates> | |
| <xsl:apply-templates mode="heading" select="." /> | |
| </xsl:when> | |
| <!-- special case for the 'header' tag: serialize its child headings --> | |
| <xsl:when test="self::header"> | |
| <xsl:apply-templates mode="heading" select="h1|h2|h3|h4|h5|h6" /> | |
| <!-- | |
| prior siblings of a 'header' are deliberately not visited, | |
| but n.b. the 'headingsWithAncestors' template still continues up the ancestors. | |
| --> | |
| </xsl:when> | |
| <xsl:otherwise> | |
| <!-- nothing to emit for this element: continue at the nearest earlier sibling that can emit something, with the same limit --> | |
| <xsl:apply-templates mode="headingsWithPriorSiblings" select="preceding-sibling::*[self::header or ('h' = substring(local-name(), 1, 1) and $maxHead >= number(substring(local-name(), 2)))][1]"> | |
| <xsl:with-param name="maxHead" select="$maxHead" /> | |
| </xsl:apply-templates> | |
| </xsl:otherwise> | |
| </xsl:choose> | |
| </xsl:template> | |
| <!-- heading mode: copy the heading element itself (no attributes or children) and fill it with its whitespace-normalized text --> | |
| <xsl:template match="h1|h2|h3|h4|h5|h6" mode="heading"> | |
| <xsl:copy> | |
| <xsl:value-of select="normalize-space(string(.))" /> | |
| </xsl:copy> | |
| </xsl:template> | |
| </xsl:stylesheet> | |