// Spaces:
// Running
// Running
/**
 * Converts DocTags ("docling") markup into an HTML fragment.
 *
 * The input is a string of `<tag>…</tag>` pairs plus a few unpaired tokens
 * (`<page_break>`, `<checkbox_selected>`, `<checkbox_unselected>`). Known tags
 * are mapped to HTML elements; any other text is HTML-escaped.
 */
export class DoclingConverter {
  constructor() {
    // DocTags element name -> HTML element used for the generic conversion.
    this.simpleTagMap = {
      doctag: "div",
      document: "div",
      ordered_list: "ol",
      unordered_list: "ul",
      list_item: "li",
      caption: "figcaption",
      footnote: "sup",
      formula: "div",
      page_footer: "footer",
      page_header: "header",
      picture: "figure",
      chart: "figure",
      table: "table",
      otsl: "table",
      text: "p",
      paragraph: "p",
      title: "h1",
      document_index: "div",
      form: "form",
      key_value_region: "dl",
      reference: "a",
      smiles: "span",
    };
    // Unpaired tokens -> literal HTML emitted in their place.
    this.selfClosingTagMap = {
      checkbox_selected: '<input type="checkbox" checked disabled>',
      checkbox_unselected: '<input type="checkbox" disabled>',
      page_break: '<hr class="page-break">',
    };
    // Table cell tokens -> HTML cell element (and optional `scope` attribute).
    this.TABLE_TAG_CONFIG = {
      "<ched>": { htmlTag: "th" },
      "<rhed>": { htmlTag: "th", scope: "row" },
      "<srow>": { htmlTag: "th", scope: "row" },
      "<fcel>": { htmlTag: "td" },
      "<ecel>": { htmlTag: "td" },
      "<ucel>": { htmlTag: "td" },
      "<lcel>": { htmlTag: "td" },
      "<xcel>": { htmlTag: "td" },
    };
    // Capturing group so String.prototype.split keeps the cell tokens.
    this.TABLE_TAG_REGEX = new RegExp(`(${Object.keys(this.TABLE_TAG_CONFIG).join("|")})`);
    const selfClosingNames = Object.keys(this.selfClosingTagMap).join("|");
    // Groups: 2 = paired tag name, 3 = paired tag content, 5 = unpaired token name.
    this.combinedTagRegex = new RegExp(`(<([a-z_0-9]+)>(.*?)<\\/\\2>)|(<(${selfClosingNames})>)`, "s");
  }

  /**
   * Escapes the characters that are significant in HTML text and
   * double-quoted attribute values.
   *
   * @param {string} text - Raw text.
   * @returns {string} Escaped text; an empty string for any falsy input.
   */
  escapeHtml(text) {
    if (!text) return "";
    // `&` must be replaced first so the entities produced below are not re-escaped.
    return text
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");
  }

  /**
   * Converts a DocTags string into an HTML fragment.
   *
   * @param {string} docling - DocTags markup.
   * @returns {string} HTML fragment with surrounding whitespace trimmed.
   */
  convert(docling) {
    const withoutMetadata = this.cleanupMetadataTokens(`${docling}`);
    return this.processTags(withoutMetadata).trim();
  }

  /**
   * Scans `text` for paired tags and unpaired tokens, converting each match
   * and escaping the text in between.
   *
   * @param {string} text - DocTags markup (possibly the content of an outer tag).
   * @returns {string} HTML for the whole of `text`.
   */
  processTags(text) {
    // matchAll needs a global regex; clone so the shared instance keeps its flags.
    const baseFlags = this.combinedTagRegex.flags;
    const scanner = new RegExp(this.combinedTagRegex, baseFlags.includes("g") ? baseFlags : `${baseFlags}g`);
    let result = "";
    let cursor = 0;
    for (const match of text.matchAll(scanner)) {
      result += this.escapeHtml(text.slice(cursor, match.index));
      const pairedTagName = match[2];
      const selfClosingTagName = match[5];
      if (pairedTagName !== undefined) {
        result += this.convertSingleTag(pairedTagName, match[3]);
      } else if (selfClosingTagName !== undefined) {
        result += this.selfClosingTagMap[selfClosingTagName] ?? "";
      }
      cursor = match.index + match[0].length;
    }
    return result + this.escapeHtml(text.slice(cursor));
  }

  /**
   * Converts one paired tag and its content to HTML.
   *
   * @param {string} tagName - DocTags element name (without angle brackets).
   * @param {string} content - Raw text between the opening and closing tag.
   * @returns {string} HTML for the element; unknown tags are returned escaped.
   */
  convertSingleTag(tagName, content) {
    // List items may carry a leading bullet glyph; drop it.
    const body = tagName === "list_item" ? content.trim().replace(/^[·-]\s*/, "") : content;
    switch (tagName) {
      case "code":
        return this.convertBlockCode(body);
      case "otsl":
        return this.convertTable(body);
      case "picture":
      case "chart":
        return this.convertPictureOrChart(tagName, body);
      case "inline":
        return this.convertInlineContent(body);
      case "section_header_level_0":
      case "section_header_level_1":
      case "section_header_level_2":
      case "section_header_level_3":
      case "section_header_level_4":
      case "section_header_level_5": {
        // Level 0..5 maps to <h1>..<h6>.
        const level = Number.parseInt(tagName.at(-1), 10) + 1;
        return `<h${level}>${this.processTags(body)}</h${level}>`;
      }
      default: {
        // Own-property check: names such as "constructor" must not resolve
        // to members inherited from Object.prototype.
        const htmlTag = Object.hasOwn(this.simpleTagMap, tagName) ? this.simpleTagMap[tagName] : undefined;
        if (htmlTag) {
          const processedContent = this.processTags(body);
          const startTag = this.getStartTag(tagName, htmlTag);
          return `${startTag}${processedContent}</${htmlTag}>`;
        }
        console.warn(`Unknown tag encountered: ${tagName}, escaping it.`);
        return this.escapeHtml(`<${tagName}>${body}</${tagName}>`);
      }
    }
  }

  /**
   * Returns the opening HTML tag for a DocTags element, adding a class or
   * attribute for the elements that need one.
   *
   * @param {string} doclingTag - DocTags element name.
   * @param {string} htmlTag - HTML element name from `simpleTagMap`.
   * @returns {string} Opening tag.
   */
  getStartTag(doclingTag, htmlTag) {
    switch (doclingTag) {
      case "doctag":
      case "document":
        return '<div class="docling-document">';
      case "formula":
        return '<div class="formula">';
      case "document_index":
        return '<div class="toc">';
      case "smiles":
        return '<span class="smiles">';
      case "reference":
        return '<a href="#">';
      default:
        return `<${htmlTag}>`;
    }
  }

  /**
   * Converts the content of an `<inline>` element. Only `code`, `formula`,
   * `text` and `smiles` children are recognised; everything else is escaped.
   *
   * @param {string} content - Raw content of the `<inline>` element.
   * @returns {string} Inline HTML.
   */
  convertInlineContent(content) {
    const inlineTagRegex = /<(code|formula|text|smiles)>(.*?)<\/\1>/gs;
    let result = "";
    let cursor = 0;
    for (const match of content.matchAll(inlineTagRegex)) {
      const [fullMatch, tagName, innerContent] = match;
      result += this.escapeHtml(content.slice(cursor, match.index));
      switch (tagName) {
        case "code":
          result += this.renderCodeElement(innerContent);
          break;
        case "formula":
          result += `<span class="formula">${this.escapeHtml(innerContent)}</span>`;
          break;
        case "smiles":
          result += `<span class="smiles">${this.escapeHtml(innerContent)}</span>`;
          break;
        case "text":
          result += this.escapeHtml(innerContent);
          break;
      }
      cursor = match.index + fullMatch.length;
    }
    return result + this.escapeHtml(content.slice(cursor));
  }

  /**
   * Converts the content of a block-level `<code>` element.
   *
   * @param {string} content - Raw code, optionally containing a `<_Language_>` token.
   * @returns {string} `<pre><code>…</code></pre>` HTML.
   */
  convertBlockCode(content) {
    return `<pre>${this.renderCodeElement(content)}</pre>`;
  }

  /**
   * Renders a `<code>` element shared by block and inline code.
   * When a `<_Language_>` token is present it is removed, the remaining code
   * is trimmed, and a `language-…` class is added (unless the language is
   * "unknown"). Without a token the content is escaped as-is.
   *
   * @param {string} content - Raw code text.
   * @returns {string} `<code>` element with escaped content.
   */
  renderCodeElement(content) {
    const langRegex = /<_(.*?)_>/;
    const langMatch = content.match(langRegex);
    if (!langMatch?.[1]) {
      return `<code>${this.escapeHtml(content)}</code>`;
    }
    const language = this.sanitizeLanguageName(langMatch[1]);
    const codeContent = content.replace(langRegex, "").trim();
    const langClass = language !== "unknown" ? ` class="language-${language}"` : "";
    return `<code${langClass}>${this.escapeHtml(codeContent)}</code>`;
  }

  /**
   * Converts OTSL table markup (cell tokens separated by `<nl>`) to an HTML table.
   *
   * `<lcel>`, `<ucel>` and `<xcel>` do not emit a cell; they extend the span of
   * the cell occupying the grid position to the left / above. Span targets are
   * resolved on a per-grid-position owner map, so row spans stay aligned after
   * column spans and can cover more than two rows.
   *
   * @param {string} content - OTSL markup.
   * @returns {string} `<table>` HTML.
   */
  convertTable(content) {
    const rowStrings = content
      .trim()
      .split(/<nl>/)
      .filter((row) => row.length > 0);
    // cellGrid[r] lists the cells emitted in row r.
    const cellGrid = [];
    // ownerGrid[r][c] is the cell covering grid position (r, c), or null.
    const ownerGrid = [];

    rowStrings.forEach((rowStr, rowIndex) => {
      // Odd indices hold cell tokens, the following even index holds their text.
      const parts = rowStr.split(this.TABLE_TAG_REGEX);
      const currentRow = [];
      const owners = [];
      // Grows `owner` so its span reaches the current grid position, then
      // records it as the owner of that position.
      const claim = (owner) => {
        if (owner) {
          const col = owners.length;
          owner.colspan = Math.max(owner.colspan, col - owner.col + 1);
          owner.rowspan = Math.max(owner.rowspan, rowIndex - owner.row + 1);
        }
        owners.push(owner ?? null);
      };

      for (let i = 1; i < parts.length; i += 2) {
        const tag = parts[i];
        const col = owners.length;
        const left = col > 0 ? owners[col - 1] : null;
        const up = rowIndex > 0 ? (ownerGrid[rowIndex - 1][col] ?? null) : null;
        switch (tag) {
          case "<lcel>":
            claim(left);
            break;
          case "<ucel>":
            claim(up);
            break;
          case "<xcel>":
            claim(up ?? left);
            break;
          default: {
            const cell = {
              content: parts[i + 1] ?? "",
              tag,
              colspan: 1,
              rowspan: 1,
              row: rowIndex,
              col,
            };
            currentRow.push(cell);
            owners.push(cell);
            break;
          }
        }
      }
      cellGrid.push(currentRow);
      ownerGrid.push(owners);
    });

    const htmlRows = cellGrid
      .map((row) => {
        const cellsHtml = row
          .map((cell) => {
            const config = this.TABLE_TAG_CONFIG[cell.tag];
            if (!config) return "";
            const attrs = [];
            if (cell.colspan > 1) attrs.push(`colspan="${cell.colspan}"`);
            if (cell.rowspan > 1) attrs.push(`rowspan="${cell.rowspan}"`);
            if (config.scope) attrs.push(`scope="${config.scope}"`);
            const attrString = attrs.length > 0 ? ` ${attrs.join(" ")}` : "";
            const processedContent = this.processTags(cell.content);
            return `<${config.htmlTag}${attrString}>${processedContent}</${config.htmlTag}>`;
          })
          .join("");
        return `<tr>${cellsHtml}</tr>`;
      })
      .join("");
    return `<table><tbody>${htmlRows}</tbody></table>`;
  }

  /**
   * Converts a `<picture>` or `<chart>` element.
   * Content containing table cell tokens is rendered as a table; otherwise a
   * `<figure>` with a placeholder `<img>` and optional `<figcaption>` is produced.
   *
   * @param {string} tag - "picture" or "chart".
   * @param {string} content - Raw content of the element.
   * @returns {string} HTML for the element.
   */
  convertPictureOrChart(tag, content) {
    if (/<(fcel|ched|rhed)>/.test(content)) {
      // Keep every table token (including empty and span cells) so columns
      // stay aligned; drop any other opening tag such as a chart classification.
      const cleanedContent = content.replace(/<[a-z_]+>/g, (token) =>
        token === "<nl>" || Object.hasOwn(this.TABLE_TAG_CONFIG, token) ? token : "",
      );
      return this.convertTable(cleanedContent);
    }

    const captionRegex = /<caption>(.*?)<\/caption>/s;
    const captionMatch = content.match(captionRegex);
    const captionHtml = captionMatch?.[1] ? `<figcaption>${this.processTags(captionMatch[1])}</figcaption>` : "";

    // The first remaining tag, if any, is used as the alt text
    // (underscores become spaces); otherwise the element name is used.
    const contentWithoutCaption = content.replace(captionRegex, "");
    const classMatch = contentWithoutCaption.match(/<([a-z_]+)>/);
    const altText = classMatch ? classMatch[1].replace(/_/g, " ") : tag;

    const imgHtml = `<img alt="${this.escapeHtml(altText)}" src="">`;
    const figureTag = this.simpleTagMap[tag] || "figure";
    return `<${figureTag}>${imgHtml}${captionHtml}</${figureTag}>`;
  }

  /**
   * Normalises a language name so it can be used in a `language-…` class.
   * Known names are mapped to short aliases; in all other names every
   * character outside `[a-z0-9_.-]` is replaced by `-`, which also keeps
   * quotes and angle brackets out of the generated attribute.
   *
   * @param {string} lang - Language name as found in a `<_Language_>` token.
   * @returns {string} Lower-case, attribute-safe language identifier.
   */
  sanitizeLanguageName(lang) {
    const lowerLang = lang.trim().toLowerCase();
    const aliasMap = {
      "c#": "csharp",
      "c++": "cpp",
      objectivec: "objective-c",
      visualbasic: "vb",
      javascript: "js",
      typescript: "ts",
      python: "py",
      ruby: "rb",
      dockerfile: "docker",
    };
    if (Object.hasOwn(aliasMap, lowerLang)) {
      return aliasMap[lowerLang];
    }
    return lowerLang.replace(/[^a-z0-9_.-]/g, "-");
  }

  /**
   * Removes `<loc_N>` location tokens.
   *
   * @param {string} docling - DocTags markup.
   * @returns {string} Markup without location tokens.
   */
  cleanupMetadataTokens(docling) {
    return docling.replace(/<loc_[0-9]+>/g, "");
  }
}
/**
 * Converts DocTags markup into a complete, self-contained HTML document.
 *
 * The converted fragment is placed in `<body>` together with inline CSS and a
 * module script that loads KaTeX from jsDelivr, renders every `.formula`
 * element and auto-renders `$…$`, `$$…$$`, `\(…\)` and `\[…\]` math.
 *
 * @param {string} docling - DocTags markup.
 * @returns {string} HTML document source.
 */
export function doclingToHtml(docling) {
  const converter = new DoclingConverter();
  const body = converter.convert(docling);
  // The backslashes in the delimiter list are doubled twice on purpose: once
  // for this template literal and once for the string literals in the
  // emitted script.
  return `<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.23/dist/katex.min.css" integrity="sha384-//SZkxyB7axjCAopkAL1E1rve+ZSPKapD89Lo/lLhcsXR+zOYl5z6zJZEFXil+q0" crossorigin="anonymous">
<style>
html {
  background-color: #f5f5f5;
  font-family: Arial, sans-serif;
  line-height: 1.6;
}
header, footer {
  text-align: center;
  margin-bottom: 1rem;
  font-size: 1em;
}
body {
  max-width: 800px;
  margin: 0 auto;
  padding: 2rem;
  background-color: white;
  box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1, h2, h3, h4, h5, h6 {
  color: #333;
  margin-top: 1.5em;
  margin-bottom: 0.5em;
}
h1 {
  font-size: 2em;
  border-bottom: 1px solid #eee;
  padding-bottom: 0.3em;
}
table {
  border-collapse: collapse;
  margin: 1em 0;
  width: 100%;
}
th, td {
  border: 1px solid #ddd;
  padding: 8px;
  text-align: left;
}
th {
  background-color: #f2f2f2;
  font-weight: bold;
}
figure {
  margin: 1.5em 0;
  text-align: center;
}
figcaption {
  color: #666;
  font-style: italic;
  margin-top: 0.5em;
}
img {
  max-width: 100%;
  height: auto;
}
pre {
  background-color: #f6f8fa;
  border-radius: 3px;
  padding: 1em;
  overflow: auto;
}
code {
  font-family: monospace;
  background-color: #f6f8fa;
  padding: 0.2em 0.4em;
  border-radius: 3px;
}
pre code {
  background-color: transparent;
  padding: 0;
}
.formula {
  text-align: center;
  padding: 0.5em;
  margin: 1em 0;
}
.formula:not(:has(.katex)) {
  color: transparent;
}
.page-break {
  page-break-after: always;
  border-top: 1px dashed #ccc;
  margin: 2em 0;
}
.key-value-region {
  background-color: #f9f9f9;
  padding: 1em;
  border-radius: 4px;
  margin: 1em 0;
}
.key-value-region dt {
  font-weight: bold;
}
.key-value-region dd {
  margin-left: 1em;
  margin-bottom: 0.5em;
}
.form-container {
  border: 1px solid #ddd;
  padding: 1em;
  border-radius: 4px;
  margin: 1em 0;
}
.form-item {
  margin-bottom: 0.5em;
}
</style>
</head>
<body>
${body}
<script type="module">
import katex from 'https://cdn.jsdelivr.net/npm/katex@0.16.23/dist/katex.mjs';
import renderMathInElement from "https://cdn.jsdelivr.net/npm/katex@0.16.23/dist/contrib/auto-render.mjs";
const mathElements = document.querySelectorAll('.formula');
for (let element of mathElements) {
  katex.render(element.textContent, element, {
    throwOnError: false,
  });
}
renderMathInElement(document.body, {
  delimiters: [
    {left: "$$", right: "$$", display: true},
    {left: "\\\\[", right: "\\\\]", display: true},
    {left: "$", right: "$", display: false},
    {left: "\\\\(", right: "\\\\)", display: false}
  ],
  throwOnError : false,
});
</script>
</body>
</html>`;
}