ZhouChuYue
fix: tech-style refresh + single-box markdown + auto height output
2bb462d
# -*- coding: utf-8 -*-
"""
UltraData Math Parser - Hugging Face Space Demo
A unified HTML parser optimized for extracting mathematical content.
"""
import gradio as gr
import requests
from ultradata_math_parser import GeneralParser
def fetch_url_content(url: str) -> tuple:
    """Fetch the HTML document behind ``url``.

    Args:
        url: Address typed by the user. Surrounding whitespace is ignored and
            ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        A ``(html, info)`` pair. On success ``html`` is the decoded response
        body and ``info`` is the normalized URL that was requested. On any
        failure ``html`` is the empty string and ``info`` is a human-readable
        error message. Request errors are reported this way instead of being
        raised.
    """
    if not url or not url.strip():
        return "", "Please enter a URL"

    url = url.strip()
    # URL schemes are case-insensitive; without lower() an input such as
    # "HTTP://host" would be rewritten to "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        # When the server sends a text Content-Type without a charset,
        # requests falls back to ISO-8859-1, which garbles UTF-8 pages.
        # In that case use the encoding detected from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding or response.encoding
        html = response.text
    except requests.exceptions.Timeout:
        return "", f"Request timed out for {url}"
    except requests.exceptions.RequestException as e:
        return "", f"Failed to fetch URL: {str(e)}"

    # Callers treat an empty first element as "failure, second element is the
    # message", so an empty page must carry a real message rather than the URL.
    if not html.strip():
        return "", f"No content returned from {url}"
    return html, url
def fetch_and_parse(
    url: str,
    process_math: bool,
    include_tables: bool,
    enable_forum: bool,
    html_type: str,
) -> tuple:
    """Fetch a URL and run the parser on the downloaded HTML in one step.

    Args:
        url: Address to download.
        process_math: Forwarded to ``parse_html``.
        include_tables: Forwarded to ``parse_html``.
        enable_forum: Forwarded to ``parse_html`` as ``enable_forum_assembly``.
        html_type: Forwarded to ``parse_html``.

    Returns:
        A 6-tuple ``(html_content, base_url, title, html_output, text_output,
        markdown_output)``. When the fetch fails, the HTML and base URL slots
        are empty and the title and markdown slots carry the error message.
    """
    html_content, fetch_info = fetch_url_content(url)
    if not html_content:
        # On failure the second element of the fetch result is an error
        # message, not a URL. It must not be returned in the base-URL slot,
        # otherwise the Base URL field is filled with the error text and a
        # later manual parse would use it as the base URL.
        error_msg = fetch_info
        return "", "", f"❌ {error_msg}", "", "", f"**Error:** {error_msg}"

    result = parse_html(
        html_content=html_content,
        base_url=fetch_info,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )
    title, html_output, text_output, markdown_output = format_output(result)
    return html_content, fetch_info, title, html_output, text_output, markdown_output
def _blank_parse_result(error):
    """Return a result dict with every content field blank and ``error`` set."""
    return {
        "title": "",
        "html": "",
        "text": "",
        "text_length": 0,
        "xp_num": "",
        "fallback_strategy": "",
        "forum_assembled": False,
        "error": error,
    }


def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
    html_type: str = "unified",
) -> dict:
    """
    Parse HTML content using GeneralParser.

    Args:
        html_content: Raw HTML string to parse
        base_url: Base URL for resolving relative links
        process_math: Whether to process and convert math expressions
        include_tables: Whether to preserve table elements
        enable_forum_assembly: Whether to enable forum post assembly
        html_type: Parser type (unified/article/forum)

    Returns:
        Dictionary with the keys ``title``, ``html``, ``text``,
        ``text_length``, ``xp_num``, ``fallback_strategy``,
        ``forum_assembled`` and ``error``. ``error`` is ``None`` on success
        and a non-empty message otherwise; this function reports failures
        through that key instead of raising.
    """
    if not html_content or not html_content.strip():
        return _blank_parse_result("Please provide HTML content to parse.")

    try:
        # The parser is constructed inside the try block so that a failure
        # while building it is reported like any other parsing failure.
        parser = GeneralParser()
        result = parser.extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
            html_type=html_type,
        )
        return {
            "title": result.get("title", ""),
            "html": result.get("html", ""),
            "text": result.get("text", ""),
            "text_length": result.get("text_length", 0),
            "xp_num": result.get("xp_num", ""),
            "fallback_strategy": result.get("fallback_strategy", ""),
            "forum_assembled": result.get("forum_assembled", False),
            "error": None,
        }
    except Exception as e:
        # Deliberately broad: this is the UI boundary and any parser failure
        # should become a message. Some exceptions stringify to "", which
        # would look like success to callers that test the "error" key for
        # truthiness, so fall back to the exception class name.
        return _blank_parse_result(str(e) or type(e).__name__)
def format_output(result: dict) -> tuple:
    """Convert a parse result dict into the values shown in the UI.

    Returns:
        A 4-tuple ``(title, html, text, markdown)``. If the result carries a
        truthy ``error``, the title and markdown slots hold the error message
        and the html/text slots are empty. Otherwise the markdown slot is the
        extracted text itself, or a placeholder when no text was extracted.
    """
    failure = result.get("error")
    if failure:
        return f"❌ Error: {failure}", "", "", f"**Error:** {failure}"

    extracted_text = result.get("text", "")
    # The extracted text is rendered as markdown unchanged; only an empty
    # extraction is replaced by a placeholder.
    preview = extracted_text or "_No content extracted_"
    return result.get("title", ""), result.get("html", ""), extracted_text, preview
def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
    """Parse the given HTML and return the formatted ``(title, html, text, markdown)`` tuple.

    ``enable_forum`` is passed to ``parse_html`` as ``enable_forum_assembly``;
    every other argument is forwarded under its own name.
    """
    parser_options = {
        "process_math": process_math,
        "include_tables": include_tables,
        "enable_forum_assembly": enable_forum,
        "html_type": html_type,
    }
    return format_output(
        parse_html(html_content=html_content, base_url=base_url, **parser_options)
    )
# Example HTML document used as the default value of the HTML input box.
# It contains an article with a MathML rendering of the quadratic formula,
# followed by a footer/nav element outside the article.
# The superscript-two and plus-minus characters are stored as real Unicode
# characters (they had been mis-decoded into "Β²" / "Β±").
EXAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
<title>Quadratic Formula Example</title>
</head>
<body>
<article class="post-content">
<h1>Understanding the Quadratic Formula</h1>
<p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
<p>The solution is given by:</p>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mo>-</mo>
<mi>b</mi>
<mo>±</mo>
<msqrt>
<mrow>
<msup><mi>b</mi><mn>2</mn></msup>
<mo>-</mo>
<mn>4</mn>
<mi>a</mi>
<mi>c</mi>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mi>a</mi>
</mrow>
</mfrac>
</math>
<p>Where a, b, and c are coefficients of the quadratic equation.</p>
<h2>Example Problem</h2>
<p>Solve: x² - 5x + 6 = 0</p>
<p>Here, a = 1, b = -5, c = 6</p>
<p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
<p>Therefore, x = 3 or x = 2</p>
</article>
<footer>
<nav>Related articles...</nav>
</footer>
</body>
</html>"""
# Custom stylesheet for the demo; it is handed to gr.Blocks(css=...) below.
# The string is layered in three parts: the base dark theme, a "Tech Refresh"
# section, and a final "Premium Clean Overrides" section. The later sections
# redeclare many of the same selectors and :root variables with !important,
# so where identical selectors conflict the declaration nearest the end of
# this string is the one that applies.
# NOTE(review): the original comment said "Matched with L3 Generator" —
# presumably a sibling demo whose look this mirrors; confirm.
# NOTE(review): the "/* ... (keep existing styles) ... */" marker inside the
# string looks like a leftover placeholder — confirm no rules were dropped.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
--primary-color: #6366f1;
--text-light: #f8fafc;
--text-gray: #cbd5e1;
--panel-bg: rgba(15, 23, 42, 0.6);
}
body {
background-color: #0f172a;
color: var(--text-light);
}
.gradio-container {
font-family: 'Inter', sans-serif !important;
background: radial-gradient(circle at top left, #1e1b4b, #0f172a) !important;
max-width: 95% !important; /* Increased width for better split view */
}
/* Force all text to be light by default to combat Gradio's light theme defaults */
.gradio-container * {
color: var(--text-light);
}
/* ... (keep existing styles) ... */
/* Output Box Styling - Fixed Height */
.output-textbox, .markdown-box {
height: 600px !important;
max-height: 600px !important;
overflow-y: auto !important;
background-color: #1e293b !important;
border: 1px solid #64748b !important;
border-radius: 8px !important;
padding: 1rem !important;
}
.output-textbox textarea {
background-color: transparent !important;
border: none !important;
box-shadow: none !important;
height: 100% !important;
color: #ffffff !important;
}
.markdown-box {
background-color: #f8fafc !important; /* Light background for markdown readability */
color: #0f172a !important; /* Dark text for markdown */
}
.markdown-box * {
color: #0f172a !important;
}
.main-title {
font-family: 'Inter', sans-serif !important;
font-weight: 800 !important;
font-size: 3rem !important;
background: linear-gradient(to right, #818cf8, #c084fc, #f472b6) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
text-align: center !important;
margin-bottom: 0.5rem !important;
/* Reset color for gradient text */
color: transparent !important;
}
.subtitle {
text-align: center !important;
color: var(--text-gray) !important;
font-size: 1.1rem !important;
margin-bottom: 3rem !important;
font-weight: 300 !important;
}
/* Panels */
.glass-panel {
background: var(--panel-bg) !important;
backdrop-filter: blur(12px) !important;
border: 1px solid rgba(255, 255, 255, 0.1) !important;
border-radius: 16px !important;
padding: 24px !important;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
}
/* Labels - High Contrast, Clean Black */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span {
color: #000000 !important; /* Pure Black */
font-weight: 700 !important;
font-size: 1rem !important;
margin-bottom: 0.5rem !important;
text-shadow: 0 0 2px #ffffff, 0 0 4px #ffffff !important; /* White glow/outline for visibility */
background-color: transparent !important;
padding: 0 !important;
}
/* Info Text (Description) */
span.description, .description {
color: #000000 !important;
font-weight: 600 !important;
text-shadow: 0 0 2px #ffffff !important;
opacity: 1 !important;
}
/* Fix Radio/Checkbox alignment & styling */
fieldset label span {
margin-bottom: 0 !important;
text-shadow: none !important;
font-weight: 600 !important;
color: #0f172a !important; /* Dark text for unselected options (white background) */
display: flex !important;
align-items: center !important;
}
/* Selected radio button text should be white */
fieldset label.selected span {
color: #ffffff !important;
}
/* Radio group title (e.g., Difficulty Level) */
fieldset legend, fieldset legend span,
.gr-radio > label, .gr-radio > label span,
.gradio-container .label-wrap, .gradio-container .label-wrap span {
color: #000000 !important;
font-weight: 700 !important;
text-shadow: 0 0 2px #ffffff, 0 0 4px #ffffff !important;
}
/* Inputs & Textareas - Dark Grey Background for Contrast */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
font-family: 'JetBrains Mono', monospace !important;
background-color: #1e293b !important; /* Slate 800 - Lighter than bg, darker than text */
border: 1px solid #64748b !important; /* Visible Slate Border */
color: #ffffff !important;
box-shadow: none !important;
}
/* Focus state */
.gr-input:focus, textarea:focus, input:focus {
border-color: #ffffff !important; /* White border on focus */
background-color: #334155 !important; /* Slightly lighter on focus */
}
/* Override default block backgrounds */
.gradio-container .block, .gradio-container .panel {
background-color: transparent !important;
border: none !important;
}
/* Fix for dropdown options background */
ul.options, .gr-dropdown-options {
background-color: #1e293b !important;
color: #ffffff !important;
border: 1px solid #64748b !important;
}
/* Markdown prose */
.prose, .prose p, .prose h1, .prose h2, .prose h3, .prose strong, .prose li {
color: var(--text-light) !important;
}
/* Buttons */
.gr-button-primary {
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
border-radius: 8px !important;
transition: transform 0.2s, box-shadow 0.2s !important;
}
.gr-button-primary:hover {
transform: translateY(-1px) !important;
box-shadow: 0 10px 15px -3px rgba(99, 102, 241, 0.3) !important;
}
.gr-button-secondary {
background: rgba(30, 41, 59, 0.8) !important;
border: 1px solid rgba(148, 163, 184, 0.3) !important;
color: var(--text-gray) !important;
border-radius: 8px !important;
}
.gr-button-secondary:hover {
background: rgba(51, 65, 85, 0.8) !important;
color: var(--text-light) !important;
}
/* Tabs */
.tabs {
border: none !important;
background: transparent !important;
margin-bottom: 2rem !important;
}
.tab-nav {
border-bottom: 1px solid rgba(148, 163, 184, 0.2) !important;
justify-content: center !important;
}
.tab-nav button {
font-weight: 600 !important;
font-size: 1rem !important;
color: var(--text-gray) !important;
transition: all 0.3s ease !important;
}
.tab-nav button.selected {
color: #818cf8 !important;
border-bottom: 2px solid #818cf8 !important;
}
/* Section Header */
.section-header {
color: #818cf8 !important;
font-weight: 700 !important;
font-size: 1.5rem !important;
margin-bottom: 1.5rem !important;
padding-bottom: 0.5rem !important;
border-bottom: 2px solid rgba(129, 140, 248, 0.3) !important;
text-shadow: 0 2px 4px rgba(0,0,0,0.3);
}
/* Footer */
.footer-text {
text-align: center;
margin-top: 4rem;
padding: 2rem;
color: var(--text-gray);
font-size: 0.9rem;
border-top: 1px solid rgba(148, 163, 184, 0.1);
}
.footer-text a {
color: #818cf8 !important;
}
/* Accordion */
.gr-accordion {
background-color: rgba(30, 41, 59, 0.4) !important;
border: 1px solid rgba(148, 163, 184, 0.2) !important;
}
/* ===== Tech Refresh (Cleaner + Single Box Output) ===== */
:root {
--bg: #0b1020;
--surface: #0f172a;
--surface-2: #111827;
--border: #1f2937;
--text: #e5e7eb;
--muted: #94a3b8;
--accent: #38bdf8;
--accent-2: #8b5cf6;
}
body {
background-color: var(--bg) !important;
color: var(--text) !important;
}
.gradio-container {
background:
radial-gradient(circle at 15% 10%, rgba(56, 189, 248, 0.08), transparent 40%),
radial-gradient(circle at 85% 20%, rgba(139, 92, 246, 0.08), transparent 45%),
linear-gradient(180deg, #0b1020 0%, #0b1020 100%) !important;
background-size: auto, auto, auto !important;
}
.main-title {
background: linear-gradient(90deg, #e5e7eb, #c7d2fe, #38bdf8) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
}
.subtitle {
color: var(--muted) !important;
}
.glass-panel {
background: linear-gradient(180deg, rgba(15, 23, 42, 0.95), rgba(17, 24, 39, 0.95)) !important;
border: 1px solid rgba(56, 189, 248, 0.15) !important;
box-shadow:
0 0 0 1px rgba(139, 92, 246, 0.12),
0 20px 40px rgba(2, 6, 23, 0.55) !important;
}
.section-header {
color: var(--text) !important;
border-bottom: 1px solid rgba(56, 189, 248, 0.2) !important;
text-shadow: none !important;
}
/* Labels & descriptions */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span {
color: var(--text) !important;
text-shadow: none !important;
}
span.description, .description {
color: var(--muted) !important;
text-shadow: none !important;
}
/* Inputs */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
background-color: var(--surface-2) !important;
border: 1px solid var(--border) !important;
color: var(--text) !important;
}
.gr-input:focus, textarea:focus, input:focus {
border-color: var(--accent) !important;
box-shadow: 0 0 0 2px rgba(56, 189, 248, 0.15) !important;
}
/* Buttons */
.gr-button-primary {
background: linear-gradient(135deg, #2563eb 0%, #7c3aed 100%) !important;
box-shadow: 0 8px 20px rgba(37, 99, 235, 0.2) !important;
}
.gr-button-primary:hover {
background: linear-gradient(135deg, #1d4ed8 0%, #6d28d9 100%) !important;
}
.gr-button-secondary {
background: transparent !important;
border: 1px solid rgba(148, 163, 184, 0.35) !important;
color: var(--text) !important;
}
/* Tabs */
.tab-nav button {
color: var(--muted) !important;
}
.tab-nav button.selected {
color: var(--text) !important;
border-bottom: 2px solid var(--accent) !important;
}
/* Output: single box + auto height */
.output-textbox {
background-color: var(--surface-2) !important;
border: 1px solid var(--border) !important;
border-radius: 12px !important;
padding: 12px !important;
min-height: 220px !important;
max-height: 560px !important;
height: auto !important;
overflow-y: auto !important;
}
.output-textbox textarea {
background-color: transparent !important;
border: none !important;
box-shadow: none !important;
color: var(--text) !important;
min-height: 200px !important;
max-height: 520px !important;
height: auto !important;
overflow-y: auto !important;
}
.markdown-box {
background: transparent !important;
border: none !important;
padding: 0 !important;
}
.markdown-box .prose {
background-color: var(--surface-2) !important;
border: 1px solid var(--border) !important;
border-radius: 12px !important;
padding: 16px !important;
min-height: 220px !important;
max-height: 560px !important;
overflow-y: auto !important;
}
.markdown-box, .markdown-box * {
color: var(--text) !important;
}
.markdown-box code, .markdown-box pre {
background: #1f2937 !important;
}
/* === Premium Clean Overrides === */
:root {
--bg: #0b1120;
--surface: #111827;
--surface-2: #0f172a;
--border: #1f2937;
--text: #e5e7eb;
--muted: #94a3b8;
--accent: #6366f1;
}
body {
background-color: var(--bg) !important;
color: var(--text) !important;
}
.gradio-container {
background: var(--bg) !important;
width: 95vw !important;
max-width: 1400px !important;
margin: 0 auto !important;
}
.main-title {
background: none !important;
-webkit-text-fill-color: unset !important;
color: var(--text) !important;
font-size: 2.4rem !important;
letter-spacing: -0.01em !important;
}
.subtitle {
color: var(--muted) !important;
margin-bottom: 2rem !important;
}
.glass-panel {
background: linear-gradient(180deg, rgba(17, 24, 39, 0.95), rgba(15, 23, 42, 0.95)) !important;
border: 1px solid var(--border) !important;
box-shadow: 0 12px 32px rgba(0, 0, 0, 0.35) !important;
}
.section-header {
color: var(--text) !important;
border-bottom: 1px solid var(--border) !important;
text-shadow: none !important;
}
/* Labels & descriptions */
.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span,
span.description,
.description {
color: var(--muted) !important;
text-shadow: none !important;
}
/* Inputs */
.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
background-color: var(--surface-2) !important;
border: 1px solid var(--border) !important;
color: var(--text) !important;
}
.gr-input:focus, textarea:focus, input:focus {
border-color: var(--accent) !important;
background-color: #111827 !important;
}
/* Buttons */
.gr-button-primary {
background: var(--accent) !important;
box-shadow: none !important;
color: #ffffff !important;
}
.gr-button-primary:hover {
background: #4f46e5 !important;
}
.gr-button-secondary {
background: transparent !important;
border: 1px solid var(--border) !important;
color: var(--text) !important;
}
.gr-button-secondary:hover {
background: rgba(31, 41, 55, 0.6) !important;
}
/* Tabs */
.tab-nav button {
color: var(--muted) !important;
}
.tab-nav button.selected {
color: var(--text) !important;
border-bottom: 2px solid var(--accent) !important;
}
/* Output */
.output-textbox, .markdown-box {
background-color: var(--surface-2) !important;
border: 1px solid var(--border) !important;
height: 560px !important;
max-height: 560px !important;
}
.markdown-box, .markdown-box * {
color: var(--text) !important;
}
.markdown-box code, .markdown-box pre {
background: #1f2937 !important;
}
"""
# Build Gradio interface.
# Layout: a title and subtitle, then one row with two columns. The left
# column holds the inputs (URL tab, HTML textbox, base URL, advanced options,
# action buttons); the right column holds the outputs (title plus three tabs
# for markdown, plain text and raw HTML). Event wiring and the footer follow.
# NOTE(review): several emoji in the labels below appear mis-decoded
# (mojibake, e.g. "πŸ“"); the strings are left as found — confirm against
# the original file encoding.
with gr.Blocks(title="UltraData Math Parser", css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.HTML('<h1 class="main-title">πŸ“ UltraData Math Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')
    with gr.Row():
        # ---- Left column: inputs ----
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">πŸ“₯ Input</div>')
            with gr.Tabs():
                with gr.TabItem("πŸ”— URL"):
                    url_input = gr.Textbox(
                        label="URL",
                        placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
                        lines=3,
                        max_lines=5,
                        value="https://math.stackexchange.com/questions/5120625/ode-problem-of-yt-sqrtyt-with-the-inital-value-y0-1-t-geq-0",
                    )
                    fetch_btn = gr.Button("πŸ“₯ Fetch & Parse", variant="primary", size="lg")
                with gr.TabItem("πŸ“ HTML"):
                    pass  # Intentionally empty: the HTML input lives below the tabs so both tabs share it
            # Shared HTML input; pre-filled with the bundled example document.
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here or fetch from URL above...",
                lines=10,
                max_lines=20,
                value=EXAMPLE_HTML,
            )
            base_url_input = gr.Textbox(
                label="Base URL (Auto-filled from URL fetch)",
                placeholder="https://example.com/page",
                lines=1,
            )
            # Parser options; their values are passed to the handlers below.
            with gr.Accordion("βš™οΈ Advanced Options", open=False):
                html_type = gr.Radio(
                    choices=["unified", "article", "forum"],
                    value="unified",
                    label="Parser Type",
                    info="Select the parsing strategy",
                )
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )
            with gr.Row():
                parse_btn = gr.Button("πŸš€ Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary", size="lg")
        # ---- Right column: outputs ----
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">πŸ“€ Output</div>')
            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )
            with gr.Tabs():
                with gr.TabItem("✨ Markdown"):
                    # Rendered view of the extracted text; the delimiter list
                    # declares which markers enclose display vs. inline math.
                    markdown_output = gr.Markdown(
                        value="### Output will appear here...",
                        label="Markdown Preview",
                        elem_classes=["markdown-box"],
                        latex_delimiters=[
                            {"left": "$$", "right": "$$", "display": True},
                            {"left": "$", "right": "$", "display": False},
                            {"left": "\\[", "right": "\\]", "display": True},
                            {"left": "\\(", "right": "\\)", "display": False},
                        ],
                    )
                with gr.TabItem("πŸ“„ Plain Text"):
                    text_output = gr.Textbox(
                        label="Plain Text (w3m rendered)",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
                with gr.TabItem("πŸ“ Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
    # Event handlers
    # "Fetch & Parse": the six outputs follow the order of the 6-tuple that
    # fetch_and_parse returns (html, base URL, title, html, text, markdown).
    fetch_btn.click(
        fn=fetch_and_parse,
        inputs=[url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )
    # "Parse HTML": parses whatever is currently in the HTML textbox; the four
    # outputs follow the order of the tuple returned via format_output.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[title_output, html_output, text_output, markdown_output],
    )

    def clear_all():
        """Return one empty string for each of the seven components wired to the Clear button."""
        return "", "", "", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )
    # Footer info
    gr.HTML("""
<div class="footer-text">
<p>πŸ”¬ <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
<p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
</div>
""")

# Start the app only when this file is run as a script.
# NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
# rendering — confirm against the installed Gradio version.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)