13ze's picture
Update app.py
9d2b078 verified
import gradio as gr
import requests
from markdownify import markdownify
import traceback # To help format potential errors
from readability import Document
from bs4 import BeautifulSoup
import re # Import regex for potentially cleaning readability titles
# Settings applied to every outbound HTTP request made by this module:
# the timeout value handed to requests, and the User-Agent we identify as.
DEFAULT_TIMEOUT = 20
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}
def _collapse_whitespace(text):
    """Return *text* with each whitespace run squeezed to one space and both ends trimmed."""
    return " ".join(text.split())


def _extract_with_readability(cleaned_html):
    """Run Readability over *cleaned_html* and return a ``(html, title)`` pair.

    When Readability raises, or its summary contains no visible text, the pair
    is ``(cleaned_html, None)`` so the caller converts the whole cleaned
    document and ignores whatever title Readability reported.
    """
    print("Attempting to extract main content using Readability...")
    try:
        doc = Document(cleaned_html)
        readability_title = doc.title()
        summary_html = doc.summary()
        # A summary made only of empty wrapper tags is useless; treat it as a failure.
        if BeautifulSoup(summary_html, 'html.parser').text.strip():
            print(f"Readability extracted title: '{readability_title}'. Using summary.")
            return summary_html, readability_title
        print("Readability summary was empty. Falling back to cleaned full HTML.")
    except Exception as e:
        # Best-effort step: any Readability failure just means "use the full page".
        print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
    return cleaned_html, None


def _choose_title(readability_title, cleaned_html):
    """Return the title to print above the Markdown, or ``None`` if there is none.

    Priority 1 is the Readability title, accepted only when (after whitespace
    normalisation) it is longer than 3 characters and does not start with
    ``[``. Priority 2 is the text of the first ``<h1>`` in *cleaned_html*.
    """
    # Normalise BEFORE validating so padding cannot sneak a bad title past the
    # checks, and so a multi-line <title> cannot break the one-line "# ..." heading.
    candidate = _collapse_whitespace(readability_title or "")
    # NOTE(review): the '[' test presumably filters Readability's bracketed
    # "no title" placeholder -- confirm against the readability-lxml docs.
    if len(candidate) > 3 and not candidate.startswith('['):
        print(f"Using Readability title: '{candidate}'")
        return candidate

    print("Readability title not suitable or not found. Looking for H1 fallback...")
    h1_tag = BeautifulSoup(cleaned_html, 'html.parser').find('h1')
    if h1_tag is not None:
        # get_text(strip=True) would strip every text node and glue them together
        # ("Hello <b>World</b>" -> "HelloWorld"); collapse whitespace instead.
        h1_text = _collapse_whitespace(h1_tag.get_text())
        if h1_text:
            print(f"Using H1 fallback title: '{h1_text}'")
            return h1_text
    return None


def _drop_duplicate_h1(processed_html, final_title):
    """Return *processed_html* without its first ``<h1>`` when that heading repeats *final_title*.

    The comparison ignores case and whitespace differences. If there is no
    ``<h1>`` or it does not match, *processed_html* is returned unchanged.
    """
    print("Checking for title duplication in processed HTML (first H1)...")
    soup = BeautifulSoup(processed_html, 'html.parser')
    first_h1 = soup.find('h1')
    if first_h1 is None:
        return processed_html
    h1_text = _collapse_whitespace(first_h1.get_text())
    if h1_text.casefold() != _collapse_whitespace(final_title).casefold():
        return processed_html
    print(f"Found matching H1 ('{h1_text}') in content. Removing it to prevent duplication.")
    first_h1.decompose()
    return str(soup)


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML (fetched from a URL or pasted directly) to Markdown.

    Args:
        url: Address to fetch. Takes priority over ``html_input`` when non-empty.
            ``https://`` is prepended if no http/https scheme is present.
        html_input: Raw HTML, used only when ``url`` is empty.

    Returns:
        The Markdown text, with a ``# title`` line prepended when a title was
        found (Readability's title, else the first ``<h1>``). On any problem
        the return value is a human-readable message starting with an emoji
        marker instead; this function does not raise.
    """
    html_content = ""
    source = ""
    use_readability = True
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""
    try:
        # --- Step 1: Get HTML Content ---
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                # Schemes are case-insensitive; "HTTP://x" must not get a second scheme.
                if not url.lower().startswith(('http://', 'https://')):
                    url = 'https://' + url
                    source = f"URL ({url})"
                    print(f"Scheme missing, prepended https://. New URL: {url}")
                # NOTE(review): the URL is user-supplied and fetched server-side with
                # no host restrictions -- consider blocking private/internal addresses.
                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Successfully fetched {len(html_content)} bytes from URL.")
            except requests.exceptions.MissingSchema:
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Pre-cleaning before Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."
        print("Pre-cleaning HTML...")
        soup_pre = BeautifulSoup(html_content, 'html.parser')
        # Drop non-content and page-chrome elements before extraction.
        for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()
        cleaned_html = str(soup_pre)

        # --- Step 2: Extract Main Content and Title (using Readability) ---
        processed_html = cleaned_html
        readability_title = None
        if use_readability:
            processed_html, readability_title = _extract_with_readability(cleaned_html)

        # --- Title Decision Logic ---
        final_title = _choose_title(readability_title, cleaned_html)

        # --- Prevent Title Duplication in Content ---
        if final_title:
            processed_html = _drop_duplicate_h1(processed_html, final_title)

        # --- Step 3: Convert the Processed HTML to Markdown ---
        if not processed_html.strip():
            return "❓ The HTML content (after processing) appears to be empty."
        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(
                processed_html,
                heading_style="ATX",
                bullets='*',
            ).strip()
            if final_title:
                final_markdown = f"# {final_title}\n\n{markdown_output}"
            else:
                final_markdown = markdown_output
            if not final_markdown.strip():
                return "ℹ️ The conversion resulted in empty Markdown."
            return final_markdown.strip()
        except Exception:
            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
    except Exception:
        # Last-resort boundary: the UI always gets a string back, never an exception.
        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
# --- Gradio Interface (Standard) ---
# Static copy shown around the form: the page heading, the intro blurb above
# the inputs, and the "how it works" notes. Each text is assembled line by
# line; the empty first/last entries reproduce the leading and trailing newline.
title = "HTML to Markdown Converter (Smart Extraction)"

description = "\n".join((
    "",
    "Enter a URL **or** paste HTML code directly into the text box below.",
    "The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.",
    "The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.",
    "",
))

article = "\n".join((
    "",
    "**How it works:**",
    "1. Fetches HTML from URL or uses pasted input.",
    "2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).",
    "3. Uses `readability-lxml` to extract the main content and attempt to find a page title.",
    "4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.",
    "5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.",
    "6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.",
    "7. Prepends the determined title (if any) to the final Markdown output.",
    "8. Displays the raw Markdown code in the output box with a copy button.",
    "",
))
# Input widgets: a single-line URL field and a 10-line box for pasted HTML.
url_input = gr.Textbox(
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
    label="Enter URL (gets priority)",
)
html_input_area = gr.Textbox(
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
    label="Or Paste HTML Code Here",
    lines=10,
)

# Output widget: a read-only Textbox with a copy button, so the Markdown is
# shown as plain text.
markdown_output_textbox = gr.Textbox(
    show_copy_button=True,
    interactive=False,
    lines=20,
    label="Converted Markdown Code Output",
)
# Sample inputs offered under the form; each row is [url, html]. The last row
# has no <h1>, to exercise the no-title path.
_example_rows = [
    ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
    ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
    ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
    ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"],
]

# Wire the converter to its widgets with the standard Interface layout.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    examples=_example_rows,
    cache_examples=False,
    allow_flagging='never',
    title=title,
    description=description,
    article=article,
)
# Launch the app, but only when this file is executed directly.
# Reminder: requirements.txt includes:
# gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
if __name__ == "__main__":
    iface.launch()