# Spaces: Sleeping
# (Hosting-page status text captured together with the source; not part of the program.)
import re  # Import regex for potentially cleaning readability titles
import traceback  # To help format potential errors

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from readability import Document
# Configure requests with a timeout and user-agent.
# DEFAULT_TIMEOUT is passed as ``timeout=`` to ``requests.get`` (seconds) and is
# also quoted in the time-out error message shown to the user.
DEFAULT_TIMEOUT = 20
# Sent with every outgoing request so the remote site can identify this tool.
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
def _collapse_whitespace(text: str) -> str:
    """Return *text* with each whitespace run reduced to one space and the ends trimmed."""
    return " ".join(text.split())


def _ensure_scheme(url: str) -> str:
    """Return *url* unchanged if it starts with http(s)://, else prefixed with ``https://``."""
    # Compare case-insensitively: "HTTP://example.com" already has a scheme
    # and must not be turned into "https://HTTP://example.com".
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'https://' + url


def _heading_text(tag) -> str:
    """Return the visible text of a BeautifulSoup tag with whitespace normalized."""
    # ``get_text(strip=True)`` strips every text fragment and joins them with
    # no separator, so "<h1>Hello <b>World</b></h1>" would become "HelloWorld".
    # Collapsing whitespace on the full text keeps the word boundaries.
    return _collapse_whitespace(tag.get_text())


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (from a URL or direct input) to Markdown.

    The URL takes priority when both inputs are given. The HTML is pre-cleaned
    (scripts, styles, navigation, header/footer and similar tags are removed),
    then Readability is used to extract the main content and a title. If the
    Readability title is unusable, the first ``<h1>`` of the cleaned HTML is
    used instead. When the first ``<h1>`` of the extracted content equals the
    chosen title it is removed so the title is not emitted twice.

    Args:
        url: Address to fetch. ``https://`` is prepended when no http(s)
            scheme is present. May be empty or ``None``.
        html_input: Raw HTML used when ``url`` is empty. May be empty or ``None``.

    Returns:
        The Markdown text (prefixed with ``# <title>`` when a title was found),
        or a human-readable message starting with ❌ / ❓ / ℹ️ when something
        went wrong. Exceptions are caught and reported as text rather than raised.
    """
    html_content = ""
    source = ""
    use_readability = True
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""
    try:
        # --- Step 1: Get HTML Content ---
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                normalized_url = _ensure_scheme(url)
                if normalized_url != url:
                    url = normalized_url
                    print(f"Scheme missing, prepended https://. New URL: {url}")
                # NOTE(review): the URL is user-supplied and fetched from the
                # server side; consider blocking private/internal addresses if
                # the deployment environment makes that a concern.
                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                # Decode using the encoding detected from the body itself.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Successfully fetched {len(html_content)} bytes from URL.")
            except requests.exceptions.MissingSchema:
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Pre-cleaning before Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."
        print("Pre-cleaning HTML...")
        soup_pre = BeautifulSoup(html_content, 'html.parser')
        # More aggressive cleaning: drop non-content and page-chrome elements.
        for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()
        cleaned_html = str(soup_pre)  # Use this cleaned version going forward

        # --- Step 2: Extract Main Content and Title (using Readability) ---
        processed_html = cleaned_html  # Default to cleaned HTML
        readability_title = None
        final_title = None  # Title to be used in the final output
        if use_readability:
            print("Attempting to extract main content using Readability...")
            try:
                doc = Document(cleaned_html)
                readability_title = doc.title()
                processed_html_summary = doc.summary()
                # Only accept the summary if it actually contains text.
                soup_summary_check = BeautifulSoup(processed_html_summary, 'html.parser')
                if soup_summary_check.text.strip():
                    processed_html = processed_html_summary
                    print(f"Readability extracted title: '{readability_title}'. Using summary.")
                else:
                    print("Readability summary was empty. Falling back to cleaned full HTML.")
                    readability_title = None  # Discard title if summary failed
            except Exception as e:
                # Best effort: Readability failing must not abort the conversion.
                print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
                readability_title = None

        # --- Title Decision Logic ---
        # Priority 1: Readability title (if good). Normalize first so the
        # length check ignores padding and the heading stays on one line.
        if readability_title:
            candidate_title = _collapse_whitespace(readability_title)
            # Basic validity check: long enough and not a "[...]" placeholder.
            if len(candidate_title) > 3 and not candidate_title.startswith('['):
                final_title = candidate_title
                print(f"Using Readability title: '{final_title}'")
        # Priority 2: Fallback to first H1 from CLEANED HTML.
        if not final_title:
            print("Readability title not suitable or not found. Looking for H1 fallback...")
            h1_tag = BeautifulSoup(cleaned_html, 'html.parser').find('h1')
            if h1_tag:
                h1_text = _heading_text(h1_tag)
                if h1_text:
                    final_title = h1_text
                    print(f"Using H1 fallback title: '{final_title}'")

        # --- Prevent Title Duplication in Content ---
        if final_title:
            print("Checking for title duplication in processed HTML (first H1)...")
            soup_proc = BeautifulSoup(processed_html, 'html.parser')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc:
                h1_proc_text = _heading_text(first_h1_in_proc)
                # Both sides are whitespace-normalized, so formatting
                # differences inside the tag do not defeat the comparison.
                if h1_proc_text == final_title:
                    print(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    processed_html = str(soup_proc)  # Update the HTML to be converted

        # --- Step 3: Convert the Processed HTML to Markdown ---
        if not processed_html.strip():
            return "❓ The HTML content (after processing) appears to be empty."
        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(
                processed_html,
                heading_style="ATX",
                bullets='*',
            ).strip()
            # Assemble final output: prepend the decided title if one exists.
            if final_title:
                final_markdown = f"# {final_title}\n\n{markdown_output}"
            else:
                final_markdown = markdown_output
            if not final_markdown.strip():
                return "ℹ️ The conversion resulted in empty Markdown."
            return final_markdown.strip()
        except Exception:
            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
    except Exception:
        # Top-level boundary for the UI: report the failure as text.
        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
# --- Gradio Interface (Standard) ---
# Texts shown by the interface: page title, intro (above the inputs) and
# explanatory article (passed as ``article=``).
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works:**
1. Fetches HTML from URL or uses pasted input.
2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
7. Prepends the determined title (if any) to the final Markdown output.
8. Displays the raw Markdown code in the output box with a copy button.
"""

# Define input components. Their order in ``inputs=`` below matches the
# positional parameters (url, html_input) of ``html_to_markdown_converter``.
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
)
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
)

# Define output component as a read-only Textbox with a copy button, so the
# raw Markdown source is shown rather than rendered.
markdown_output_textbox = gr.Textbox(
    label="Converted Markdown Code Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)

# Create the standard Gradio interface.
# NOTE(review): newer Gradio releases appear to rename ``allow_flagging`` to
# ``flagging_mode`` — confirm against the Gradio version pinned for this app.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    # Each example is [url, html_input]; exactly one of the two is filled in.
    examples=[
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        # Add an example without H1 to test no-title scenario
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"],
    ],
    cache_examples=False,
)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # Reminder: requirements.txt includes:
    # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
    iface.launch()