Spaces:

13ze
/

smart-scrape-html-to-md

Runtime error

File size: 10,381 Bytes

48fcfd2
 
 
 
 
 
9d2b078
48fcfd2
 
9d2b078
48fcfd2
 
 
 
 
 
9d2b078
48fcfd2
 
 
 
9d2b078
48fcfd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
48fcfd2
9d2b078
48fcfd2
9d2b078
48fcfd2
9d2b078
48fcfd2
 
 
 
 
 
9d2b078
48fcfd2
9d2b078
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
 
 
 
9d2b078
 
48fcfd2
9d2b078
 
 
 
 
 
48fcfd2
9d2b078
 
 
 
48fcfd2
9d2b078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
 
 
9d2b078
48fcfd2
9d2b078
48fcfd2
9f2b249
 
 
9d2b078
 
 
 
 
 
 
48fcfd2
9d2b078
 
48fcfd2
9d2b078
 
48fcfd2
9d2b078
48fcfd2
 
9d2b078
48fcfd2
 
9d2b078
48fcfd2
fd8df2d
6166ec0
 
fd8df2d
 
9d2b078
6166ec0
fd8df2d
 
 
9d2b078
 
 
 
 
 
 
 
1d79c85
 
9f2b249
6166ec0
 
9f2b249
6166ec0
 
 
bd8c0ac
6166ec0
 
 
9f2b249
6166ec0
 
bd8c0ac
 
 
6166ec0
 
9f2b249
6166ec0
bd8c0ac
 
 
6166ec0
 
 
 
bd8c0ac
 
 
9d2b078
 
 
6166ec0
9d2b078
6166ec0
 
 
fd8df2d
9d2b078
 
6166ec0

import gradio as gr
import requests
from markdownify import markdownify
import traceback # To help format potential errors
from readability import Document
from bs4 import BeautifulSoup
import re # Import regex for potentially cleaning readability titles

# Shared settings for outbound HTTP requests: the timeout (in seconds) and
# the User-Agent header this tool identifies itself with.
DEFAULT_TIMEOUT = 20
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}

# Matches a leading URL scheme such as "http://", "HTTPS://" or "ftp://".
_URL_SCHEME_RE = re.compile(r'([A-Za-z][A-Za-z0-9+.-]*)://')

# Tags removed before content extraction because they rarely hold article text.
_NOISE_TAGS = ['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']


def _collapse_whitespace(text: str) -> str:
    """Return *text* trimmed, with every whitespace run reduced to one space."""
    return ' '.join(text.split())


def _normalize_url(url: str) -> "str | None":
    """Return *url* with an http(s) scheme, or None if it names another scheme."""
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme name is missing.
        return 'https:' + url
    match = _URL_SCHEME_RE.match(url)
    if match is None:
        return 'https://' + url
    if match.group(1).lower() in ('http', 'https'):
        return url
    return None


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    The URL takes priority when both inputs are given. The HTML is pre-cleaned
    of non-content tags, the main content is extracted with readability, a
    title is chosen (readability title first, first <h1> of the cleaned HTML
    as fallback), a matching leading <h1> is dropped from the content so the
    title is not repeated, and the result is converted with markdownify.

    Args:
        url: Page address to fetch. A missing scheme defaults to https://;
            schemes other than http/https are rejected. May be empty or None.
        html_input: Raw HTML used when no URL is given. May be empty or None.

    Returns:
        The Markdown text, or a human-readable message (prefixed with an
        emoji) describing why no Markdown could be produced. This function
        reports failures through its return value instead of raising.
    """
    html_content = ""
    source = ""

    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML Content ---
        if url:
            normalized_url = _normalize_url(url)
            if normalized_url is None:
                # Prepending https:// to e.g. "ftp://host" would produce a
                # mangled URL pointing at an unintended host, so refuse early.
                return (
                    f"❌ Error: Unsupported URL scheme in `{url}`. "
                    "Only `http://` and `https://` URLs can be fetched."
                )
            if normalized_url != url:
                print(f"Scheme missing, prepended https://. New URL: {normalized_url}")
            url = normalized_url
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            # NOTE(review): the URL is user-supplied and fetched from the
            # server side; consider blocking private/internal addresses (SSRF)
            # if this runs somewhere with access to internal services.
            try:
                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Successfully fetched {len(html_content)} characters from URL.")
            except requests.exceptions.MissingSchema:
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"

        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Pre-cleaning before Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."
        print("Pre-cleaning HTML...")
        soup_pre = BeautifulSoup(html_content, 'html.parser')
        for tag in soup_pre(_NOISE_TAGS):
            tag.decompose()
        cleaned_html = str(soup_pre)  # Use this cleaned version going forward

        # --- Step 2: Extract Main Content and Title (using Readability) ---
        processed_html = cleaned_html  # Default to cleaned HTML
        readability_title = None

        print("Attempting to extract main content using Readability...")
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            summary_html = doc.summary()

            # Only trust the summary (and its title) if it contains real text.
            if BeautifulSoup(summary_html, 'html.parser').get_text().strip():
                processed_html = summary_html
                print(f"Readability extracted title: '{readability_title}'. Using summary.")
            else:
                print("Readability summary was empty. Falling back to cleaned full HTML.")
                readability_title = None
        except Exception as e:
            # Best effort: any readability failure falls back to the cleaned page.
            print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
            readability_title = None

        # --- Title Decision Logic ---
        final_title = None

        # Priority 1: Readability title, unless it is too short or a
        # bracketed placeholder (titles starting with '[' are rejected).
        if readability_title:
            candidate = _collapse_whitespace(readability_title)
            if len(candidate) > 3 and not candidate.startswith('['):
                final_title = candidate
                print(f"Using Readability title: '{final_title}'")

        # Priority 2: first H1 of the cleaned HTML. The already-parsed cleaned
        # tree is reused instead of parsing the same markup a second time.
        if not final_title:
            print("Readability title not suitable or not found. Looking for H1 fallback...")
            h1_tag = soup_pre.find('h1')
            if h1_tag is not None:
                # get_text(strip=True) would glue "Hello <b>World</b>" into
                # "HelloWorld"; collapse whitespace on the full text instead.
                h1_text = _collapse_whitespace(h1_tag.get_text())
                if h1_text:
                    final_title = h1_text
                    print(f"Using H1 fallback title: '{final_title}'")

        # --- Prevent Title Duplication in Content ---
        if final_title:
            print("Checking for title duplication in processed HTML (first H1)...")
            soup_proc = BeautifulSoup(processed_html, 'html.parser')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc is not None:
                h1_proc_text = _collapse_whitespace(first_h1_in_proc.get_text())
                # Both sides are whitespace-normalised, so line breaks or
                # inline tags inside the heading do not defeat the match.
                if h1_proc_text == final_title:
                    print(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    processed_html = str(soup_proc)

        # --- Step 3: Convert the Processed HTML to Markdown ---
        if not processed_html.strip():
            return "❓ The HTML content (after processing) appears to be empty."

        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(
                processed_html,
                heading_style="ATX",
                bullets='*'
            ).strip()
        except Exception:
            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

        # Assemble final output: the chosen title (if any) goes on top.
        if final_title:
            final_markdown = f"# {final_title}\n\n{markdown_output}"
        else:
            final_markdown = markdown_output
        final_markdown = final_markdown.strip()

        if not final_markdown:
            return "ℹ️ The conversion resulted in empty Markdown."

        return final_markdown

    except Exception:
        # Last-resort boundary: the UI expects a string, never an exception.
        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"


# --- Gradio user interface ---
# Static texts shown on the page: heading, intro blurb, and the explanatory
# article rendered below the widgets.
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works:**
1.  Fetches HTML from URL or uses pasted input.
2.  Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
3.  Uses `readability-lxml` to extract the main content and attempt to find a page title.
4.  **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
5.  **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
6.  Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
7.  Prepends the determined title (if any) to the final Markdown output.
8.  Displays the raw Markdown code in the output box with a copy button.
"""

# Two inputs, passed to the converter in this order: URL first, raw HTML second.
url_input = gr.Textbox(
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
    label="Enter URL (gets priority)",
)
html_input_area = gr.Textbox(
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
    lines=10,
    label="Or Paste HTML Code Here",
)

# Read-only text area for the result; a Textbox is used so the copy button is available.
markdown_output_textbox = gr.Textbox(
    show_copy_button=True,
    interactive=False,
    lines=20,
    label="Converted Markdown Code Output",
)

# Wire the converter function to the widgets. Each example row is
# [url, html]; the last one has no H1 to exercise the no-title path.
iface = gr.Interface(
    title=title,
    description=description,
    article=article,
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    examples=[
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"],
    ],
    cache_examples=False,
    allow_flagging="never",
)

# Launch the app.
# The guard ensures the Gradio server starts only when this file is executed
# as a script; importing the module merely defines `iface`.
if __name__ == "__main__":
    # Reminder: requirements.txt includes:
    # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
    iface.launch()