import re  # Import regex for potentially cleaning readability titles
import traceback  # To help format potential errors
from typing import Optional, Tuple

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from readability import Document

# Configure requests with a timeout and user-agent
DEFAULT_TIMEOUT = 20
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}

# Set to False to skip Readability and convert the whole (pre-cleaned) page.
USE_READABILITY = True

# Tags removed before content extraction (aggressive cleaning of page chrome).
PRE_CLEAN_TAGS = ['script', 'style', 'iframe', 'svg', 'noscript',
                  'header', 'footer', 'nav', 'aside']


def _normalize_text(text: Optional[str]) -> str:
    """Collapse every run of whitespace in *text* to one space and trim it.

    Returns an empty string for ``None`` or empty input.
    """
    return " ".join(text.split()) if text else ""


def _preclean_html(html_content: str) -> str:
    """Return *html_content* with every tag listed in PRE_CLEAN_TAGS removed."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup(PRE_CLEAN_TAGS):
        tag.decompose()
    return str(soup)


def _extract_main_content(cleaned_html: str) -> Tuple[str, Optional[str]]:
    """Run Readability on *cleaned_html*.

    Returns ``(html, title)``. When Readability raises or its summary holds no
    text, the result is ``(cleaned_html, None)`` so the caller falls back to
    the full cleaned page and discards the Readability title.
    """
    print("Attempting to extract main content using Readability...")
    try:
        doc = Document(cleaned_html)
        readability_title = doc.title()
        summary_html = doc.summary()
        # Only trust the summary if it actually contains some text.
        if BeautifulSoup(summary_html, 'html.parser').text.strip():
            print(f"Readability extracted title: '{readability_title}'. Using summary.")
            return summary_html, readability_title
        print("Readability summary was empty. Falling back to cleaned full HTML.")
    except Exception as exc:
        # Best-effort step: any Readability failure falls back to the cleaned page.
        print(f"Readability processing failed: {exc}. Falling back to cleaned full HTML.")
    return cleaned_html, None


def _choose_title(readability_title: Optional[str], cleaned_html: str) -> Optional[str]:
    """Pick the title to prepend to the Markdown output, or ``None``.

    Priority 1 is the Readability title, accepted when (after whitespace
    normalisation) it is longer than 3 characters and does not start with
    ``[``. Priority 2 is the text of the first ``<h1>`` in *cleaned_html*.
    """
    candidate = _normalize_text(readability_title)
    if len(candidate) > 3 and not candidate.startswith('['):  # Basic check for valid title
        print(f"Using Readability title: '{candidate}'")
        return candidate

    print("Readability title not suitable or not found. Looking for H1 fallback...")
    h1_tag = BeautifulSoup(cleaned_html, 'html.parser').find('h1')
    if h1_tag is not None:
        # get_text(strip=True) would glue "Hello <b>World</b>" into
        # "HelloWorld"; normalise the unstripped text instead.
        h1_text = _normalize_text(h1_tag.get_text())
        if h1_text:
            print(f"Using H1 fallback title: '{h1_text}'")
            return h1_text
    return None


def _remove_duplicate_title_h1(processed_html: str, final_title: str) -> str:
    """Drop the first ``<h1>`` of *processed_html* if its text equals *final_title*.

    Both sides are compared after whitespace normalisation. Returns the HTML
    unchanged when there is no ``<h1>`` or its text differs from the title.
    """
    print("Checking for title duplication in processed HTML (first H1)...")
    soup = BeautifulSoup(processed_html, 'html.parser')
    first_h1 = soup.find('h1')
    if first_h1 is None:
        return processed_html

    h1_text = _normalize_text(first_h1.get_text())
    if h1_text != final_title:
        return processed_html

    print(f"Found matching H1 ('{h1_text}') in content. Removing it to prevent duplication.")
    first_h1.decompose()
    return str(soup)


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Converts HTML (from URL or direct input) to Markdown.

    Attempts to extract main content using readability.
    Uses readability title, falls back to first H1 if needed, and prevents
    duplication of that title in the converted body.

    Args:
        url: Page address to fetch. Takes priority over *html_input* when
            non-empty; ``https://`` is prepended if no http(s) scheme is given.
        html_input: Raw HTML used when *url* is empty.

    Returns:
        The resulting Markdown string, or a human-readable message (prefixed
        with an emoji) describing why no Markdown could be produced. Errors
        are reported through the return value rather than raised.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML Content ---
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                    print(f"Scheme missing, prepended https://. New URL: {url}")
                # NOTE(security): the URL is user-supplied and fetched from the
                # server with no host restrictions; restrict targets if this
                # runs somewhere with access to internal services.
                response = requests.get(
                    url,
                    timeout=DEFAULT_TIMEOUT,
                    headers=HEADERS,
                    allow_redirects=True,
                )
                response.raise_for_status()
                # Decode with the detected encoding, defaulting to UTF-8.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Successfully fetched {len(html_content)} characters from URL.")
            except requests.exceptions.MissingSchema:
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Pre-cleaning before Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        print("Pre-cleaning HTML...")
        cleaned_html = _preclean_html(html_content)  # Use this cleaned version going forward

        # --- Step 2: Extract Main Content and Title (using Readability) ---
        processed_html = cleaned_html  # Default to cleaned HTML
        readability_title = None
        if USE_READABILITY:
            processed_html, readability_title = _extract_main_content(cleaned_html)

        # --- Title Decision Logic ---
        final_title = _choose_title(readability_title, cleaned_html)

        # --- Prevent Title Duplication in Content ---
        if final_title:
            processed_html = _remove_duplicate_title_h1(processed_html, final_title)

        # --- Step 3: Convert the Processed HTML to Markdown ---
        if not processed_html.strip():
            return "❓ The HTML content (after processing) appears to be empty."

        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(
                processed_html,
                heading_style="ATX",
                bullets='*',
            ).strip()  # Strip whitespace from markdown output

            # Assemble final output: prepend the decided title if one exists.
            if final_title:
                final_markdown = f"# {final_title}\n\n{markdown_output}"
            else:
                final_markdown = markdown_output

            if not final_markdown.strip():
                return "ℹ️ The conversion resulted in empty Markdown."

            return final_markdown.strip()  # Return final cleaned string
        except Exception:
            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

    except Exception:
        # Top-level boundary: report any unexpected failure to the UI.
        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"


# --- Gradio Interface (Standard) ---
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
"""
# NOTE(review): the `<h1>` tag names below were reconstructed; confirm the wording.
article = """
**How it works:**
1. Fetches HTML from URL or uses pasted input.
2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
7. Prepends the determined title (if any) to the final Markdown output.
8. Displays the raw Markdown code in the output box with a copy button.
"""

# Define input components
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown"
)
# NOTE(review): the HTML tags in this placeholder were reconstructed; confirm.
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>"
)

# Define output component as Textbox
markdown_output_textbox = gr.Textbox(
    label="Converted Markdown Code Output",
    lines=20,
    interactive=False,
    show_copy_button=True
)

# Create the standard Gradio interface
# NOTE(review): the HTML tags in the two inline-HTML examples were
# reconstructed; confirm them against the intended samples.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n"],
        # Add an example without H1 to test no-title scenario
        ["", "<p>Um parágrafo sem título H1.</p><div>Outro conteúdo.</div>"]
    ],
    cache_examples=False
)

# Launch the app
if __name__ == "__main__":
    # Reminder: requirements.txt includes:
    # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
    iface.launch()