Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import requests | |
| from markdownify import markdownify | |
| import traceback # To help format potential errors | |
| from readability import Document | |
| from bs4 import BeautifulSoup | |
| import re # Import regex for potentially cleaning readability titles | |
# Settings shared by every outbound page fetch made with `requests`.
# DEFAULT_TIMEOUT is the per-request timeout in seconds; HEADERS identifies
# this tool to the remote server through its User-Agent string.
DEFAULT_TIMEOUT = 20
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
def _normalized_text(tag):
    """Return the text of *tag* with every whitespace run collapsed to one space.

    Unlike ``get_text(strip=True)``, this keeps the spaces that separate
    nested inline elements, so ``<h1>Hello <b>World</b></h1>`` yields
    ``"Hello World"`` rather than ``"HelloWorld"``.
    """
    return " ".join(tag.get_text().split())


def _fetch_html(url):
    """Download *url* and return an ``(html, error_message)`` pair.

    Exactly one element of the pair is ``None``: ``html`` on failure,
    ``error_message`` on success. Nothing is raised for request failures;
    they are turned into user-facing messages instead.
    """
    # NOTE(review): the URL comes straight from the UI, so this is a
    # server-side request to a user-chosen host. Consider blocking
    # private/internal addresses if the app is publicly reachable.
    try:
        response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
        response.raise_for_status()
        # Decode with the encoding detected from the body, defaulting to UTF-8.
        response.encoding = response.apparent_encoding or 'utf-8'
        html = response.text
    except requests.exceptions.MissingSchema:
        return None, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
    except requests.exceptions.Timeout:
        return None, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
    except requests.exceptions.RequestException as e:
        return None, f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
    except Exception:
        return None, f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
    else:
        # len() of a str counts characters, not bytes.
        print(f"Successfully fetched {len(html)} characters from URL.")
        return html, None


def _extract_main_content(cleaned_html):
    """Run Readability on *cleaned_html* and return ``(html, title)``.

    When Readability fails or produces a summary without any text, the
    cleaned HTML is returned unchanged together with a ``None`` title.
    """
    print("Attempting to extract main content using Readability...")
    try:
        doc = Document(cleaned_html)
        title = doc.title()
        summary = doc.summary()
        # Only trust the summary when it actually contains visible text.
        if BeautifulSoup(summary, 'html.parser').text.strip():
            print(f"Readability extracted title: '{title}'. Using summary.")
            return summary, title
        print("Readability summary was empty. Falling back to cleaned full HTML.")
    except Exception as e:
        # Best effort: any Readability failure degrades to the full cleaned page.
        print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
    return cleaned_html, None


def _choose_title(readability_title, cleaned_soup):
    """Pick the document title, or return ``None`` when none is usable.

    Priority 1 is the Readability title; priority 2 is the first ``<h1>``
    found in the cleaned page.
    """
    if readability_title:
        # Normalise before validating so padding cannot satisfy the length check.
        candidate = " ".join(readability_title.split())
        # A leading '[' presumably marks a Readability placeholder such as
        # "[no-title]" - TODO confirm against the readability-lxml docs.
        if len(candidate) > 3 and not candidate.startswith('['):
            print(f"Using Readability title: '{candidate}'")
            return candidate

    print("Readability title not suitable or not found. Looking for H1 fallback...")
    h1_tag = cleaned_soup.find('h1')
    if h1_tag:
        h1_text = _normalized_text(h1_tag)
        if h1_text:
            print(f"Using H1 fallback title: '{h1_text}'")
            return h1_text
    return None


def _drop_duplicate_h1(processed_html, final_title):
    """Return *processed_html* without its first ``<h1>`` if that repeats the title."""
    print("Checking for title duplication in processed HTML (first H1)...")
    soup = BeautifulSoup(processed_html, 'html.parser')
    first_h1 = soup.find('h1')
    if first_h1 and _normalized_text(first_h1) == final_title:
        print(f"Found matching H1 ('{final_title}') in content. Removing it to prevent duplication.")
        first_h1.decompose()
        return str(soup)
    return processed_html


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, to Markdown.

    The page is stripped of scripts, styles and navigation chrome, the main
    content is extracted with Readability, a title is chosen (Readability
    title first, first ``<h1>`` as fallback), a duplicate leading ``<h1>``
    is removed, and the result is converted with ``markdownify``.

    Args:
        url: Address to fetch. Takes priority over ``html_input``; a missing
            scheme is completed with ``https://``. May be empty or ``None``.
        html_input: Raw HTML used when no URL is given. May be empty or
            ``None``.

    Returns:
        The Markdown text, or a human-readable message (prefixed with an
        emoji marker) describing why no Markdown could be produced. The
        function reports failures through the return value and does not
        raise.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML Content ---
        if url:
            print(f"Attempting to fetch HTML from URL: {url}")
            # Compare case-insensitively so "HTTP://..." is not prefixed again.
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url
                print(f"Scheme missing, prepended https://. New URL: {url}")
            source = f"URL ({url})"
            html_content, error = _fetch_html(url)
            if error is not None:
                return error
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        if not html_content:
            return f"❓ No HTML content found from {source}."

        # --- Pre-cleaning before Readability ---
        print("Pre-cleaning HTML...")
        soup_pre = BeautifulSoup(html_content, 'html.parser')
        for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()
        cleaned_html = str(soup_pre)

        # --- Step 2: Extract main content and decide on a title ---
        processed_html, readability_title = _extract_main_content(cleaned_html)
        final_title = _choose_title(readability_title, soup_pre)
        if final_title:
            processed_html = _drop_duplicate_h1(processed_html, final_title)

        # --- Step 3: Convert the processed HTML to Markdown ---
        if not processed_html.strip():
            return "❓ The HTML content (after processing) appears to be empty."
        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(
                processed_html,
                heading_style="ATX",
                bullets='*',
            ).strip()
        except Exception:
            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

        # Prepend the chosen title as the document's top-level heading.
        if final_title:
            final_markdown = f"# {final_title}\n\n{markdown_output}"
        else:
            final_markdown = markdown_output
        if not final_markdown.strip():
            return "ℹ️ The conversion resulted in empty Markdown."
        return final_markdown.strip()
    except Exception:
        # UI boundary: surface the traceback to the user instead of crashing.
        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
# --- Gradio Interface (Standard) ---
# Static texts shown by the interface: the page heading, the introduction
# rendered above the inputs, and the explanatory article rendered below.
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works:**
1. Fetches HTML from URL or uses pasted input.
2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
7. Prepends the determined title (if any) to the final Markdown output.
8. Displays the raw Markdown code in the output box with a copy button.
"""
# Input components: a single-line URL field and a multi-line box for raw HTML.
# The converter uses the URL whenever one is supplied, as the label states.
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
)
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
)

# Output component: a read-only text box with a copy button, so the raw
# Markdown source is shown rather than rendered.
markdown_output_textbox = gr.Textbox(
    label="Converted Markdown Code Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)
# Create the standard Gradio interface: two text inputs feed the converter
# and its returned string is shown in the output text box.
# NOTE(review): `allow_flagging` is a version-sensitive Gradio keyword -
# confirm it against the Gradio release pinned for this app.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    # Each example is a [url, html] pair matching the order of `inputs`.
    examples=[
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        # Example without an H1 to exercise the no-title scenario
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"],
    ],
    cache_examples=False,
)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # Reminder: requirements.txt includes:
    # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
    iface.launch()