Spaces:

13ze
/

smart-scrape-html-to-md

Runtime error

App Files Files Community

smart-scrape-html-to-md / app.py

13ze

Update app.py

9d2b078 verified 7 months ago

raw

history blame contribute delete

10.4 kB

	import gradio as gr
	import requests
	from markdownify import markdownify
	import traceback # To help format potential errors
	from readability import Document
	from bs4 import BeautifulSoup
	import re # Import regex for potentially cleaning readability titles

	# Settings shared by every outbound page fetch made with `requests`.
	DEFAULT_TIMEOUT = 20  # seconds; also quoted in the timeout error message
	HEADERS = {
	    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
	}

	# Tags dropped before content extraction (scripts, styling and page chrome).
	_BOILERPLATE_TAGS = ['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']


	def _collapse_whitespace(text):
	    """Collapse every whitespace run in *text* to one space and trim both ends."""
	    return " ".join(text.split())


	def _fetch_html(url):
	    """Download *url* and return the decoded response body.

	    Raises whatever `requests` raises (including HTTP error statuses via
	    `raise_for_status`); the caller turns those into user-facing messages.
	    """
	    # NOTE(review): the URL comes straight from the user and redirects are
	    # followed, so this performs server-side requests to arbitrary hosts.
	    # Consider restricting targets if the app is exposed publicly.
	    response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
	    response.raise_for_status()
	    # Trust a charset the server explicitly declared; only fall back to the
	    # (slower, heuristic) detected encoding when none was declared.
	    declared_type = response.headers.get('Content-Type', '')
	    if 'charset' not in declared_type.lower():
	        response.encoding = response.apparent_encoding or 'utf-8'
	    return response.text


	def _strip_boilerplate(html_content):
	    """Parse *html_content* and remove every tag listed in _BOILERPLATE_TAGS."""
	    soup = BeautifulSoup(html_content, 'html.parser')
	    for tag in soup(_BOILERPLATE_TAGS):
	        tag.decompose()
	    return soup


	def _extract_main_content(cleaned_html):
	    """Run Readability on *cleaned_html*; return (html_to_convert, title_or_None).

	    Falls back to (cleaned_html, None) when Readability fails or yields a
	    summary with no text.
	    """
	    print("Attempting to extract main content using Readability...")
	    try:
	        doc = Document(cleaned_html)
	        title = doc.title()
	        summary_html = doc.summary()
	        if BeautifulSoup(summary_html, 'html.parser').text.strip():
	            print(f"Readability extracted title: '{title}'. Using summary.")
	            return summary_html, title
	        print("Readability summary was empty. Falling back to cleaned full HTML.")
	    except Exception as e:
	        # Deliberate best effort: any extraction failure degrades to the
	        # full cleaned document instead of aborting the conversion.
	        print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
	    return cleaned_html, None


	def _choose_title(readability_title, cleaned_soup):
	    """Pick the output title: a usable Readability title, else the first H1, else None."""
	    if readability_title:
	        # Normalise first so the length check sees the real title and a
	        # multi-line title cannot break the single-line Markdown heading.
	        candidate = _collapse_whitespace(readability_title)
	        # Titles starting with '[' are treated as placeholders, not real titles.
	        if len(candidate) > 3 and not candidate.startswith('['):
	            print(f"Using Readability title: '{candidate}'")
	            return candidate

	    print("Readability title not suitable or not found. Looking for H1 fallback...")
	    h1_tag = cleaned_soup.find('h1')
	    if h1_tag is not None:
	        # get_text(strip=True) would glue "Hello " + "World" into
	        # "HelloWorld"; take the raw text and collapse whitespace instead.
	        h1_text = _collapse_whitespace(h1_tag.get_text())
	        if h1_text:
	            print(f"Using H1 fallback title: '{h1_text}'")
	            return h1_text
	    return None


	def _drop_duplicate_h1(processed_html, title):
	    """Return *processed_html* without its first H1 when that H1 repeats *title*."""
	    print(f"Checking for title duplication in processed HTML (first H1)...")
	    soup = BeautifulSoup(processed_html, 'html.parser')
	    first_h1 = soup.find('h1')
	    if first_h1 is None:
	        return processed_html
	    h1_text = _collapse_whitespace(first_h1.get_text())
	    if h1_text != title:
	        return processed_html
	    print(f"Found matching H1 ('{h1_text}') in content. Removing it to prevent duplication.")
	    first_h1.decompose()
	    return str(soup)


	def html_to_markdown_converter(url: str, html_input: str) -> str:
	    """Convert HTML (fetched from a URL or pasted directly) to Markdown.

	    Args:
	        url: Page address to fetch. Takes priority over *html_input*; a
	            missing http/https scheme is replaced by a prepended `https://`.
	            May be empty or None.
	        html_input: Raw HTML used when *url* is empty. May be empty or None.

	    Returns:
	        The Markdown text, prefixed with a `# title` heading when a title
	        could be determined (Readability title first, first `<h1>` of the
	        cleaned page as fallback; a first `<h1>` in the content equal to
	        the title is removed to avoid duplication). On any failure, or when
	        there is nothing to convert, a human-readable message starting with
	        an emoji marker is returned instead; this function does not raise.
	    """
	    use_readability = True

	    url = url.strip() if url else ""
	    html_input = html_input.strip() if html_input else ""

	    try:
	        # --- Step 1: Get HTML Content ---
	        if url:
	            print(f"Attempting to fetch HTML from URL: {url}")
	            # Case-insensitive so "HTTP://..." is not mistaken for scheme-less.
	            if not url.lower().startswith(('http://', 'https://')):
	                url = 'https://' + url
	                print(f"Scheme missing, prepended https://. New URL: {url}")
	            source = f"URL ({url})"
	            try:
	                html_content = _fetch_html(url)
	                print(f"Successfully fetched {len(html_content)} bytes from URL.")
	            except requests.exceptions.MissingSchema:
	                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
	            except requests.exceptions.Timeout:
	                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
	            except requests.exceptions.RequestException as e:
	                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
	            except Exception:
	                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
	        elif html_input:
	            source = "Direct HTML Input"
	            print(f"Using direct HTML input ({len(html_input)} bytes).")
	            html_content = html_input
	        else:
	            return "❓ Please provide a URL or paste HTML content in the fields above."

	        # --- Pre-cleaning before Readability ---
	        if not html_content:
	            return f"❓ No HTML content found from {source}."
	        print("Pre-cleaning HTML...")
	        cleaned_soup = _strip_boilerplate(html_content)
	        cleaned_html = str(cleaned_soup)

	        # --- Step 2: Extract main content and decide on a title ---
	        processed_html = cleaned_html
	        readability_title = None
	        if use_readability:
	            processed_html, readability_title = _extract_main_content(cleaned_html)

	        # The already-parsed cleaned tree is reused for the H1 fallback.
	        final_title = _choose_title(readability_title, cleaned_soup)
	        if final_title:
	            processed_html = _drop_duplicate_h1(processed_html, final_title)

	        # --- Step 3: Convert the Processed HTML to Markdown ---
	        if not processed_html.strip():
	            return "❓ The HTML content (after processing) appears to be empty."

	        print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
	        try:
	            markdown_output = markdownify(
	                processed_html,
	                heading_style="ATX",
	                bullets='*',
	            ).strip()

	            if final_title:
	                final_markdown = f"# {final_title}\n\n{markdown_output}"
	            else:
	                final_markdown = markdown_output

	            if not final_markdown.strip():
	                return "ℹ️ The conversion resulted in empty Markdown."

	            return final_markdown.strip()
	        except Exception:
	            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

	    except Exception:
	        # Top-level boundary for the UI callback: report instead of raising.
	        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"


	# --- Gradio Interface (Standard) ---
	# Static texts handed to gr.Interface below as its `title`, `description`
	# and `article` arguments.

	# Heading text for the interface.
	title = "HTML to Markdown Converter (Smart Extraction)"
	# Short usage instructions for the interface.
	description = """
	Enter a URL or paste HTML code directly into the text box below.
	The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
	The resulting Markdown code is displayed below. Use the copy icon (📋) in the output box to copy the code.
	"""
	# Longer step-by-step explanation of the conversion pipeline.
	article = """
	How it works:
	1. Fetches HTML from URL or uses pasted input.
	2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
	3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
	4. Title Logic: Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
	5. Deduplication: If a title is determined, the tool checks if the first `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag before conversion to prevent the title appearing twice.
	6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
	7. Prepends the determined title (if any) to the final Markdown output.
	8. Displays the raw Markdown code in the output box with a copy button.
	"""

	# --- Input widgets: a single-line URL box and a multi-line HTML box ---
	url_input = gr.Textbox(
	    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
	    label="Enter URL (gets priority)",
	)
	html_input_area = gr.Textbox(
	    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
	    label="Or Paste HTML Code Here",
	    lines=10,
	)

	# --- Output widget: read-only text box with a copy button ---
	markdown_output_textbox = gr.Textbox(
	    show_copy_button=True,
	    interactive=False,
	    lines=20,
	    label="Converted Markdown Code Output",
	)

	# --- Wire the converter function to the widgets ---
	iface = gr.Interface(
	    title=title,
	    description=description,
	    article=article,
	    fn=html_to_markdown_converter,
	    inputs=[url_input, html_input_area],
	    outputs=markdown_output_textbox,
	    # Each example row is [url, html]; exactly one of the two is filled in.
	    examples=[
	        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
	        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
	        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
	        # HTML without any H1, to exercise the no-title path.
	        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"],
	    ],
	    cache_examples=False,
	    allow_flagging='never',
	)

	# Launch the interface only when this file is run as a script.
	if __name__ == "__main__":
	    # Packages expected in requirements.txt:
	    # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
	    iface.launch()