|
import argparse
|
|
import os
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
from html2text import html2text
|
|
from pathlib import Path
|
|
|
|
def is_writable_path(target_path: str) -> str:
    """Validate that ``target_path`` can be used as an output file.

    Intended as an ``argparse`` ``type=`` callable: it returns the path
    unchanged when it is usable and raises ``ArgumentTypeError`` otherwise.

    Args:
        target_path: Path of the file that will be written.

    Returns:
        ``target_path`` exactly as given.

    Raises:
        argparse.ArgumentTypeError: If the path is empty, names an existing
            directory, names an existing file that is not writable, or its
            parent directory is missing or not writable.
    """
    if not target_path:
        raise argparse.ArgumentTypeError("Output path must not be empty.")

    target = Path(target_path)
    # A directory can never be opened for writing as a file, even when its
    # parent is writable, so reject it here instead of failing later in open().
    if target.is_dir():
        raise argparse.ArgumentTypeError(
            f"'{target_path}' is a directory, not a file path."
        )

    # os.path.dirname("name.md") is ""; Path("") normalizes to ".", i.e. the
    # current working directory.
    path = Path(os.path.dirname(target_path))
    if not path.is_dir():
        raise argparse.ArgumentTypeError(f"Directory '{path}' does not exist.")
    if not os.access(path, os.W_OK):
        raise argparse.ArgumentTypeError(f"Directory '{path}' is not writable.")

    # An existing read-only file would be truncated with "w"; a writable
    # parent directory alone does not make that possible.
    if target.exists() and not os.access(target, os.W_OK):
        raise argparse.ArgumentTypeError(f"File '{target_path}' is not writable.")

    return target_path
|
|
|
|
def main(url, markdown_path, *, image_dir="./logs", timeout=30):
    """Fetch a web page, save its images, and write the page as Markdown.

    The page at ``url`` is downloaded, every ``<img>`` with a ``src`` is
    fetched into ``image_dir``, and the page HTML is converted with
    ``html2text`` and written to ``markdown_path`` as UTF-8. Image handling is
    best effort: a failed download or save is reported and skipped.

    Args:
        url: Address of the page to convert.
        markdown_path: File that receives the Markdown output.
        image_dir: Directory for downloaded images; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the page itself cannot be fetched.
    """
    # Imported locally so this function does not rely on changes to the
    # file's import section.
    from urllib.parse import urlparse

    with requests.Session() as session:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        os.makedirs(image_dir, exist_ok=True)

        for index, image in enumerate(soup.find_all('img')):
            # <img> tags without src (e.g. lazy-loaded via data-src) are
            # skipped instead of raising KeyError.
            src = image.get('src')
            if not src:
                continue

            image_url = urljoin(url, src)
            # Name the file after the URL *path* only, so query strings and
            # fragments do not leak into the file name; fall back to a
            # positional name when the path ends in "/".
            image_name = os.path.basename(urlparse(image_url).path) or f"image_{index}"
            image_file = os.path.join(image_dir, image_name)

            try:
                image_response = session.get(image_url, timeout=timeout)
                image_response.raise_for_status()
            except requests.RequestException as e:
                print(f"Failed to download {image_url}: {e}")
                continue

            # A file-system problem with one image must not abort the whole
            # conversion.
            try:
                with open(image_file, 'wb') as file:
                    file.write(image_response.content)
            except OSError as e:
                print(f"Failed to save {image_file}: {e}")

    markdown_content = html2text(response.text)

    try:
        with open(markdown_path, "w", encoding="utf8") as file:
            file.write(markdown_content)
    except (OSError, UnicodeError) as e:
        print(f"Failed to write markdown to {markdown_path}: {e}")
    else:
        print(f"Markdown content successfully written to {markdown_path}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the page URL and the output file from the
    # command line, then run the conversion.
    cli = argparse.ArgumentParser(description="Convert HTML to Markdown")
    cli.add_argument("url", help="The URL of the webpage to convert")
    cli.add_argument(
        "markdown_path",
        type=is_writable_path,  # validated while parsing, before any download
        help="The path to save the converted markdown file",
    )
    options = cli.parse_args()

    main(options.url, options.markdown_path)
|
|
|