File size: 2,507 Bytes
fb83c5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import argparse
import os
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from html2text import html2text

def is_writable_path(target_path):
    """Validate that a file can be created at *target_path*.

    Intended for use as an ``argparse`` ``type=`` callable: the parent
    directory of ``target_path`` must exist and be writable, and
    ``target_path`` itself must not be an existing directory.

    Args:
        target_path: Path string of the file that is going to be written.
            A bare file name is checked against the current directory.

    Returns:
        ``target_path`` unchanged when validation succeeds.

    Raises:
        argparse.ArgumentTypeError: If ``target_path`` is an existing
            directory, or if its parent directory is missing or not
            writable.
    """
    # An existing directory can never be opened for writing as a file;
    # reject it up front instead of failing later at open() time.
    if os.path.isdir(target_path):
        raise argparse.ArgumentTypeError(
            f"'{target_path}' is a directory, not a file path."
        )

    # dirname() is "" for a bare file name; Path("") normalises to ".".
    parent = Path(os.path.dirname(target_path))
    if not parent.is_dir():
        raise argparse.ArgumentTypeError(f"Directory '{parent}' does not exist.")
    if not os.access(parent, os.W_OK):
        raise argparse.ArgumentTypeError(f"Directory '{parent}' is not writable.")
    return target_path

def _image_filename(image_url, index):
    """Return the local file name to use for the image at *image_url*.

    Only the path component of the URL is considered, so a query string or
    fragment (``pic.png?v=2``) does not end up in the file name. When the
    path has no final segment (for example it ends in ``/``), a positional
    fallback name is returned instead of an empty string.
    """
    name = os.path.basename(urlsplit(image_url).path)
    return name or f"image_{index}"


def main(url, markdown_path, image_dir="./logs", timeout=30):
    """Fetch a web page, download its images and save it as markdown.

    Args:
        url: Address of the page to fetch.
        markdown_path: File path the converted markdown is written to.
        image_dir: Directory the page's images are saved into; created if
            it does not exist.
        timeout: Timeout in seconds passed to every HTTP request.

    Raises:
        requests.RequestException: If the page itself cannot be fetched or
            answers with an HTTP error status. Failures for individual
            images or for writing the markdown file are printed instead
            of raised.
    """
    with requests.Session() as session:
        # Fetch the page; without a timeout a stalled server would block
        # the script indefinitely.
        response = session.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP issues

        soup = BeautifulSoup(response.text, 'html.parser')

        # Ensure the directory for saving images exists
        os.makedirs(image_dir, exist_ok=True)

        for index, image in enumerate(soup.find_all('img')):
            # <img> tags without a src attribute have nothing to download.
            src = image.get('src')
            if not src:
                continue

            image_url = urljoin(url, src)
            # Inline sources such as data: URIs cannot be fetched over HTTP.
            if urlsplit(image_url).scheme not in ("http", "https"):
                continue

            image_name = os.path.join(image_dir, _image_filename(image_url, index))
            try:
                # Stream the body to disk in chunks and release the
                # connection as soon as the download is finished.
                with session.get(image_url, stream=True, timeout=timeout) as image_response:
                    image_response.raise_for_status()
                    with open(image_name, 'wb') as file:
                        for chunk in image_response.iter_content(chunk_size=65536):
                            file.write(chunk)
            except (requests.RequestException, OSError) as e:
                # Best effort: one broken image must not abort the run.
                print(f"Failed to download {image_url}: {e}")

        # Convert the HTML content to markdown
        markdown_content = html2text(response.text)

    # Save the markdown content to a file
    try:
        with open(markdown_path, "w", encoding="utf8") as file:
            file.write(markdown_content)
    except (OSError, UnicodeError) as e:
        print(f"Failed to write markdown to {markdown_path}: {e}")
    else:
        print(f"Markdown content successfully written to {markdown_path}")

if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the page URL and
    # the markdown output path (validated by is_writable_path at parse time).
    cli = argparse.ArgumentParser(description="Convert HTML to Markdown")
    cli.add_argument("url", help="The URL of the webpage to convert")
    cli.add_argument(
        "markdown_path",
        type=is_writable_path,
        help="The path to save the converted markdown file",
    )
    options = cli.parse_args()

    main(options.url, options.markdown_path)