File size: 3,081 Bytes
64a0dbf
 
 
62ee9e4
64a0dbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9c42cf
64a0dbf
f9c42cf
64a0dbf
 
 
 
 
62ee9e4
 
64a0dbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9c42cf
62ee9e4
64a0dbf
 
 
 
 
 
 
 
b259fec
64a0dbf
 
 
f9c42cf
 
64a0dbf
 
 
62ee9e4
64a0dbf
 
 
 
f9c42cf
 
64a0dbf
 
 
 
 
 
b259fec
64a0dbf
 
b259fec
 
 
 
f9c42cf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import platform
import re
from pathlib import Path
from urllib.parse import quote, unquote


# What characters are forbidden in Windows and Linux directory names?
#   https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names

# Characters replaced with "_" when sanitizing a file or directory name.
# Order: path separators and Windows-reserved punctuation, then common
# whitespace escapes, then every ASCII control character (code points 0-31).
# The whitespace escapes also occur in the control-character range; the
# duplicates are harmless and kept so the list value is unchanged.
INVALID_FILE_PATH_CHARS = [
    *'\\/:*?"<>|',
    *"\n\t\r",
    *map(chr, range(32)),
]

# Lower-case base names that get a "_" suffix when sanitizing on Windows.
# NOTE(review): the numbered entries run 1..10 (com1..com10, lpt1..lpt10);
# confirm against the Windows naming rules whether the tenth entry is intended.
WINDOWS_INVALID_FILE_PATH_NAMES = ["con", "prn", "aux", "nul"] + [
    f"{device}{number}" for device in ("com", "lpt") for number in range(1, 11)
]


class FilepathConverter:
    """Convert an arbitrary string into a sanitized file path under an output root.

    The output root is ``Path(__file__).parents[1] / "files"``.  Conversion
    runs three steps in order: :meth:`preprocess`, :meth:`validate` and
    :meth:`append_extension`.  Subclasses may override :meth:`preprocess`
    and adjust ``output_root``.
    """

    def __init__(self, parent: str = None):
        """Store the default parent directory name used by :meth:`convert`.

        Args:
            parent: Optional sub-directory name placed between the output
                root and the file name.  It is sanitized at conversion time.
        """
        self.output_root = Path(__file__).parents[1] / "files"
        self.parent = parent
        # Filled in by convert(); initialised here so reading them before the
        # first conversion yields None instead of raising AttributeError.
        self.filename = None
        self.filepath = None

    def preprocess(self, input_string):
        """Return ``input_string`` unchanged (hook for subclasses)."""
        return input_string

    def validate(self, input_string):
        """Replace characters that are not allowed in file names with ``_``.

        Falsy input (``None`` or ``""``) is returned as-is.  When running on
        Windows, a name whose part before the first ``.`` matches an entry of
        ``WINDOWS_INVALID_FILE_PATH_NAMES`` (case-insensitively) gets ``_``
        appended to that part.

        Args:
            input_string: Candidate file or directory name.

        Returns:
            The sanitized name, or the original falsy value.
        """
        if not input_string:
            return input_string
        filename = input_string
        for char in INVALID_FILE_PATH_CHARS:
            filename = filename.replace(char, "_")
        if platform.system() == "Windows":
            # Only the part before the first dot decides whether the name is
            # reserved; everything after it is re-attached untouched.
            base, dot, rest = filename.partition(".")
            if base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
                filename = f"{base}_{dot}{rest}"
        return filename

    def append_extension(self, filename, accept_exts=(".html", ".htm"), ext=".html"):
        """Append ``ext`` unless ``filename`` already ends in an accepted extension.

        The current extension is the text after the last ``.`` (or the whole
        name when there is no dot), compared case-insensitively against
        ``accept_exts``.

        Args:
            filename: File name to check.
            accept_exts: Lower-case extensions (with leading dot) that need
                no further suffix.  The default is an immutable tuple so it
                cannot be modified across calls.
            ext: Extension to append; a falsy value disables appending.

        Returns:
            ``filename``, possibly with ``ext`` appended.
        """
        if not ext:
            return filename
        current_ext = "." + filename.rpartition(".")[2]
        if current_ext.lower() not in accept_exts:
            filename += ext
        return filename

    def convert(self, input_string, parent=None):
        """Build the output path for ``input_string``.

        Args:
            input_string: Raw string to turn into a file name.
            parent: Optional sub-directory name; falls back to the ``parent``
                given at construction time.  It is sanitized with
                :meth:`validate` before use.

        Returns:
            ``output_root / parent / filename`` when a parent is set,
            otherwise ``output_root / filename``.  The result is also stored
            on ``self.filepath`` and the file name on ``self.filename``.
        """
        filename = self.preprocess(input_string)
        filename = self.validate(filename)
        filename = self.append_extension(filename)

        parent = self.validate(parent or self.parent)
        if parent:
            filepath = self.output_root / parent / filename
        else:
            filepath = self.output_root / filename

        self.filename = filename
        self.filepath = filepath

        return self.filepath


class UrlToFilepathConverter(FilepathConverter):
    """Convert a URL into a sanitized file path under ``<output_root>/urls``."""

    def __init__(self, parent: str = None):
        """Initialise the base converter and point the output root at ``urls``.

        Args:
            parent: Optional sub-directory name, forwarded to the base class.
        """
        super().__init__(parent)
        self.output_root = self.output_root / "urls"

    def preprocess(self, url):
        """Strip everything up to the first ``//`` and percent-decode the rest.

        Only the first ``//`` is treated as the scheme separator, so a later
        ``//`` inside the path no longer truncates the result.  A string
        without any ``//`` is decoded as a whole instead of raising
        ``IndexError``.

        Args:
            url: URL string, e.g. ``"https://host/path"``.

        Returns:
            The percent-decoded remainder of the URL.
        """
        _, separator, remainder = url.partition("//")
        return unquote(remainder if separator else url)


class QueryToFilepathConverter(FilepathConverter):
    """Convert a search query into a file path under ``<output_root>/queries``."""

    def __init__(self, parent: str = None):
        """Set up the base converter, then nest its output root in ``queries``."""
        super().__init__(parent)
        base_root = self.output_root
        self.output_root = base_root / "queries"


if __name__ == "__main__":
    # Demo 1: print the path generated for a plain search query.
    sample_query = "python 教程"
    print(QueryToFilepathConverter().convert(sample_query))

    # Demo 2: print the path generated for a URL, filed under the query above.
    # Alternative sample URL:
    #   https://trafilatura.readthedocs.io/en/latest/quickstart.html
    sample_url = (
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
    )
    print(UrlToFilepathConverter(parent=sample_query).convert(sample_url))