Hansimov committed on
Commit
64a0dbf
1 Parent(s): e448a74

:gem: [Feature] New FilepathConverter: convert urls and queries to valid file path

Browse files
Files changed (1) hide show
  1. networks/filepath_converter.py +104 -0
networks/filepath_converter.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+ import re
3
+ from pathlib import Path
4
+ from urllib.parse import quote
5
+
6
+
7
+ # What characters are forbidden in Windows and Linux directory names?
8
+ # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
9
+
10
# Characters that get replaced when a string is turned into a file name:
# the printable characters Windows rejects in file names (this set includes
# "/", which Linux rejects as well), followed by every ASCII control
# character (code points 0-31). The control range already contains "\n",
# "\t" and "\r", so they are not listed a second time.
INVALID_FILE_PATH_CHARS = [
    "\\",
    "/",
    ":",
    "*",
    "?",
    '"',
    "<",
    ">",
    "|",
    *[chr(code_point) for code_point in range(32)],
]
25
+
26
# Device names that Windows reserves as file names, stored lower-cased so
# callers can compare against `name.lower()`.
# The numbered devices are COM0-COM9 and LPT0-LPT9 plus the superscript
# variants (¹, ², ³); "com10"/"lpt10" are ordinary, legal names and are
# therefore intentionally absent.
WINDOWS_INVALID_FILE_PATH_NAMES = [
    "con",
    "prn",
    "aux",
    "nul",
    *[f"com{suffix}" for suffix in "0123456789¹²³"],
    *[f"lpt{suffix}" for suffix in "0123456789¹²³"],
]
34
+
35
+
36
class FilepathConverter:
    """Convert an arbitrary string into a safe file name and output path.

    The conversion pipeline is ``preprocess`` -> ``validate`` ->
    ``append_extension``. Subclasses may override ``preprocess`` and adjust
    ``output_root``. After ``convert`` runs, its latest result is also kept
    on ``self.filename`` and ``self.filepath``.
    """

    def __init__(self):
        # Base output directory: the "files" folder located one level above
        # the directory that contains this module.
        self.output_root = Path(__file__).parents[1] / "files"

    def preprocess(self, input_string):
        """Hook for subclasses; the base implementation returns the input as is."""
        return input_string

    def validate(self, input_string):
        """Return ``input_string`` with characters unsafe for file names replaced.

        Every character listed in ``INVALID_FILE_PATH_CHARS`` becomes ``"_"``.
        On Windows only, when the part before the first ``"."`` is one of
        ``WINDOWS_INVALID_FILE_PATH_NAMES`` (compared case-insensitively),
        ``"_"`` is appended to that part.
        """
        # A single translation pass replaces one str.replace call per character.
        replacements = str.maketrans(dict.fromkeys(INVALID_FILE_PATH_CHARS, "_"))
        filename = input_string.translate(replacements)

        if platform.system() == "Windows":
            base, dot, rest = filename.partition(".")
            if base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
                filename = f"{base}_{dot}{rest}"
        return filename

    def append_extension(self, filename, accept_exts=(".html", ".htm"), ext=".html"):
        """Append ``ext`` unless ``filename`` already ends with an accepted one.

        Args:
            filename: File name to check.
            accept_exts: Lower-case extensions (with leading dot) that count
                as already correct. Any container supporting ``in`` works.
            ext: Extension to append; a falsy value disables appending.

        Returns:
            ``filename``, possibly with ``ext`` appended.
        """
        if ext:
            # Text after the last "."; the whole name when there is no ".".
            current_ext = "." + filename.rpartition(".")[-1]
            if current_ext.lower() not in accept_exts:
                filename += ext
        return filename

    def convert(self, input_string, parent=None):
        """Build the file name and full path for ``input_string``.

        Args:
            input_string: Raw string to convert.
            parent: Optional sub-path inserted between ``output_root`` and
                the file name. It is used as given (not sanitized).

        Returns:
            A dict with keys ``"filename"`` (str) and ``"filepath"`` (Path).
        """
        filename = self.preprocess(input_string)
        filename = self.validate(filename)
        filename = self.append_extension(filename)

        directory = self.output_root / parent if parent else self.output_root

        self.filename = filename
        self.filepath = directory / filename

        return {"filename": self.filename, "filepath": self.filepath}
75
+
76
+
77
class UrlToFilepathConverter(FilepathConverter):
    """Convert URLs into file paths stored under the ``urls`` subfolder."""

    def __init__(self):
        super().__init__()
        self.output_root = self.output_root / "urls"

    def preprocess(self, url):
        """Return ``url`` without its leading ``scheme://`` (or bare ``//``).

        Only a prefix at the very start (after optional whitespace) is
        removed, so a ``//`` occurring later in the path or query no longer
        truncates the result, and a URL without any ``//`` is returned
        unchanged instead of raising ``IndexError``.
        """
        return re.sub(r"^\s*(?:[A-Za-z][A-Za-z0-9+.-]*:)?//", "", url, count=1)
85
+
86
+
87
class QueryToFilepathConverter(FilepathConverter):
    """Convert search queries into file paths stored under ``queries``."""

    def __init__(self):
        super().__init__()
        # Narrow the inherited root to the query-specific subfolder.
        self.output_root /= "queries"
91
+
92
+
93
if __name__ == "__main__":
    # Demo 1: turn a search query into a file name.
    sample_query = "python 教程"
    query_result = QueryToFilepathConverter().convert(sample_query)
    print(query_result["filename"])

    # Demo 2: turn a URL into a full path nested under the query's folder.
    # Other URLs that can be substituted here:
    #   https://trafilatura.readthedocs.io/en/latest/quickstart.html
    #   https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename
    sample_url = "https://www.bing.com/search?q=Bing+AI&showconv=1&setlang=en&cc=us"
    url_result = UrlToFilepathConverter().convert(sample_url, parent=sample_query)
    print(url_result["filepath"])