Update app.py
Browse files
app.py
CHANGED
@@ -12,11 +12,8 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
|
|
12 |
error_message = ""
|
13 |
pegasus_output = ""
|
14 |
|
15 |
-
|
16 |
-
input_url = 'https://www.example.com/path/to/page'
|
17 |
-
|
18 |
domain_pattern = r'^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)'
|
19 |
-
|
20 |
matched = re.match(domain_pattern, input_url)
|
21 |
if matched:
|
22 |
domain = matched.group(1)
|
@@ -30,10 +27,10 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
|
|
30 |
replaced_url = "combined_output"
|
31 |
combined_output_filename = replaced_url + ".txt"
|
32 |
|
33 |
-
|
34 |
try:
|
|
|
|
|
35 |
combined_output_path = os.path.join(output_dir, combined_output_filename)
|
36 |
-
os.makedirs(os.path.dirname(combined_output_path), exist_ok=True)
|
37 |
with open(combined_output_path, "w") as file:
|
38 |
file.write("")
|
39 |
|
@@ -45,7 +42,7 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
|
|
45 |
exclude_keywords = exclude_keywords.split(",") if exclude_keywords else []
|
46 |
|
47 |
pegasus = Pegasus(
|
48 |
-
output_dir=
|
49 |
exclude_selectors=exclude_selectors,
|
50 |
include_domain=include_domain,
|
51 |
exclude_keywords=exclude_keywords,
|
@@ -77,13 +74,12 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
|
|
77 |
# 標準出力を元に戻す
|
78 |
sys.stdout = stdout_backup
|
79 |
|
80 |
-
txt_files = [f for f in os.listdir(
|
81 |
combined_text = ""
|
82 |
for f in txt_files:
|
83 |
-
with open(os.path.join(
|
84 |
combined_text += file.read()
|
85 |
|
86 |
-
# combined_output_path = os.path.join(output_dir, "combined_output.txt")
|
87 |
with open(combined_output_path, "w") as file:
|
88 |
file.write(combined_text)
|
89 |
|
|
|
12 |
error_message = ""
|
13 |
pegasus_output = ""
|
14 |
|
|
|
|
|
|
|
15 |
domain_pattern = r'^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)'
|
16 |
+
# URLγγγγ‘γ€γ³γζ½εΊ
|
17 |
matched = re.match(domain_pattern, input_url)
|
18 |
if matched:
|
19 |
domain = matched.group(1)
|
|
|
27 |
replaced_url = "combined_output"
|
28 |
combined_output_filename = replaced_url + ".txt"
|
29 |
|
|
|
30 |
try:
|
31 |
+
output_subdir = os.path.join(output_dir, include_domain)
|
32 |
+
os.makedirs(output_subdir, exist_ok=True)
|
33 |
combined_output_path = os.path.join(output_dir, combined_output_filename)
|
|
|
34 |
with open(combined_output_path, "w") as file:
|
35 |
file.write("")
|
36 |
|
|
|
42 |
exclude_keywords = exclude_keywords.split(",") if exclude_keywords else []
|
43 |
|
44 |
pegasus = Pegasus(
|
45 |
+
output_dir=output_subdir,
|
46 |
exclude_selectors=exclude_selectors,
|
47 |
include_domain=include_domain,
|
48 |
exclude_keywords=exclude_keywords,
|
|
|
74 |
# 標準出力を元に戻す
|
75 |
sys.stdout = stdout_backup
|
76 |
|
77 |
+
txt_files = [f for f in os.listdir(output_subdir) if f.endswith(output_extension)]
|
78 |
combined_text = ""
|
79 |
for f in txt_files:
|
80 |
+
with open(os.path.join(output_subdir, f), "r") as file:
|
81 |
combined_text += file.read()
|
82 |
|
|
|
83 |
with open(combined_output_path, "w") as file:
|
84 |
file.write(combined_text)
|
85 |
|