tregu0458 committed on
Commit
8e1d1f4
Β·
verified Β·
1 Parent(s): b30439e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -10
app.py CHANGED
@@ -12,11 +12,8 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
12
  error_message = ""
13
  pegasus_output = ""
14
 
15
-
16
- input_url = 'https://www.example.com/path/to/page'
17
-
18
  domain_pattern = r'^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)'
19
- # URLγ‹γ‚‰γƒ‰γƒ‘γ‚€γƒ³γ‚’ζŠ½ε‡Ί
20
  matched = re.match(domain_pattern, input_url)
21
  if matched:
22
  domain = matched.group(1)
@@ -30,10 +27,10 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
30
  replaced_url = "combined_output"
31
  combined_output_filename = replaced_url + ".txt"
32
 
33
-
34
  try:
 
 
35
  combined_output_path = os.path.join(output_dir, combined_output_filename)
36
- os.makedirs(os.path.dirname(combined_output_path), exist_ok=True)
37
  with open(combined_output_path, "w") as file:
38
  file.write("")
39
 
@@ -45,7 +42,7 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
45
  exclude_keywords = exclude_keywords.split(",") if exclude_keywords else []
46
 
47
  pegasus = Pegasus(
48
- output_dir=output_dir,
49
  exclude_selectors=exclude_selectors,
50
  include_domain=include_domain,
51
  exclude_keywords=exclude_keywords,
@@ -77,13 +74,12 @@ def pegasus_web_crawler(input_url, output_dir, exclude_selectors, include_domain
77
  # ζ¨™ζΊ–ε‡ΊεŠ›γ‚’ε…ƒγ«ζˆ»γ™
78
  sys.stdout = stdout_backup
79
 
80
- txt_files = [f for f in os.listdir(os.path.join(output_dir, include_domain)) if f.endswith(output_extension)]
81
  combined_text = ""
82
  for f in txt_files:
83
- with open(os.path.join(output_dir, include_domain, f), "r") as file:
84
  combined_text += file.read()
85
 
86
- # combined_output_path = os.path.join(output_dir, "combined_output.txt")
87
  with open(combined_output_path, "w") as file:
88
  file.write(combined_text)
89
 
 
12
  error_message = ""
13
  pegasus_output = ""
14
 
 
 
 
15
  domain_pattern = r'^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)'
16
+ # URLγ‹γ‚‰γƒ‰γƒ‘γ‚€γƒ³γ‚’ζŠ½ε‡Ί
17
  matched = re.match(domain_pattern, input_url)
18
  if matched:
19
  domain = matched.group(1)
 
27
  replaced_url = "combined_output"
28
  combined_output_filename = replaced_url + ".txt"
29
 
 
30
  try:
31
+ output_subdir = os.path.join(output_dir, include_domain)
32
+ os.makedirs(output_subdir, exist_ok=True)
33
  combined_output_path = os.path.join(output_dir, combined_output_filename)
 
34
  with open(combined_output_path, "w") as file:
35
  file.write("")
36
 
 
42
  exclude_keywords = exclude_keywords.split(",") if exclude_keywords else []
43
 
44
  pegasus = Pegasus(
45
+ output_dir=output_subdir,
46
  exclude_selectors=exclude_selectors,
47
  include_domain=include_domain,
48
  exclude_keywords=exclude_keywords,
 
74
  # ζ¨™ζΊ–ε‡ΊεŠ›γ‚’ε…ƒγ«ζˆ»γ™
75
  sys.stdout = stdout_backup
76
 
77
+ txt_files = [f for f in os.listdir(output_subdir) if f.endswith(output_extension)]
78
  combined_text = ""
79
  for f in txt_files:
80
+ with open(os.path.join(output_subdir, f), "r") as file:
81
  combined_text += file.read()
82
 
 
83
  with open(combined_output_path, "w") as file:
84
  file.write(combined_text)
85