Update app.py
app.py (CHANGED)
@@ -57,46 +57,44 @@ def process_url(url, link_types):
         jsonl_path = f"{output_file}.jsonl"
 
         try:
-            # Perform the crawl using advertools
             if not safe_crawl(url, jsonl_path):
                 return "", "Crawl failed or timed out"
-
-            # Read the crawl results
             crawl_df = pd.read_json(jsonl_path, lines=True)
-
-            # Extract title and meta description
-            title = crawl_df['title'].iloc[0] if not pd.isna(crawl_df['title'].iloc[0]) else "Untitled"
-            meta_desc = crawl_df['meta_desc'].iloc[0] if not pd.isna(crawl_df['meta_desc'].iloc[0]) else ""
-
-            all_links = []
-
-            # Process links based on the selected types
-            if link_types and "All links" not in link_types:
-                for link_type in link_types:
-                    type_match = re.findall(r"header|footer|nav", link_type)
-                    if type_match:
-                        link_content = explode_link_df(crawl_df, type_match[0])
-                        if link_content:
-                            all_links.append(link_content)
-                            all_links.append('\n\n')
-            else:
-                # Process all links using advertools
-                link_df = adv.crawlytics.links(crawl_df)
-                for link, text in link_df[['link', 'text']].values:
-                    if text and text.strip():
-                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                        text = re.sub(r"\s{3,}", " ", text)
-                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
-
-            # Generate the final Markdown content (llms.txt)
-            links_text = "\n\n".join(all_links)
-            final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-
         finally:
-            # Cleanup the temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
+
+        if crawl_df.empty:
+            return "", "Crawl produced no data for the URL."
+
+        # Use default values if the expected columns are missing or empty
+        title = "Untitled"
+        meta_desc = ""
+        if 'title' in crawl_df.columns and not pd.isna(crawl_df['title'].iloc[0]):
+            title = crawl_df['title'].iloc[0]
+        if 'meta_desc' in crawl_df.columns and not pd.isna(crawl_df['meta_desc'].iloc[0]):
+            meta_desc = crawl_df['meta_desc'].iloc[0]
 
+        all_links = []
+        if link_types and "All links" not in link_types:
+            for link_type in link_types:
+                type_match = re.findall(r"header|footer|nav", link_type)
+                if type_match:
+                    link_content = explode_link_df(crawl_df, type_match[0])
+                    if link_content:
+                        all_links.append(link_content)
+                        all_links.append('\n\n')
+        else:
+            # Process all links using advertools
+            link_df = adv.crawlytics.links(crawl_df)
+            for link, text in link_df[['link', 'text']].values:
+                if text and text.strip():
+                    text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                    text = re.sub(r"\s{3,}", " ", text)
+                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        links_text = "\n\n".join(all_links)
+        final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
         return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
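
A side note on the link-text cleanup that this change keeps as-is: re.sub's signature is re.sub(pattern, repl, string, count=0, flags=0), so writing re.sub(r"\n+", " ", text.strip(), re.DOTALL) passes re.DOTALL as the positional count argument rather than as a flag (and DOTALL only changes how "." matches, so it is not needed for a "\n+" pattern anyway). Below is a minimal sketch of the presumably intended cleanup, offered as an illustration and not part of this commit; clean_link_text is a hypothetical helper name.

    import re

    def clean_link_text(text: str) -> str:
        # Collapse runs of newlines into single spaces. No flags are needed here;
        # if flags ever are, pass them as flags=... so they are not silently
        # interpreted as a substitution count.
        text = re.sub(r"\n+", " ", text.strip())
        # Squeeze any remaining runs of 3+ whitespace characters into one space.
        return re.sub(r"\s{3,}", " ", text)

    print(clean_link_text("Read\n\nmore   about    this"))  # -> "Read more about this"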