cyberandy committed
Commit 2b3088a · verified · 1 Parent(s): 23c0637

Update app.py

Files changed (1)
  1. app.py +31 -33
app.py CHANGED
@@ -57,46 +57,44 @@ def process_url(url, link_types):
         jsonl_path = f"{output_file}.jsonl"
 
         try:
-            # Perform the crawl using advertools
             if not safe_crawl(url, jsonl_path):
                 return "", "Crawl failed or timed out"
-
-            # Read the crawl results
             crawl_df = pd.read_json(jsonl_path, lines=True)
-
-            # Extract title and meta description
-            title = crawl_df['title'].iloc[0] if not pd.isna(crawl_df['title'].iloc[0]) else "Untitled"
-            meta_desc = crawl_df['meta_desc'].iloc[0] if not pd.isna(crawl_df['meta_desc'].iloc[0]) else ""
-
-            all_links = []
-
-            # Process links based on the selected types
-            if link_types and "All links" not in link_types:
-                for link_type in link_types:
-                    type_match = re.findall(r"header|footer|nav", link_type)
-                    if type_match:
-                        link_content = explode_link_df(crawl_df, type_match[0])
-                        if link_content:
-                            all_links.append(link_content)
-                            all_links.append('\n\n')
-            else:
-                # Process all links using advertools
-                link_df = adv.crawlytics.links(crawl_df)
-                for link, text in link_df[['link', 'text']].values:
-                    if text and text.strip():
-                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                        text = re.sub(r"\s{3,}", " ", text)
-                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
-
-            # Generate the final Markdown content (llms.txt)
-            links_text = "\n\n".join(all_links)
-            final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-
         finally:
-            # Cleanup the temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
+
+        if crawl_df.empty:
+            return "", "Crawl produced no data for the URL."
+
+        # Use default values if the expected columns are missing or empty
+        title = "Untitled"
+        meta_desc = ""
+        if 'title' in crawl_df.columns and not pd.isna(crawl_df['title'].iloc[0]):
+            title = crawl_df['title'].iloc[0]
+        if 'meta_desc' in crawl_df.columns and not pd.isna(crawl_df['meta_desc'].iloc[0]):
+            meta_desc = crawl_df['meta_desc'].iloc[0]
 
+        all_links = []
+        if link_types and "All links" not in link_types:
+            for link_type in link_types:
+                type_match = re.findall(r"header|footer|nav", link_type)
+                if type_match:
+                    link_content = explode_link_df(crawl_df, type_match[0])
+                    if link_content:
+                        all_links.append(link_content)
+                        all_links.append('\n\n')
+        else:
+            # Process all links using advertools
+            link_df = adv.crawlytics.links(crawl_df)
+            for link, text in link_df[['link', 'text']].values:
+                if text and text.strip():
+                    text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                    text = re.sub(r"\s{3,}", " ", text)
+                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        links_text = "\n\n".join(all_links)
+        final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
         return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
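
A minimal sketch (not part of the commit) of what the added fallback logic buys: metadata extraction no longer assumes the crawl output has 'title' and 'meta_desc' columns with non-null values. The helper name extract_metadata and the sample DataFrame are invented for illustration; only the column names and checks come from the diff.

import pandas as pd

def extract_metadata(crawl_df: pd.DataFrame) -> tuple[str, str]:
    # Mirrors the added defensive checks: fall back to defaults when a
    # column is missing or its first value is NaN.
    title = "Untitled"
    meta_desc = ""
    if "title" in crawl_df.columns and not pd.isna(crawl_df["title"].iloc[0]):
        title = crawl_df["title"].iloc[0]
    if "meta_desc" in crawl_df.columns and not pd.isna(crawl_df["meta_desc"].iloc[0]):
        meta_desc = crawl_df["meta_desc"].iloc[0]
    return title, meta_desc

# A crawl result with a NaN title and no meta_desc column no longer raises:
df = pd.DataFrame({"title": [float("nan")], "url": ["https://example.com"]})
print(extract_metadata(df))  # ('Untitled', '')

The old one-liners raised a KeyError or IndexError when a column was absent or the frame was empty; the new version substitutes defaults and returns an explicit message when the crawl produces no rows.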
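
For the "All links" branch, which this commit moves but does not functionally change, here is a small sketch of the text normalization applied to each anchor before it is written as a Markdown section; the helper name and sample values are invented. Note that re.sub's fourth positional parameter is count, so the sketch passes re.DOTALL via the flags keyword.

import re

def format_link(link: str, text: str) -> str:
    # Collapse newlines, then squeeze runs of 3+ whitespace characters.
    text = re.sub(r"\n+", " ", text.strip(), flags=re.DOTALL)
    text = re.sub(r"\s{3,}", " ", text)
    # One "## <anchor text>" heading followed by the Markdown link.
    return "\n".join(["## " + text, f"[{text}]({link})"])

print(format_link("https://example.com/about", "About\nus   and the team"))
# ## About us and the team
# [About us and the team](https://example.com/about)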