Update app.py
app.py (CHANGED)
@@ -57,46 +57,44 @@ def process_url(url, link_types):
         jsonl_path = f"{output_file}.jsonl"
 
         try:
-            # Perform the crawl using advertools
             if not safe_crawl(url, jsonl_path):
                 return "", "Crawl failed or timed out"
-
-            # Read the crawl results
             crawl_df = pd.read_json(jsonl_path, lines=True)
-
-            # Extract title and meta description
-            title = crawl_df['title'].iloc[0] if not pd.isna(crawl_df['title'].iloc[0]) else "Untitled"
-            meta_desc = crawl_df['meta_desc'].iloc[0] if not pd.isna(crawl_df['meta_desc'].iloc[0]) else ""
-
-            all_links = []
-
-            # Process links based on the selected types
-            if link_types and "All links" not in link_types:
-                for link_type in link_types:
-                    type_match = re.findall(r"header|footer|nav", link_type)
-                    if type_match:
-                        link_content = explode_link_df(crawl_df, type_match[0])
-                        if link_content:
-                            all_links.append(link_content)
-                            all_links.append('\n\n')
-            else:
-                # Process all links using advertools
-                link_df = adv.crawlytics.links(crawl_df)
-                for link, text in link_df[['link', 'text']].values:
-                    if text and text.strip():
-                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                        text = re.sub(r"\s{3,}", " ", text)
-                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
-
-            # Generate the final Markdown content (llms.txt)
-            links_text = "\n\n".join(all_links)
-            final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-
         finally:
-            # Cleanup the temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
+
+        if crawl_df.empty:
+            return "", "Crawl produced no data for the URL."
+
+        # Use default values if the expected columns are missing or empty
+        title = "Untitled"
+        meta_desc = ""
+        if 'title' in crawl_df.columns and not pd.isna(crawl_df['title'].iloc[0]):
+            title = crawl_df['title'].iloc[0]
+        if 'meta_desc' in crawl_df.columns and not pd.isna(crawl_df['meta_desc'].iloc[0]):
+            meta_desc = crawl_df['meta_desc'].iloc[0]
 
+        all_links = []
+        if link_types and "All links" not in link_types:
+            for link_type in link_types:
+                type_match = re.findall(r"header|footer|nav", link_type)
+                if type_match:
+                    link_content = explode_link_df(crawl_df, type_match[0])
+                    if link_content:
+                        all_links.append(link_content)
+                        all_links.append('\n\n')
+        else:
+            # Process all links using advertools
+            link_df = adv.crawlytics.links(crawl_df)
+            for link, text in link_df[['link', 'text']].values:
+                if text and text.strip():
+                    text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                    text = re.sub(r"\s{3,}", " ", text)
+                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        links_text = "\n\n".join(all_links)
+        final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
         return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
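
A side note on the link-text cleanup that this change keeps as-is: re.sub's signature is re.sub(pattern, repl, string, count=0, flags=0), so writing re.sub(r"\n+", " ", text.strip(), re.DOTALL) passes re.DOTALL as the positional count argument rather than as a flag (and DOTALL only changes how "." matches, so it is not needed for a "\n+" pattern anyway). Below is a minimal sketch of the presumably intended cleanup, offered as an illustration and not part of this commit; clean_link_text is a hypothetical helper name.

    import re

    def clean_link_text(text: str) -> str:
        # Collapse runs of newlines into single spaces. No flags are needed here;
        # if flags ever are, pass them as flags=... so they are not silently
        # interpreted as a substitution count.
        text = re.sub(r"\n+", " ", text.strip())
        # Squeeze any remaining runs of 3+ whitespace characters into one space.
        return re.sub(r"\s{3,}", " ", text)

    print(clean_link_text("Read\n\nmore   about    this"))  # -> "Read more about this"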