cyberandy committed on
Commit
3bc0d96
1 Parent(s): 6682c58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -119
app.py CHANGED
@@ -9,23 +9,36 @@ import os
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
def explode_link_df(crawl_df, col_group):
    """Build markdown link sections from one link-column group of the crawl dataframe.

    Parameters:
        crawl_df: pandas.DataFrame produced by an advertools crawl, containing
            '@@'-joined '{col_group}_links_url' / '{col_group}_links_text' columns.
        col_group: column prefix such as 'nav', 'header' or 'footer'.

    Returns:
        A "\n\n"-joined markdown string of '## text' + '[text](link)' sections,
        or "" if anything goes wrong (error is logged, not raised).
    """
    try:
        # Renamed from `link`/`text` so the loop variables below don't shadow
        # the exploded Series.
        links = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
        texts = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
        all_links = []

        for link, text in zip(links.dropna(), texts.dropna()):
            # Skip anchors with empty/whitespace-only text.
            if text and text.strip():
                # BUG FIX: re.DOTALL was being passed as the positional `count`
                # argument of re.sub (re.DOTALL == 16), silently capping the
                # number of substitutions; the flag is irrelevant to r"\n+".
                text = re.sub(r"\n+", " ", text.strip())
                text = re.sub(r"\s{3,}", " ", text)
                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Error processing {col_group} links: {str(e)}")
        return ""
29
 
30
def process_url(url, link_types):
    """Process URL and generate llms.txt content.

    Parameters:
        url: website home page; 'https://' is prepended when no scheme is given.
        link_types: list of UI choices ("<header> links", "<nav> links",
            "<footer> links", "All links") selecting which links to extract.

    Returns:
        (content, status) tuple of strings; content is "" on failure.
    """
    # NOTE(review): the empty-URL guard condition was truncated in the diff
    # view; `if not url` matches the visible early return — confirm.
    if not url:
        return "", "Please enter a URL"

    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Unique temp file so concurrent requests don't collide.
        output_file = token_hex(6)
        jsonl_path = f"{output_file}.jsonl"

        try:
            # Perform the crawl using advertools.
            adv.crawl(url, jsonl_path)

            # Read the crawl results.
            crawl_df = pd.read_json(jsonl_path, lines=True)

            # Extract title and meta description.
            title = crawl_df['title'].values[0]
            meta_desc = crawl_df['meta_desc'].values[0]

            all_links = []

            # Process links based on selected types.
            if link_types and "All links" not in link_types:
                for link_type in link_types:
                    type_match = re.findall(r"header|footer|nav", link_type)
                    if type_match:
                        link_content = explode_link_df(crawl_df, type_match[0])
                        if link_content:
                            all_links.append(link_content)
                            # BUG FIX: the stray all_links.append('\n\n') was
                            # removed — it inflated the reported section count
                            # and injected blank sections into the join below.
            else:
                # Process all links using advertools.
                link_df = adv.crawlytics.links(crawl_df)
                for link, text in link_df[['link', 'text']].values:
                    if text and text.strip():
                        # BUG FIX: re.DOTALL was passed as re.sub's positional
                        # `count` argument, capping substitutions at 16.
                        text = re.sub(r"\n+", " ", text.strip())
                        text = re.sub(r"\s{3,}", " ", text)
                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

            # Generate final content with proper spacing.
            links_text = "\n\n".join(all_links)
            final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"

        finally:
            # Cleanup temporary file even when the crawl/parse fails.
            if os.path.exists(jsonl_path):
                os.remove(jsonl_path)

        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."

    except Exception as e:
        logger.error(f"Error processing URL {url}: {str(e)}")
        return "", f"Error: {str(e)}"
88
 
89
# Page styling: load Open Sans and give the primary button the brand colour.
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Soft theme whose palette is built around the #3452db brand blue.
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",  # Main color
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Assemble the UI: one row per widget, wired to process_url on submit.
with gr.Blocks(theme=theme, css=css) as iface:
    with gr.Row():
        gr.Markdown("# Generate an `llms.txt` file")

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter the home page of a website:",
            placeholder="example: https://example.com",
            lines=1,
        )

    with gr.Row():
        link_types = gr.Dropdown(
            label="Select types of links to extract (leave empty to get all links)",
            choices=["<header> links", "<nav> links", "<footer> links", "All links"],
            multiselect=True,
            value=["All links"]
        )

    with gr.Row():
        generate_btn = gr.Button("Submit", variant="primary", elem_classes=["primary-btn"])

    with gr.Row():
        output = gr.Textbox(
            label="Generated llms.txt Content",
            lines=20,
            show_copy_button=True,
            container=True,
        )
        status = gr.Textbox(label="Status", interactive=False)

    # Clicking Submit runs the crawl and fills both output boxes.
    generate_btn.click(
        fn=process_url,
        inputs=[url_input, link_types],
        outputs=[output, status],
    )

if __name__ == "__main__":
    iface.launch()
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
def safe_crawl(url, output_file):
    """Safely perform web crawl with timeout.

    Writes crawl results to `output_file` (JSONL). Returns True on success,
    False on any failure (the error is logged, never raised).
    """
    try:
        # Restrict the crawl to the landing page only and let Scrapy close
        # the spider after 30 seconds so a slow site can't hang the request.
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={'CLOSESPIDER_TIMEOUT': 30},
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error: {str(e)}")
        return False
22
+
23
def process_links(df, link_type=None):
    """Turn a crawlytics links dataframe into markdown link sections.

    Parameters:
        df: dataframe with 'source', 'text' and 'link' columns
            (as produced by adv.crawlytics.links).
        link_type: optional tag name ('header', 'nav', 'footer'); when given,
            only rows whose 'source' markup contains '<{link_type}' are kept.

    Returns:
        List of "## text\n[text](link)" strings; [] on error (error is logged).
    """
    try:
        if link_type:
            mask = df['source'].str.contains(f'<{link_type}', case=False, na=False)
            df = df[mask]

        all_links = []
        for _, row in df.iterrows():
            # BUG FIX: NaN anchor text is truthy (it's a float) and
            # str(nan) == 'nan', so missing text used to be emitted as
            # '## nan' sections; guard with pd.notna first.
            if pd.notna(row['text']) and str(row['text']).strip():
                text = str(row['text']).strip()
                # Collapse all internal whitespace runs to single spaces.
                text = re.sub(r'\s+', ' ', text)
                link = str(row['link']).strip()
                all_links.append(f"## {text}\n[{text}]({link})")

        return all_links
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return []
42
 
43
def process_url(url, link_types):
    """Process URL and generate llms.txt content.

    Parameters:
        url: website URL; 'https://' is prepended when no scheme is given.
        link_types: list of UI choices ("<header> links", "<nav> links",
            "<footer> links", "All links") selecting which links to extract.

    Returns:
        (content, status) tuple of strings; content is "" on failure.
    """
    # NOTE(review): the empty-URL guard condition was truncated in the diff
    # view; `if not url` matches the visible early return — confirm.
    if not url:
        return "", "Please enter a URL"

    try:
        # Ensure URL has protocol.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Create a uniquely named temporary file for this crawl.
        output_file = f"{token_hex(4)}.jsonl"

        try:
            # Perform crawl (single page, 30s timeout inside safe_crawl).
            if not safe_crawl(url, output_file):
                return "", "Crawl failed or timed out"

            # Read results.
            df = pd.read_json(output_file, lines=True)

            # ROBUSTNESS FIX: an empty crawl result used to raise IndexError
            # on iloc[0] and surface as a generic "Error: ..." message.
            if df.empty:
                return "", "Crawl returned no data"

            # Get basic info, falling back to safe defaults on missing values.
            title = df['title'].iloc[0] if not pd.isna(df['title'].iloc[0]) else "Untitled"
            meta_desc = df['meta_desc'].iloc[0] if not pd.isna(df['meta_desc'].iloc[0]) else ""

            # Extract and format links.
            link_df = adv.crawlytics.links(df)
            all_links = []

            if link_types and "All links" not in link_types:
                for link_type in link_types:
                    # Pull the tag name out of choices like "<nav> links".
                    type_name = re.search(r'<(\w+)>', link_type)
                    if type_name:
                        all_links.extend(process_links(link_df, type_name.group(1)))
            else:
                all_links = process_links(link_df)

            # Create content: H1 title, blockquote description, link sections.
            content_parts = [
                f"# {title}",
                f"> {meta_desc}",
                "\n\n".join(all_links)
            ]
            final_content = "\n\n".join(content_parts)

            return final_content, f"Found {len(all_links)} links"

        finally:
            # Cleanup the temp file on every exit path.
            if os.path.exists(output_file):
                os.remove(output_file)

    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
99
 
100
# Build the Gradio interface: URL + link-type inputs mapped through
# process_url to the generated llms.txt content and a status line.
url_box = gr.Textbox(
    label="Enter website URL",
    placeholder="example: example.com"
)
type_picker = gr.Dropdown(
    choices=["<header> links", "<nav> links", "<footer> links", "All links"],
    label="Select link types",
    multiselect=True,
    value=["All links"]
)
content_box = gr.Textbox(
    label="Generated llms.txt",
    lines=20,
    show_copy_button=True
)
status_box = gr.Textbox(label="Status")

iface = gr.Interface(
    fn=process_url,
    inputs=[url_box, type_picker],
    outputs=[content_box, status_box],
    title="LLMs.txt Generator",
    description="Generate an llms.txt file from a website",
    theme=gr.themes.Soft(),
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()