Update app.py
app.py CHANGED
@@ -34,16 +34,40 @@ class WebsiteCrawler:
         # Normalize unicode characters
         text = unicodedata.normalize('NFKD', text)
         # Replace special quotes and dashes with standard characters
-        # Replace fancy quotes and dashes with standard ones
         text = text.replace('\u201c', '"').replace('\u201d', '"')  # smart quotes
         text = text.replace('\u2018', "'").replace('\u2019', "'")  # smart single quotes
         text = text.replace('\u2013', '-').replace('\u2014', '-')  # en and em dashes
         # Remove any remaining non-ASCII characters
         text = text.encode('ascii', 'ignore').decode('ascii')
-        # Clean up extra whitespace
+        # Clean up extra whitespace and ensure proper sentence spacing
         text = ' '.join(text.split())
         return text
 
+    def clean_title(self, title):
+        """Clean and format titles"""
+        title = self.normalize_text(title)
+        # Remove common suffixes
+        title = re.sub(r'\s*\|\s*.*$', '', title)  # Remove pipe and everything after
+        title = re.sub(r'\s*-\s*.*$', '', title)  # Remove dash and everything after
+        title = title.strip()
+        return title
+
+    def clean_description(self, desc):
+        """Clean and format descriptions"""
+        if not desc:
+            return ""
+        desc = self.normalize_text(desc)
+        # Find the last complete sentence
+        sentences = re.split(r'(?<=[.!?])\s+', desc)
+        if sentences:
+            # Take up to two complete sentences
+            cleaned_desc = ' '.join(sentences[:2]).strip()
+            # Ensure it ends with proper punctuation
+            if not cleaned_desc[-1] in '.!?':
+                cleaned_desc += '.'
+            return cleaned_desc
+        return desc
+
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""
         try:
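For a quick sanity check of the two new helpers, here is a standalone sketch that mirrors their logic as plain functions; the sample inputs and expected outputs are invented for illustration, and the guard clauses differ slightly from the class methods above.

import re
import unicodedata

def normalize_text(text):
    # Same idea as WebsiteCrawler.normalize_text: NFKD-normalize, drop non-ASCII, collapse whitespace
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(text.split())

def clean_title(title):
    # Mirrors clean_title: strip "| Site Name" / "- Site Name" style suffixes
    title = normalize_text(title)
    title = re.sub(r'\s*\|\s*.*$', '', title)
    title = re.sub(r'\s*-\s*.*$', '', title)
    return title.strip()

def clean_description(desc):
    # Mirrors clean_description: keep at most two sentences, end with punctuation
    if not desc:
        return ""
    desc = normalize_text(desc)
    sentences = re.split(r'(?<=[.!?])\s+', desc)
    cleaned = ' '.join(sentences[:2]).strip()
    if cleaned and cleaned[-1] not in '.!?':
        cleaned += '.'
    return cleaned

print(clean_title("Quick Start | Example Docs"))   # -> "Quick Start"
print(clean_description("First sentence. Second one! A third that gets dropped."))
# -> "First sentence. Second one!"

One thing to watch: the dash rule in clean_title also truncates titles that legitimately contain a hyphen, e.g. "Step-by-step guide" becomes "Step".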
@@ -76,42 +100,41 @@ class WebsiteCrawler:
             'category': 'Optional'
         }
 
-        # Title extraction with
+        # Title extraction with cleaning
         title = (
             soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
             soup.find('title').text if soup.find('title') else
             soup.find('h1').text if soup.find('h1') else
             url.split('/')[-1]
         )
-        metadata['title'] = self.
+        metadata['title'] = self.clean_title(title)
 
-        # Description extraction with
+        # Description extraction with cleaning
         description = (
             soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
             soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
             ""
         )
-        metadata['description'] = self.
+        metadata['description'] = self.clean_description(description)
 
-        # Calculate importance
-
-        if 'docs' in
-            importance
+        # Calculate importance and category
+        url_lower = url.lower()
+        if 'docs' in url_lower or 'documentation' in url_lower:
+            metadata['importance'] = 5
             metadata['category'] = 'Docs'
-
-            importance
+        elif 'api' in url_lower:
+            metadata['importance'] = 4
             metadata['category'] = 'API'
-
-            importance
+        elif 'guide' in url_lower or 'tutorial' in url_lower:
+            metadata['importance'] = 3
             metadata['category'] = 'Guides'
-
-            importance
+        elif 'example' in url_lower:
+            metadata['importance'] = 2
             metadata['category'] = 'Examples'
-
-            importance
+        elif 'blog' in url_lower:
+            metadata['importance'] = 1
             metadata['category'] = 'Blog'
 
-        metadata['importance'] = importance
         return metadata
 
     async def crawl_page(self, url, depth, base_domain):
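The category/importance assignment above boils down to keyword matching on the URL. A minimal standalone sketch follows; the categorize_url helper and the fallback return value are illustrative assumptions, not code from the app (the diff only shows the 'Optional' default category).

def categorize_url(url):
    # Same keyword rules as the diff: docs rank highest, then API, guides, examples, blog
    url_lower = url.lower()
    if 'docs' in url_lower or 'documentation' in url_lower:
        return 'Docs', 5
    elif 'api' in url_lower:
        return 'API', 4
    elif 'guide' in url_lower or 'tutorial' in url_lower:
        return 'Guides', 3
    elif 'example' in url_lower:
        return 'Examples', 2
    elif 'blog' in url_lower:
        return 'Blog', 1
    return 'Optional', 0  # assumed fallback for URLs that match no keyword

print(categorize_url("https://example.com/docs/getting-started"))  # -> ('Docs', 5)
print(categorize_url("https://example.com/pricing"))               # -> ('Optional', 0)

Note these are plain substring checks, so 'api' also matches unrelated paths such as /rapid-start.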
@@ -121,7 +144,7 @@ class WebsiteCrawler:
 
         try:
             response = requests.get(url, headers=self.headers, timeout=self.timeout)
-            response.encoding = 'utf-8'
+            response.encoding = 'utf-8'
             response.raise_for_status()
             self.visited_urls.add(url)
 
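Context for the one-line encoding change above: requests picks an encoding from the Content-Type header (falling back to a guess), and response.text is decoded with whatever response.encoding holds, so pinning it to UTF-8 before reading the body makes the decode deterministic. A minimal sketch, using example.com as a placeholder URL:

import requests

response = requests.get("https://example.com", timeout=10)
response.encoding = 'utf-8'      # decode the body as UTF-8 when .text is read
response.raise_for_status()
html = response.text             # str decoded with the encoding set above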
@@ -173,6 +196,9 @@ class WebsiteCrawler:
                 reverse=True
             )
 
+        if not sorted_urls:
+            return "No content was found to generate llms.txt"
+
         # Group URLs by category
         categorized_urls = defaultdict(list)
         for url, metadata in sorted_urls:
@@ -182,24 +208,22 @@ class WebsiteCrawler:
         content = []
 
         # Add main title and description
-
-
-
-        content.append(f"> {main_metadata['description']}
+        main_metadata = sorted_urls[0][1]
+        content.append(f"# {main_metadata['title']}")
+        if main_metadata['description']:
+            content.append(f"\n> {main_metadata['description']}")
 
         # Add categorized sections
         priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
 
         for category in priority_order:
             if category in categorized_urls:
-                content.append(f"\n## {category}
+                content.append(f"\n## {category}")
                 for url, metadata in categorized_urls[category]:
-
-
-                    if desc:
-                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
+                    if metadata['description']:
+                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                     else:
-                        content.append(f"- [{title}]({url})
+                        content.append(f"\n- [{metadata['title']}]({url})")
 
         return "\n".join(content)
 
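On the output format of the rewritten block above: every appended chunk after the first starts with "\n" and the list is joined with "\n", so each section header and link entry ends up preceded by a blank line. A toy run with hard-coded stand-in values (all titles, URLs and descriptions invented):

content = []
content.append("# Example Project")                              # main page title
content.append("\n> A small toolkit for doing useful things.")   # main page description
content.append("\n## Docs")                                      # category header
content.append("\n- [Getting Started](https://example.com/docs/start): Install it and run the first command.")
content.append("\n- [API Reference](https://example.com/docs/api)")
print("\n".join(content))
# # Example Project
#
# > A small toolkit for doing useful things.
#
# ## Docs
#
# - [Getting Started](https://example.com/docs/start): Install it and run the first command.
#
# - [API Reference](https://example.com/docs/api)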
@@ -223,73 +247,92 @@ async def process_url(url, max_depth, max_pages):
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()
 
-        return content, f"Successfully crawled {len(crawler.visited_urls)} pages.
+        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
 
     except Exception as e:
         return "", f"Error: {str(e)}"
 
-# Create
-
-
-
-
-    font-family: 'Open Sans', sans-serif !important;
-}
-
-.gr-box {
-    border-radius: 8px !important;
-    border: 1px solid #e5e7eb !important;
-}
-
-.gr-button {
-    font-family: 'Open Sans', sans-serif !important;
-    font-weight: 600 !important;
-}
-
-.gr-input {
-    font-family: 'Open Sans', sans-serif !important;
-}
-"""
+# Create custom theme with Open Sans
+theme = gr.themes.Soft().set(
+    font=["Open Sans", "ui-sans-serif", "system-ui", "sans-serif"],
+    font_mono=["IBM Plex Mono", "ui-monospace", "monospace"]
+)
 
 # Create the Gradio interface
-
-
-
-
+with gr.Blocks(theme=theme, css="""
+    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono&display=swap');
+
+    .gradio-container {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+
+    .gr-box {
+        border-radius: 8px !important;
+        border: 1px solid #e5e7eb !important;
+    }
+
+    .gr-button {
+        font-family: 'Open Sans', sans-serif !important;
+        font-weight: 600 !important;
+    }
+
+    .gr-input {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+
+    .monospace {
+        font-family: 'IBM Plex Mono', monospace !important;
+    }
+""") as iface:
+    gr.Markdown("# llms.txt Generator")
+    gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
+
+    with gr.Row():
+        url_input = gr.Textbox(
             label="Website URL",
             placeholder="Enter the website URL (e.g., example.com or https://example.com)",
             info="The URL will be automatically prefixed with https:// if no protocol is specified."
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=5,
-            value=3,
-            step=1,
-            label="Maximum Crawl Depth",
-            info="Higher values will result in more thorough but slower crawling"
-        ),
-        gr.Slider(
-            minimum=10,
-            maximum=100,
-            value=50,
-            step=10,
-            label="Maximum Pages to Crawl",
-            info="Higher values will result in more comprehensive but slower results"
         )
-
-
-    gr.
-
-
-
-
-
-
-
-
-
-
-
+
+    with gr.Row():
+        with gr.Column():
+            depth_input = gr.Slider(
+                minimum=1,
+                maximum=5,
+                value=3,
+                step=1,
+                label="Maximum Crawl Depth",
+                info="Higher values will result in more thorough but slower crawling"
+            )
+        with gr.Column():
+            pages_input = gr.Slider(
+                minimum=10,
+                maximum=100,
+                value=50,
+                step=10,
+                label="Maximum Pages to Crawl",
+                info="Higher values will result in more comprehensive but slower results"
+            )
+
+    generate_btn = gr.Button("Generate llms.txt", variant="primary")
+
+    output = gr.Textbox(
+        label="Generated llms.txt Content",
+        lines=20,
+        max_lines=30,
+        show_copy_button=True,
+        container=False,
+        scale=2
+    )
+
+    status = gr.Textbox(label="Status")
+
+    generate_btn.click(
+        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
+        inputs=[url_input, depth_input, pages_input],
+        outputs=[output, status]
+    )
 
 # Launch the app
 if __name__ == "__main__":
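The new Blocks layout above wires the button to the async crawler through a synchronous lambda that calls asyncio.run. A stripped-down sketch of that pattern with a stub coroutine standing in for process_url (the stub and the reduced component set are placeholders, not the real app):

import asyncio
import gradio as gr

async def process_url_stub(url, depth, pages):
    # Placeholder for the real async crawl; returns (content, status) like process_url
    await asyncio.sleep(0)
    return f"# llms.txt for {url}", f"Crawled with depth={depth}, max pages={pages}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="Website URL")
    depth_input = gr.Slider(1, 5, value=3, step=1, label="Maximum Crawl Depth")
    pages_input = gr.Slider(10, 100, value=50, step=10, label="Maximum Pages to Crawl")
    generate_btn = gr.Button("Generate llms.txt")
    output = gr.Textbox(label="Generated llms.txt Content", lines=10)
    status = gr.Textbox(label="Status")

    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url_stub(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    demo.launch()

Gradio also accepts an async function directly as fn, which would avoid the asyncio.run wrapper; the lambda approach simply mirrors what the diff does.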