Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -231,7 +231,7 @@ class WebsiteCrawler:
|
|
231 |
desc = self.clean_text(desc) if desc else ""
|
232 |
|
233 |
# Skip if it's duplicate content
|
234 |
-
if self.is_duplicate_content(desc, title):
|
235 |
return []
|
236 |
|
237 |
# Determine category and importance
|
@@ -294,8 +294,8 @@ class WebsiteCrawler:
|
|
294 |
self.homepage_metadata = {
|
295 |
"site_name": urlparse(url).netloc.split('.')[0].capitalize(),
|
296 |
"description": None
|
297 |
-
}
|
298 |
-
|
299 |
async def crawl_website(self, start_url):
|
300 |
"""Crawl website starting from the given URL"""
|
301 |
# First process the homepage
|
|
|
231 |
desc = self.clean_text(desc) if desc else ""
|
232 |
|
233 |
# Skip if it's duplicate content
|
234 |
+
if self.is_duplicate_content(desc, title, url):
|
235 |
return []
|
236 |
|
237 |
# Determine category and importance
|
|
|
294 |
self.homepage_metadata = {
|
295 |
"site_name": urlparse(url).netloc.split('.')[0].capitalize(),
|
296 |
"description": None
|
297 |
+
}
|
298 |
+
|
299 |
async def crawl_website(self, start_url):
|
300 |
"""Crawl website starting from the given URL"""
|
301 |
# First process the homepage
|