pabloce committed on
Commit
97d4ad1
·
verified ·
1 Parent(s): 7702d59

Delete default_web_crawlers.py

Browse files
Files changed (1) hide show
  1. default_web_crawlers.py +0 -30
default_web_crawlers.py DELETED
@@ -1,30 +0,0 @@
1
- import json
2
-
3
- from web_search_interfaces import WebCrawler
4
- from trafilatura import fetch_url, extract
5
-
6
-
7
class TrafilaturaWebCrawler(WebCrawler):
    """Web crawler that downloads and extracts page content with trafilatura."""

    def get_website_content_from_url(self, url: str) -> str:
        """
        Fetch a web page and return its extracted text as a delimited report.

        The page is downloaded with trafilatura's ``fetch_url`` and parsed with
        ``extract`` (JSON output, keeping formatting and links).

        Args:
            url (str): URL to get website content from.

        Returns:
            str: A report containing the page title, the URL and the extracted
                raw text. An empty string is returned when nothing could be
                downloaded or extracted, and an ``"An error occurred: ..."``
                message is returned when an exception was raised on the way.
        """
        # Best-effort contract: failures are reported in the returned string
        # rather than raised, so the broad handler below is deliberate.
        try:
            downloaded = fetch_url(url)
            # A failed download leaves nothing to parse; treat it exactly like
            # the "nothing extracted" case instead of handing it to extract().
            if not downloaded:
                return ""

            extracted = extract(
                downloaded,
                include_formatting=True,
                include_links=True,
                output_format='json',
                url=url,
            )
            if not extracted:
                return ""

            page = json.loads(extracted)
            return (
                f'=========== Website Title: {page["title"]} ===========\n\n'
                f'=========== Website URL: {url} ===========\n\n'
                '=========== Website Content ===========\n\n'
                f'{page["raw_text"]}\n\n'
                '=========== Website Content End ===========\n\n'
            )
        except Exception as e:
            return f"An error occurred: {e}"