web-search-api / networks /network_configs.py
Hansimov's picture
:recycle: [Refactor] WebpageContentExtractor: Separate html and markdown processing
a636bcb
raw
history blame
430 Bytes
# Tag names flagged to be ignored.
# NOTE(review): presumably matched against HTML element names by the
# webpage content extractor -- confirm against the consuming module.
IGNORE_TAGS = [
    "script",
    "style",
    "button",
]
# Class-name fragments flagged to be ignored.
# NOTE(review): presumably compared against HTML ``class`` attributes by the
# consumer; whether the match is exact or substring is not visible here.
IGNORE_CLASSES = [
    "sidebar",
    "footer",
    "related",
    "comment",
    "topbar",
    # NOTE(review): "menu" was disabled in the original list; confirm whether
    # it should be removed for good or re-enabled.
    # "menu",
    "offcanvas",
    "navbar",
    "post_side",
]
# Host names flagged to be ignored.
# NOTE(review): presumably used to skip fetching/parsing pages from these
# sites -- confirm against the consuming module.
IGNORE_HOSTS = [
    "weibo.com",
    "hymson.com",
]
# Header mapping carrying a desktop-browser (Edge 111 on Windows 10)
# User-Agent string.
# NOTE(review): presumably passed as ``headers=`` on outgoing HTTP requests --
# confirm against the consuming module.
REQUESTS_HEADERS = {
    "User-Agent": (
        # Adjacent literals are concatenated at compile time; the resulting
        # value is a single-line string.
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
    ),
}