"""
RandomWeb — Configuration
Loads environment variables and defines constants for all workers.
"""
import os

# ─── Supabase ────────────────────────────────────────────────
# Connection settings are read from the environment once, when this module is
# imported. Each one falls back to an empty string if its variable is unset,
# so a missing setting does NOT raise here — consumers must check for "".
SUPABASE_URL = os.getenv("SUPABASE_URL", "")
SUPABASE_SECRET_KEY = os.getenv("SUPABASE_SECRET_KEY", "")
SUPABASE_PUBLISHABLE_KEY = os.getenv("SUPABASE_PUBLISHABLE_KEY", "")
# ─── Crawler Settings ────────────────────────────────────────
# User-Agent string; it embeds the project's GitHub URL and a "polite-bot" tag.
USER_AGENT = "RandomWeb/1.0 (+https://github.com/guestcoder0906/RandomWeb; polite-bot)"
REQUEST_TIMEOUT = 10  # seconds
MAX_GLOBAL_CONCURRENCY = 20  # max simultaneous outbound connections
PER_DOMAIN_RATE_LIMIT = 1.0  # requests per second per domain
CRAWL_DELAY_DEFAULT = 1.0  # fallback crawl delay if robots.txt doesn't specify
MAX_CRAWL_DEPTH = 3  # BFS depth limit per seed
MAX_LINKS_PER_PAGE = 50  # max links to extract per page
MAX_QUEUE_SIZE = 100_000  # max URLs in crawler queue
# ─── Validator Settings ──────────────────────────────────────
VALIDATION_BATCH_SIZE = 50  # URLs per validation batch
VALIDATION_CONCURRENCY = 10  # concurrent validation requests
RECHECK_INTERVAL_DAYS = 365  # re-verify every year
# ─── CertStream ──────────────────────────────────────────────
CERTSTREAM_URL = "wss://certstream.calidog.io/"  # secure WebSocket endpoint
CT_LOG_BATCH_SIZE = 100  # queue batch size before flushing to validation
# Reconnect delays are in seconds: the first value is the initial delay, the
# second is the ceiling the delay may not exceed.
CT_LOG_RECONNECT_DELAY = 5  # initial reconnect delay in seconds
CT_LOG_MAX_RECONNECT_DELAY = 300  # max reconnect delay
# ─── Common Crawl ────────────────────────────────────────────
COMMON_CRAWL_INDEX_URL = "https://index.commoncrawl.org/collinfo.json"
COMMON_CRAWL_SAMPLE_SIZE = 10_000  # URLs per crawl import batch
COMMON_CRAWL_RESCAN_HOURS = 168  # re-import weekly (7 * 24)
# ─── Scheduler ───────────────────────────────────────────────
SCHEDULER_INTERVAL_SECONDS = 3600  # run re-verification check every hour
SCHEDULER_BATCH_SIZE = 100  # URLs per re-verification batch
# ─── Blocked TLDs / Patterns ─────────────────────────────────
# Suffixes that should never be crawled. Every entry is lowercase and includes
# its leading dot. How the suffix is matched against a hostname is decided by
# the code that consumes this set.
BLOCKED_TLDS = {
    ".local", ".internal", ".test", ".example",
    ".invalid", ".localhost", ".onion",
}
# ─── Top 100 Seed Websites (SFW only) ────────────────────────
# Starting URLs, all given as bare "https://<host>" with no path.
# NOTE(review): the list currently holds 99 entries, one short of the 100 the
# heading advertises — confirm whether an entry was dropped unintentionally.
SEED_WEBSITES = [
    "https://google.com",
    "https://youtube.com",
    "https://facebook.com",
    "https://instagram.com",
    "https://chatgpt.com",
    "https://x.com",
    "https://reddit.com",
    "https://wikipedia.org",
    "https://whatsapp.com",
    "https://bing.com",
    "https://tiktok.com",
    "https://yahoo.co.jp",
    "https://yandex.ru",
    "https://yahoo.com",
    "https://amazon.com",
    "https://gemini.google.com",
    "https://linkedin.com",
    "https://baidu.com",
    "https://naver.com",
    "https://netflix.com",
    "https://pinterest.com",
    "https://live.com",
    "https://bilibili.com",
    "https://temu.com",
    "https://dzen.ru",
    "https://office.com",
    "https://microsoft.com",
    "https://twitch.tv",
    "https://canva.com",
    "https://weather.com",
    "https://vk.com",
    "https://globo.com",
    "https://fandom.com",
    "https://news.yahoo.co.jp",
    "https://t.me",
    "https://samsung.com",
    "https://mail.ru",
    "https://duckduckgo.com",
    "https://nytimes.com",
    "https://ebay.com",
    "https://zoom.us",
    "https://discord.com",
    "https://github.com",
    "https://booking.com",
    "https://spotify.com",
    "https://cricbuzz.com",
    "https://instructure.com",
    "https://docomo.ne.jp",
    "https://roblox.com",
    "https://aliexpress.com",
    "https://bbc.com",
    "https://bbc.co.uk",
    "https://ozon.ru",
    "https://apple.com",
    "https://imdb.com",
    "https://telegram.org",
    "https://brave.com",
    "https://amazon.in",
    "https://msn.com",
    "https://walmart.com",
    "https://amazon.co.jp",
    "https://paypal.com",
    "https://cnn.com",
    "https://ya.ru",
    "https://indeed.com",
    "https://etsy.com",
    "https://rakuten.co.jp",
    "https://amazon.de",
    "https://espn.com",
    "https://hbomax.com",
    "https://usps.com",
    "https://music.youtube.com",
    "https://ok.ru",
    "https://wildberries.ru",
    "https://office365.com",
    "https://disneyplus.com",
    "https://douyin.com",
    "https://namu.wiki",
    "https://adobe.com",
    "https://shein.com",
    "https://qq.com",
    "https://amazon.co.uk",
    "https://quora.com",
    "https://rutube.ru",
    "https://theguardian.com",
    "https://scribd.com",
    "https://grok.com",
    "https://zillow.com",
    "https://dcinside.com",
    # Replacements for removed NSFW entries
    "https://stackoverflow.com",
    "https://medium.com",
    "https://notion.so",
    "https://figma.com",
    "https://dropbox.com",
    "https://slack.com",
    "https://trello.com",
    "https://shopify.com",
    "https://target.com",
    "https://ikea.com",
]