"""Scrape the Help category of the Jupiter community forum into a FAQ JSON file.

The script reads the category's JSON listing, fetches each topic's JSON,
strips the HTML from the first two posts and writes the collected records to
``jupiter_help_faqs.json``.
"""

import json
import time
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Site root. It must NOT include the category path: both the category listing
# and the per-topic URLs are built by appending a path to this value.
BASE_URL = "https://community.jupiter.money"
CATEGORY_URL = f"{BASE_URL}/c/help/27.json"  # JSON endpoint of the Help category

# Seconds to wait for any single HTTP request before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Seconds to pause between topic requests to avoid hitting rate limits.
REQUEST_DELAY = 1

OUTPUT_PATH = "jupiter_help_faqs.json"


def _get_json(url: str) -> dict:
    """Fetch *url* and return its decoded JSON body.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail here with a clear HTTP error instead of later with a confusing
    # JSON-decoding or KeyError on an error page.
    res.raise_for_status()
    return res.json()


def _html_to_text(html: str) -> str:
    """Return *html* with all tags removed and surrounding whitespace stripped."""
    return BeautifulSoup(html, "html.parser").get_text().strip()


def fetch_topic_urls() -> List[str]:
    """Return the URLs of the topics listed in the Help category response.

    NOTE(review): only the topics contained in this single response are
    returned; if the endpoint paginates, later pages are not fetched --
    confirm whether that matters for this category.
    """
    data = _get_json(CATEGORY_URL)
    return [
        f"{BASE_URL}/t/{topic['slug']}/{topic['id']}"
        for topic in data["topic_list"]["topics"]
    ]


def scrape_topic(url: str) -> Optional[Dict[str, str]]:
    """Fetch one topic and return it as a FAQ record.

    Args:
        url: Topic URL whose last path segment is the topic id.

    Returns:
        A dict with keys ``url``, ``question`` (topic title), ``context``
        (text of the first post) and ``answer`` (text of the second post, or
        of the first post when the topic has no replies); ``None`` when the
        topic has no posts.
    """
    # Tolerate a trailing slash so the id is never an empty string.
    topic_id = url.rstrip("/").rsplit("/", 1)[-1]
    data = _get_json(f"{BASE_URL}/t/{topic_id}.json")

    question = data["title"]
    posts = data["post_stream"]["posts"]
    if not posts:
        return None

    # First post = original question or context.
    first_post = posts[0]["cooked"]
    # Next post = usually the first answer; fall back to the first post when
    # nobody has replied yet.
    answer_post = posts[1]["cooked"] if len(posts) > 1 else first_post

    return {
        "url": url,
        "question": question,
        "context": _html_to_text(first_post),
        "answer": _html_to_text(answer_post),
    }


def main() -> None:
    """Scrape every listed topic and save the results as JSON."""
    topic_urls = fetch_topic_urls()
    print(f"Found {len(topic_urls)} topics.")

    faqs = []
    for url in tqdm(topic_urls):
        try:
            faq = scrape_topic(url)
        except Exception as e:
            # Best-effort: one bad topic must not abort the whole run.
            print(f"Error scraping {url}: {e}")
        else:
            if faq:
                faqs.append(faq)
        finally:
            # Pause after failures too, so an error (e.g. a rate-limit
            # response) is not immediately followed by another request.
            time.sleep(REQUEST_DELAY)

    # Save as JSON
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(faqs, f, indent=2, ensure_ascii=False)
    print("✅ Scraping complete. Saved to jupiter_help_faqs.json")


if __name__ == "__main__":
    main()