import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import time

BASE_URL = "https://community.jupiter.money/c/help/27"
CATEGORY_URL = f"{BASE_URL}/c/help/27.json"  # JSON endpoint of the Help category

def fetch_topic_urls():
    res = requests.get(CATEGORY_URL)
    data = res.json()
    topic_urls = [f"{BASE_URL}/t/{topic['slug']}/{topic['id']}" for topic in data['topic_list']['topics']]
    return topic_urls

def scrape_topic(url):
    topic_id = url.split("/")[-1]
    topic_json_url = f"https://community.jupiter.money/t/{topic_id}.json"
    
    res = requests.get(topic_json_url)
    data = res.json()

    question = data['title']
    posts = data['post_stream']['posts']

    if not posts or len(posts) == 0:
        return None

    # First post = original question or context
    first_post = posts[0]['cooked']
    # Next post = usually the first answer
    if len(posts) > 1:
        answer_post = posts[1]['cooked']
    else:
        answer_post = first_post

    # Remove HTML tags
    from bs4 import BeautifulSoup
    q_clean = BeautifulSoup(first_post, "html.parser").get_text()
    a_clean = BeautifulSoup(answer_post, "html.parser").get_text()

    return {
        "url": url,
        "question": question,
        "context": q_clean.strip(),
        "answer": a_clean.strip()
    }


def main():
    topic_urls = fetch_topic_urls()
    print(f"Found {len(topic_urls)} topics.")

    faqs = []
    for url in tqdm(topic_urls):
        try:
            faq = scrape_topic(url)
            if faq:
                faqs.append(faq)
            time.sleep(1)  # Avoid hitting rate limits
        except Exception as e:
            print(f"Error scraping {url}: {e}")

    # Save as JSON
    with open("jupiter_help_faqs.json", "w", encoding="utf-8") as f:
        json.dump(faqs, f, indent=2, ensure_ascii=False)

    print("✅ Scraping complete. Saved to jupiter_help_faqs.json")

if __name__ == "__main__":
    main()